aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_gemm
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm')
-rw-r--r--src/core/NEON/kernels/arm_gemm/barrier.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/convolver.hpp4
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp99
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp16
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp60
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp109
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp12
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp84
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_int16.cpp14
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_int8.cpp68
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp133
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_q8_mixed.cpp138
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp117
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp42
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_u8s8fp32.cpp89
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp14
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp68
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemv_batched.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp64
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp78
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp166
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp150
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp78
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp108
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp108
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp216
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp130
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp194
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp52
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp344
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp322
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp66
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp160
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp138
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp134
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp158
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_fp16_fp16.hpp56
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp152
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp190
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp152
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp190
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp134
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp142
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp74
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp343
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp343
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp356
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp332
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp356
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp332
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp74
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp361
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp116
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_fp16_fp16.hpp36
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp120
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp148
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp120
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp148
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp120
-rw-r--r--src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp241
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp590
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp564
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp478
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp1815
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp2455
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp1495
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp1179
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp4
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp18
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp11
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp46
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp20
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp26
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp672
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp40
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp206
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp206
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp402
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp40
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp206
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp206
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp468
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp484
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp486
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp1487
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp1787
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp3283
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp2437
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp3
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp1387
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp981
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp3
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp2015
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp1477
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp1591
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp873
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp1111
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp1617
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp1527
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp401
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp641
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp2473
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp1799
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp2161
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp1627
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp177
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp535
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp1527
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp401
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp641
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16.hpp103
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16/generic.cpp2027
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16.hpp101
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16/generic.cpp2099
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_dot_6x16.hpp116
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_dot_6x16/generic.cpp3264
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_mmla_6x16.hpp112
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_mmla_6x16/generic.cpp3450
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp1627
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp177
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp535
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp24
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp78
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp50
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp78
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp50
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8s8s32_mmla_8x12.hpp107
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8s8s32_mmla_8x12/generic.cpp294
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp78
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp50
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp496
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp502
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp436
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp468
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp470
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp540
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp620
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp558
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16_mla_16VL.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16_mla_16VL/generic.cpp776
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp558
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp610
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp768
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp768
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp242
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp312
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp392
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp9
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp209
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp9
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp450
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp9
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp348
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp250
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp318
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp398
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp425
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp378
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp364
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL/generic.cpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL/generic.cpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL/generic.cpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp224
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp250
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp282
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp425
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp378
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp364
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp1139
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp441
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp1075
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp441
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp763
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp749
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_dot_8x3VL.hpp110
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_dot_8x3VL/generic.cpp274
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp13
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp200
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp106
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp32
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp106
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp32
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp583
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp1021
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp335
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp753
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp335
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp583
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp381
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp605
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp627
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp1021
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp605
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp705
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp1301
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp1583
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp137
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp385
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp921
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp605
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp705
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL.hpp99
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL/generic.cpp1502
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL.hpp99
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL/generic.cpp1418
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8s32_mmla_6x4VL.hpp112
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8s32_mmla_6x4VL/generic.cpp1675
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp137
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp385
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp921
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp20
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp200
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp112
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp20
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp112
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp20
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp112
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp20
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp200
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8s8s32_mmla_8x3VL.hpp107
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8s8s32_mmla_8x3VL/generic.cpp297
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp112
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp20
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp200
-rw-r--r--src/core/NEON/kernels/arm_gemm/performance_parameters.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/quantized.cpp4
-rw-r--r--src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp10
-rw-r--r--src/core/NEON/kernels/arm_gemm/std_transforms_fixed_trB.hpp10
-rw-r--r--src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp10
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp32
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp332
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp599
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp433
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp459
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp652
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp1162
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp288
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp288
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp402
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp349
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp296
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp792
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp600
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp266
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp1266
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp345
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp345
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp760
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp618
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp207
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp370
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp379
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp226
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp294
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp169
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp160
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp156
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp56
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp76
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp109
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp180
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp70
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp68
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp114
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp206
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp105
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp58
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp100
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp148
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp90
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp90
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp30
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp582
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp74
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp348
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp116
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp458
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp342
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp142
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp354
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp390
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp368
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp498
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp302
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp388
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp362
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp368
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp298
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp500
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp628
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp380
-rw-r--r--src/core/NEON/kernels/arm_gemm/utils.hpp58
392 files changed, 55781 insertions, 74625 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/barrier.hpp b/src/core/NEON/kernels/arm_gemm/barrier.hpp
index 3b34e8089f..b7abd87c29 100644
--- a/src/core/NEON/kernels/arm_gemm/barrier.hpp
+++ b/src/core/NEON/kernels/arm_gemm/barrier.hpp
@@ -39,14 +39,6 @@ private:
public:
barrier(unsigned int threads) : m_threads(threads), m_waiters(0), m_leavers(0) { }
- // Add a move constructor because these objects might be moved around at setup time.
- // Moving while the barrier is active won't work.
- barrier(barrier &&other) : m_threads(other.m_threads), m_waiters(0), m_leavers(0) {
- // This doesn't make it safe, but will have a chance of firing if something odd is occurring.
- assert(other.m_waiters==0);
- assert(other.m_leavers==0);
- }
-
/* This isn't safe if any thread is waiting... */
void set_nthreads(unsigned int nthreads) {
m_threads = nthreads;
diff --git a/src/core/NEON/kernels/arm_gemm/convolver.hpp b/src/core/NEON/kernels/arm_gemm/convolver.hpp
index a9f3dd0ad8..b15f669132 100644
--- a/src/core/NEON/kernels/arm_gemm/convolver.hpp
+++ b/src/core/NEON/kernels/arm_gemm/convolver.hpp
@@ -231,8 +231,8 @@ public:
for (unsigned int ky=0; ky<params.kernel_height; ky++) {
for (unsigned int kx=0; kx<params.kernel_width; kx++) {
unsigned int n = (ky * params.kernel_width) + kx;
- m_kernel_y[n] = (ky * params.dilation_h) - params.padding_top;
- m_kernel_x[n] = (kx * params.dilation_w) - params.padding_left;
+ m_kernel_y[n] = ky - params.padding_top;
+ m_kernel_x[n] = kx - params.padding_left;
}
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index 7d746789e4..c8bd8fd658 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -46,7 +46,6 @@
#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
#include "kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp"
-#include "kernels/sve_ffinterleaved_bf16fp32_dot_8x3VL.hpp"
#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
@@ -68,10 +67,10 @@
namespace arm_gemm {
-static const GemmImplementation<bfloat16, bfloat16, float> gemm_bf16_methods[] =
+static const GemmImplementation<bfloat16, float> gemm_bf16_methods[] =
{
-#ifdef ARM_COMPUTE_ENABLE_BF16
#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_BF16
#ifdef ARM_COMPUTE_ENABLE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
// SME kernels
@@ -107,36 +106,36 @@ static const GemmImplementation<bfloat16, bfloat16, float> gemm_bf16_methods[] =
},
#endif // ARM_COMPUTE_ENABLE_SME2
// gemm_bf16_interleaved
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_bf16fp32_mmla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_svebf16() && (args._Ksize>4); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); }
),
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_bf16fp32_mmla_6x4VL",
[](const GemmArgs &args) { return args._ci->has_svebf16(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_bf16fp32_mmla_6x4VL, bfloat16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_mmla_6x4VL, bfloat16, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_bf16fp32_mmla_6x4VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_mmla_6x4VL, bfloat16, float>(args); }
),
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_bf16fp32_dot_6x4VL",
[](const GemmArgs &args) { return args._ci->has_svebf16(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>(args); }
),
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_bf16fp32_dot_8x3VL",
[](const GemmArgs &args) { return args._ci->has_svebf16() && (args._Ksize>2); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>(args); }
),
#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_ffinterleaved_bf16fp32_mmla_8x3VL",
KernelWeightFormat::VL2VL_BL64,
@@ -144,15 +143,7 @@ GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); }
),
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
- GemmMethod::GEMM_INTERLEAVED,
- "sve_ffinterleaved_bf16fp32_dot_8x3VL",
- KernelWeightFormat::VL1VL_BL32,
- [](const GemmArgs &args) { return args._ci->has_svebf16(); },
- [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_dot_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
- [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_dot_8x3VL, bfloat16, float>(args); }
-),
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_ffhybrid_bf16fp32_mmla_6x4VL",
KernelWeightFormat::VL2VL_BL64,
@@ -162,36 +153,36 @@ GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
),
#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
#endif // ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_bf16fp32_mmla_6x16",
[](const GemmArgs &args) { return args._ci->has_bf16(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_bf16fp32_mmla_6x16, bfloat16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_mmla_6x16, bfloat16, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_bf16fp32_mmla_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_mmla_6x16, bfloat16, float>(args); }
),
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_bf16fp32_mmla_8x12",
[](const GemmArgs &args) { return args._ci->has_bf16() && (args._Ksize>4); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
),
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_bf16fp32_dot_6x16",
[](const GemmArgs &args) { return args._ci->has_bf16(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>(args); }
),
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_bf16fp32_dot_8x12",
[](const GemmArgs &args) { return args._ci->has_bf16() && (args._Ksize>2); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
),
#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_ffinterleaved_bf16fp32_mmla_8x12",
KernelWeightFormat::VL256_BL64,
@@ -199,7 +190,7 @@ GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
),
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_ffhybrid_bf16fp32_mmla_6x16",
KernelWeightFormat::VL256_BL64,
@@ -207,7 +198,7 @@ GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
[](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_bf16fp32_mmla_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_bf16fp32_mmla_6x16, bfloat16, float>(args); }
),
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_ffinterleaved_bf16fp32_dot_8x12",
KernelWeightFormat::VL128_BL32,
@@ -216,25 +207,15 @@ GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
),
#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
-GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_sgemm_8x12",
nullptr,
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>(args); }
),
-#elif defined(__arm__)
-{
- GemmMethod::GEMM_INTERLEAVED,
- "sgemm_8x6",
- nullptr,
- nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<sgemm_8x6, bfloat16, bfloat16, float>(args); }
-},
-#else
-# error "Unknown Architecture"
-#endif
#endif // ARM_COMPUTE_ENABLE_BF16
+#endif // __aarch64__
{
GemmMethod::DEFAULT,
"",
@@ -245,14 +226,14 @@ GemmImplementation<bfloat16, bfloat16, float>::with_estimate(
};
template<>
-const GemmImplementation<bfloat16, bfloat16, float> *gemm_implementation_list<bfloat16, bfloat16, float>() {
+const GemmImplementation<bfloat16, float> *gemm_implementation_list<bfloat16, float>() {
return gemm_bf16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<bfloat16, bfloat16, float> gemm<bfloat16, bfloat16, float, Nothing>(const GemmArgs &args, const Nothing &);
-template bool has_opt_gemm<bfloat16, bfloat16, float, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
-template KernelDescription get_gemm_method<bfloat16, bfloat16, float, Nothing>(const GemmArgs &args, const Nothing &);
-template std::vector<KernelDescription> get_compatible_kernels<bfloat16, bfloat16, float, Nothing>(const GemmArgs &args, const Nothing &);
+template UniqueGemmCommon<bfloat16, float> gemm<bfloat16, float, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<bfloat16, float, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<bfloat16, float, Nothing>(const GemmArgs &args, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<bfloat16, float, Nothing>(const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp
index 1e4de4a39e..aa761b46e4 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp
@@ -32,12 +32,12 @@
namespace arm_gemm {
-static const GemmImplementation<bfloat16, bfloat16, bfloat16> gemm_bf16bf16_methods[] =
+static const GemmImplementation<bfloat16, bfloat16> gemm_bf16bf16_methods[] =
{
#ifdef __aarch64__
#ifdef ARM_COMPUTE_ENABLE_BF16
#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
-GemmImplementation<bfloat16, bfloat16, bfloat16>::with_estimate(
+GemmImplementation<bfloat16, bfloat16>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_ffinterleaved_bf16fp32_mmla_8x12",
KernelWeightFormat::VL256_BL64,
@@ -45,7 +45,7 @@ GemmImplementation<bfloat16, bfloat16, bfloat16>::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, bfloat16>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, bfloat16>(args); }
),
-GemmImplementation<bfloat16, bfloat16, bfloat16>::with_estimate(
+GemmImplementation<bfloat16, bfloat16>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_ffinterleaved_bf16fp32_mmla_8x3VL",
KernelWeightFormat::VL2VL_BL64,
@@ -66,14 +66,14 @@ GemmImplementation<bfloat16, bfloat16, bfloat16>::with_estimate(
};
template<>
-const GemmImplementation<bfloat16, bfloat16, bfloat16> *gemm_implementation_list<bfloat16, bfloat16, bfloat16>() {
+const GemmImplementation<bfloat16, bfloat16> *gemm_implementation_list<bfloat16, bfloat16>() {
return gemm_bf16bf16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<bfloat16, bfloat16, bfloat16> gemm<bfloat16, bfloat16, bfloat16, Nothing>(const GemmArgs &args, const Nothing &);
-template bool has_opt_gemm<bfloat16, bfloat16, bfloat16, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
-template KernelDescription get_gemm_method<bfloat16, bfloat16, bfloat16, Nothing>(const GemmArgs &args, const Nothing &);
-template std::vector<KernelDescription> get_compatible_kernels<bfloat16, bfloat16, bfloat16, Nothing>(const GemmArgs &args, const Nothing &);
+template UniqueGemmCommon<bfloat16, bfloat16> gemm<bfloat16, bfloat16, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<bfloat16, bfloat16, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<bfloat16, bfloat16, Nothing>(const GemmArgs &args, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<bfloat16, bfloat16, Nothing>(const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index 0d9f53b84d..12bddf15e1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -23,7 +23,7 @@
*/
// This can only be built if the target/compiler supports FP16 arguments.
-#if defined(__aarch64__) && (defined(ENABLE_FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
#include "arm_gemm.hpp"
@@ -42,10 +42,12 @@
#include "kernels/a64_hgemm_8x24.hpp"
#include "kernels/a64_hybrid_fp16_mla_6x32.hpp"
#include "kernels/a64_sgemm_8x12.hpp"
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include "kernels/sme2_gemv_fp16fp32fp16_dot_16VL.hpp"
#include "kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp"
#include "kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp"
#include "kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SME2
#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
#include "kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp"
#include "kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp"
@@ -55,7 +57,7 @@
namespace arm_gemm {
-static const GemmImplementation<__fp16, __fp16, __fp16> gemm_fp16_methods[] = {
+static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
#ifdef ARM_COMPUTE_ENABLE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
{
@@ -71,7 +73,7 @@ static const GemmImplementation<__fp16, __fp16, __fp16> gemm_fp16_methods[] = {
[](const GemmArgs &args) { return args._ci->has_sme2(); },
[](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
return args._Nsize >= 8*VL || args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL, __fp16, __fp16, __fp16, Nothing, false, false, false, true>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL, __fp16, __fp16, Nothing, false, false, false, true>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
@@ -79,32 +81,32 @@ static const GemmImplementation<__fp16, __fp16, __fp16> gemm_fp16_methods[] = {
[](const GemmArgs &args) { return args._ci->has_sme2(); },
[](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL, __fp16, __fp16, __fp16, Nothing, false, false, false, true>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL, __fp16, __fp16, Nothing, false, false, false, true>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
"sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL",
[](const GemmArgs &args) { return args._ci->has_sme2(); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL, __fp16, __fp16, __fp16, Nothing, false, false, false, true>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL, __fp16, __fp16, Nothing, false, false, false, true>(args); }
},
#endif // ARM_COMPUTE_ENABLE_SME2
-GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
+GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_fp16_mla_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>(args); }
),
-GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
+GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_fp16_mla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize > 4); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>(args); }
),
#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
-GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
+GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_ffinterleaved_fp16_mla_8x3VL",
KernelWeightFormat::VL1VL_BL16,
@@ -112,7 +114,7 @@ GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_fp16_mla_8x3VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_fp16_mla_8x3VL, __fp16, __fp16>(args); }
),
-GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
+GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_ffhybrid_fp16_mla_6x4VL",
KernelWeightFormat::VL1VL_BL16,
@@ -123,22 +125,22 @@ GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
#endif // ARM_COMPUTE_ENABLE_SVE
#if defined(__aarch64__)
-GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
+GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_fp16_mla_6x32",
[](const GemmArgs &args) { return args._ci->has_fp16(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>(args); }
),
-GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
+GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_hgemm_8x24",
[](const GemmArgs &args) { return args._ci->has_fp16(); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>(args); }
),
#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
-GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
+GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_ffinterleaved_fp16_mla_8x24",
KernelWeightFormat::VL128_BL16,
@@ -146,7 +148,7 @@ GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_fp16_mla_8x24, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_fp16_mla_8x24, __fp16, __fp16>(args); }
),
-GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
+GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_ffhybrid_fp16_mla_6x32",
KernelWeightFormat::VL128_BL16,
@@ -160,7 +162,7 @@ GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
"a64_sgemm_8x12",
nullptr,
[](const GemmArgs &args) { return !args._ci->has_fp16(); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, __fp16, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, __fp16, __fp16>(args); }
},
#elif defined(__arm__)
{
@@ -168,7 +170,7 @@ GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
"sgemm_8x6",
nullptr,
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<sgemm_8x6, __fp16, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(args); }
},
#else // not AArch64 or AArch32
# error Unknown Architecture
@@ -183,16 +185,16 @@ GemmImplementation<__fp16, __fp16, __fp16>::with_estimate(
};
template<>
-const GemmImplementation<__fp16, __fp16, __fp16> *gemm_implementation_list<__fp16, __fp16, __fp16>() {
+const GemmImplementation<__fp16, __fp16> *gemm_implementation_list<__fp16, __fp16>() {
return gemm_fp16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<__fp16, __fp16, __fp16> gemm<__fp16, __fp16, __fp16, Nothing>(const GemmArgs &args, const Nothing &);
-template bool has_opt_gemm<__fp16, __fp16, __fp16, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
-template KernelDescription get_gemm_method<__fp16, __fp16, __fp16, Nothing>(const GemmArgs &args, const Nothing &);
-template std::vector<KernelDescription> get_compatible_kernels<__fp16, __fp16, __fp16, Nothing>(const GemmArgs &args, const Nothing &);
+template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<__fp16, __fp16, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<__fp16, __fp16, Nothing>(const GemmArgs &args, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<__fp16, __fp16, Nothing>(const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
-#endif // defined(__aarch64__) && (defined(ENABLE_FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+#endif // defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 5da7161671..0c1d3a387b 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -82,7 +82,7 @@
namespace arm_gemm {
-static const GemmImplementation<float, float, float> gemm_fp32_methods[] =
+static const GemmImplementation<float, float> gemm_fp32_methods[] =
{
// GEMV cases - starting with 'gemv_batched' wrapper to turn batched GEMV into GEMM.
{
@@ -95,27 +95,27 @@ static const GemmImplementation<float, float, float> gemm_fp32_methods[] =
#ifdef __aarch64__
#ifdef ARM_COMPUTE_ENABLE_BF16
// "fast mode" (BF16) kernels
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_bf16fp32_mmla_8x12",
[](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_fp32bf16fp32_mmla_6x16",
[](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_fp32bf16fp32_mmla_4x24",
[](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float>(args); }
),
#endif // ARM_COMPUTE_ENABLE_BF16
#ifdef ARM_COMPUTE_ENABLE_SVE
@@ -189,26 +189,26 @@ GemmImplementation<float, float, float>::with_estimate(
},
#endif // ARM_COMPUTE_ENABLE_SME2
#ifdef ARM_COMPUTE_ENABLE_BF16
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_bf16fp32_mmla_8x3VL",
[](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_fp32bf16fp32_mmla_6x4VL",
[](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_fp32bf16fp32_mmla_4x6VL",
[](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL, float, float>(args); }
),
#endif // ARM_COMPUTE_ENABLE_BF16
#ifdef ARM_COMPUTE_ENABLE_SVEF32MM
@@ -218,8 +218,8 @@ GemmImplementation<float, float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_fp32_mmla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_svef32mm() && (args._Ksize>4); },
- [](const GemmArgs &args) { return !(args._fast_mode && args._ci->has_svebf16()); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mmla_8x3VL, float, float, float>(args); }
+ [](const GemmArgs &args) { return !(args._fast_mode && args._ci->has_bf16()); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mmla_8x3VL, float, float>(args); }
},
#endif // ARM_COMPUTE_ENABLE_SVEF32MM
// SVE kernels
@@ -228,25 +228,25 @@ GemmImplementation<float, float, float>::with_estimate(
"sve_hybrid_fp32_mla_8x1VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
[](const GemmArgs &args) { return (args._Nsize < 12); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_8x1VL, float, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_8x1VL, float, float>(args); }
},
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_fp32_mla_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_fp32_mla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>(args); }
),
#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
#ifdef ARM_COMPUTE_ENABLE_BF16
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_ffinterleaved_bf16fp32_mmla_8x3VL",
KernelWeightFormat::VL2VL_BL64_BF16,
@@ -254,7 +254,7 @@ GemmImplementation<float, float, float>::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_ffhybrid_fp32bf16fp32_mmla_4x6VL",
KernelWeightFormat::VL2VL_BL64_BF16,
@@ -263,7 +263,7 @@ GemmImplementation<float, float, float>::with_estimate(
[](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp32bf16fp32_mmla_4x6VL, float, float>(args); }
),
#endif
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_ffinterleaved_fp32_mla_8x3VL",
KernelWeightFormat::VL1VL_BL32,
@@ -271,7 +271,7 @@ GemmImplementation<float, float, float>::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_fp32_mla_8x3VL, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_fp32_mla_8x3VL, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_ffhybrid_fp32_mla_6x4VL",
KernelWeightFormat::VL1VL_BL32,
@@ -287,7 +287,7 @@ GemmImplementation<float, float, float>::with_estimate(
"a64_sgemm_8x6",
nullptr,
[](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A35; },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x6, float, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x6, float, float>(args); }
},
// Arm® Neon™ hybrid methods
{
@@ -309,33 +309,33 @@ GemmImplementation<float, float, float>::with_estimate(
"a64_hybrid_fp32_mla_8x4",
nullptr,
[](const GemmArgs &args) { return (args._Nsize < 12); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_8x4, float, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_8x4, float, float>(args); }
},
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_fp32_mla_4x24",
nullptr,
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_4x24, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_4x24, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_4x24, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_4x24, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_fp32_mla_6x16",
nullptr,
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_sgemm_8x12",
nullptr,
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, float, float, float>::estimate_cycles<float>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, float, float, float>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, float, float>(args); }
),
#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
#ifdef ARM_COMPUTE_ENABLE_BF16
// "fast mode" (BF16) kernels
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_ffinterleaved_bf16fp32_mmla_8x12",
KernelWeightFormat::VL256_BL64_BF16,
@@ -343,7 +343,7 @@ GemmImplementation<float, float, float>::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_ffhybrid_fp32bf16fp32_mmla_4x24",
KernelWeightFormat::VL256_BL64_BF16,
@@ -351,7 +351,7 @@ GemmImplementation<float, float, float>::with_estimate(
[](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_4x24, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_4x24, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_ffhybrid_fp32bf16fp32_mmla_6x16",
KernelWeightFormat::VL256_BL64_BF16,
@@ -359,9 +359,8 @@ GemmImplementation<float, float, float>::with_estimate(
[](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_6x16, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_6x16, float, float>(args); }
),
-
#endif // BF16
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_ffinterleaved_fp32_mla_8x12",
KernelWeightFormat::VL128_BL32,
@@ -369,7 +368,7 @@ GemmImplementation<float, float, float>::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_fp32_mla_8x12, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_fp32_mla_8x12, float, float>(args); }
),
-GemmImplementation<float, float, float>::with_estimate(
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_ffhybrid_fp32_mla_6x16",
KernelWeightFormat::VL128_BL32,
@@ -386,7 +385,7 @@ GemmImplementation<float, float, float>::with_estimate(
"sgemm_8x6",
nullptr,
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<sgemm_8x6, float, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<sgemm_8x6, float, float>(args); }
},
#endif // __arm__
{
@@ -400,14 +399,14 @@ GemmImplementation<float, float, float>::with_estimate(
/* Templated function to return this list. */
template<>
-const GemmImplementation<float, float, float> *gemm_implementation_list<float, float, float>() {
+const GemmImplementation<float, float> *gemm_implementation_list<float, float>() {
return gemm_fp32_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<float, float, float> gemm<float, float, float, Nothing>(const GemmArgs &args, const Nothing &);
-template bool has_opt_gemm<float, float, float, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
-template KernelDescription get_gemm_method<float, float, float, Nothing>(const GemmArgs &args, const Nothing &);
-template std::vector<KernelDescription> get_compatible_kernels<float, float, float, Nothing> (const GemmArgs &args, const Nothing &);
+template UniqueGemmCommon<float, float> gemm<float, float, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<float, float, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<float, float, Nothing>(const GemmArgs &args, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<float, float, Nothing> (const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index 834751f1fe..a6c9677305 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -41,7 +41,7 @@ namespace arm_gemm {
// Implementation of the GemmCommon abstract class.
template<typename strategy, typename To, typename Tr>
-class GemmHybrid : public GemmCommon<To, To, Tr> {
+class GemmHybrid : public GemmCommon<To, Tr> {
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 8bbb877c1b..0cc4d4f3d9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -260,8 +260,8 @@ struct kernel_weight_format<strategy, false> {
} // anonymous namespace
// Implementation of the GemmCommon abstract class.
-template<typename strategy, typename To, typename Tw, typename Tr, typename OutputStage=Nothing, bool SeparateQuantize=false, bool FixedFormat=false>
-class GemmHybridIndirect : public GemmCommon<To, Tw, Tr> {
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool SeparateQuantize=false, bool FixedFormat=false>
+class GemmHybridIndirect : public GemmCommon<To, Tr> {
typedef typename strategy::lhs_operand_type Tloi;
typedef typename strategy::rhs_operand_type Troi;
typedef typename strategy::result_type Tri;
@@ -618,7 +618,7 @@ public:
return _args._nmulti * iceildiv(_args._Nsize, strategy::out_width());
}
- void requantize_bias(void *in_buffer, const Tw *B, const int ldb, const int B_multi_stride) override {
+ void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
if (std::is_same<OutputStage, Requantize32>::value) {
_col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -636,11 +636,11 @@ public:
return strat.transforms.PrepareB_supports_transpose();
}
- void pretranspose_B_array(void *in_buffer, const Tw *B, const int ldb, const int B_multi_stride, bool transposed) override {
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, transposed, 0, get_B_pretranspose_window_size());
}
- void pretranspose_B_array_part(void *in_buffer, const Tw *B, const int ldb, const int B_multi_stride, bool transposed, size_t start, size_t end) override {
+ void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed, size_t start, size_t end) override {
if (end >= get_B_pretranspose_window_size()) {
requantize_bias(in_buffer, B, ldb, B_multi_stride);
}
@@ -835,7 +835,7 @@ public:
};
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
-using GemmHybridIndirectFixedFormat = GemmHybridIndirect<strategy, To, To, Tr, OutputStage, false, true>;
+using GemmHybridIndirectFixedFormat = GemmHybridIndirect<strategy, To, Tr, OutputStage, false, true>;
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
index 62184bcbd1..f12efe4282 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
@@ -42,7 +42,7 @@ namespace arm_gemm {
// Implementation of the GemmCommon abstract class.
template<typename strategy, typename To, typename Tr>
-class GemmHybridQuantized : public GemmCommon<To, To, Tr> {
+class GemmHybridQuantized : public GemmCommon<To, Tr> {
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index 19d5e3e23d..db5155f500 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -35,17 +35,17 @@ namespace arm_gemm {
* of types, a static list of these structures is built up to describe the
* implementations available.
*/
-template<typename Tlop, typename Trop, typename Tret, class OutputStage = Nothing>
+template<typename Top, typename Tret, class OutputStage = Nothing>
struct GemmImplementation {
const GemmMethod method;
const char * name;
const KernelWeightFormat kernel_weight_format = KernelWeightFormat::NON_FIXED;
std::function<bool(const GemmArgs &, const OutputStage &)> is_supported = {};
std::function<uint64_t(const GemmArgs &, const OutputStage &)> cycle_estimate = {};
- std::function<GemmCommon<Tlop, Trop, Tret> *(const GemmArgs &, const OutputStage &)> instantiate = {};
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate = {};
bool do_is_supported(const GemmArgs &args, const OutputStage &os) const {
- // Check supplied is_supported() function first.
+ // Check supplied is_supported() function first.
if (is_supported != nullptr && !is_supported(args, os)) {
return false;
}
@@ -68,7 +68,7 @@ struct GemmImplementation {
// If we get here it means there is a config and it specifies a format. Check it matches this kernel.
// NOTE: this will execute SVE instructions if it's an SVE kernel, so it's important that is_supported()
// was called above first.
- return (args._cfg->weight_format == get_weight_format(kernel_weight_format, sizeof(Tlop)));
+ return (args._cfg->weight_format == get_weight_format(kernel_weight_format, sizeof(Top)));
}
}
@@ -80,13 +80,13 @@ struct GemmImplementation {
}
}
- GemmCommon<Tlop, Trop, Tret> *do_instantiate(const GemmArgs &args, const OutputStage &os) const {
+ GemmCommon<Top, Tret> *do_instantiate(const GemmArgs &args, const OutputStage &os) const {
return instantiate(args, os);
}
static GemmImplementation with_estimate(GemmMethod m, const char *n,
std::function<bool(const GemmArgs &, const OutputStage &)> is_supported, std::function<uint64_t(const GemmArgs &, const OutputStage &)> cycle_estimate,
- std::function<GemmCommon<Tlop, Trop, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) {
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) {
GemmImplementation impl(m,n);
impl.is_supported=is_supported;
@@ -103,14 +103,14 @@ struct GemmImplementation {
GemmImplementation(GemmMethod m, const char *n,
std::function<bool(const GemmArgs &, const OutputStage &)> is_supported, std::function<bool(const GemmArgs &, const OutputStage &)> is_recommended,
- std::function<GemmCommon<Tlop, Trop, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) :
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) :
method(m), name(n), is_supported(is_supported),
cycle_estimate( [is_recommended](const GemmArgs &args, const OutputStage &os) { return (is_recommended == nullptr) ? 0 : (is_recommended(args, os) ? 0 : UINT64_MAX); } ),
instantiate(instantiate) { }
GemmImplementation(GemmMethod m, const char *n, KernelWeightFormat kwf,
std::function<bool(const GemmArgs &, const OutputStage &)> is_supported, std::function<bool(const GemmArgs &, const OutputStage &)> is_recommended,
- std::function<GemmCommon<Tlop, Trop, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) :
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) :
method(m), name(n), kernel_weight_format(kwf), is_supported(is_supported),
cycle_estimate( [is_recommended](const GemmArgs &args, const OutputStage &os) { return (is_recommended == nullptr) ? 0 : (is_recommended(args, os) ? 0 : UINT64_MAX); } ),
instantiate(instantiate) { }
@@ -119,17 +119,17 @@ struct GemmImplementation {
/* Slightly different version of above for straightforward GEMMs with no
* output stage, so the std::functions there don't have to deal with the
* unnecessary second argument. */
-template<typename Tlop, typename Trop, typename Tret>
-struct GemmImplementation<Tlop, Trop, Tret, Nothing> {
+template<typename Top, typename Tret>
+struct GemmImplementation<Top, Tret, Nothing> {
const GemmMethod method;
const char * name;
const KernelWeightFormat kernel_weight_format = KernelWeightFormat::NON_FIXED;
std::function<bool(const GemmArgs &)> is_supported = {};
std::function<uint64_t(const GemmArgs &)> cycle_estimate = {};
- std::function<GemmCommon<Tlop, Trop, Tret> *(const GemmArgs &)> instantiate = {};
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate = {};
bool do_is_supported(const GemmArgs &args, const Nothing &) const {
- // Check supplied is_supported() function first.
+ // Check supplied is_supported() function first.
if (is_supported != nullptr && !is_supported(args)) {
return false;
}
@@ -152,7 +152,7 @@ struct GemmImplementation<Tlop, Trop, Tret, Nothing> {
// If we get here it means there is a config and it specifies a format. Check it matches this kernel.
// NOTE: this will execute SVE instructions if it's an SVE kernel, so it's important that is_supported()
// was called above first.
- return (args._cfg->weight_format == get_weight_format(kernel_weight_format, sizeof(Tlop)));
+ return (args._cfg->weight_format == get_weight_format(kernel_weight_format, sizeof(Top)));
}
}
@@ -164,13 +164,13 @@ struct GemmImplementation<Tlop, Trop, Tret, Nothing> {
}
}
- GemmCommon<Tlop, Trop, Tret> *do_instantiate(const GemmArgs &args, const Nothing &) const {
+ GemmCommon<Top, Tret> *do_instantiate(const GemmArgs &args, const Nothing &) const {
return instantiate(args);
}
static GemmImplementation with_estimate(GemmMethod m, const char *n,
std::function<bool(const GemmArgs &)> is_supported, std::function<uint64_t(const GemmArgs &)> cycle_estimate,
- std::function<GemmCommon<Tlop, Trop, Tret> *(const GemmArgs &)> instantiate) {
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) {
GemmImplementation impl(m,n);
impl.is_supported=is_supported;
@@ -182,7 +182,7 @@ struct GemmImplementation<Tlop, Trop, Tret, Nothing> {
static GemmImplementation with_estimate(GemmMethod m, const char *n, KernelWeightFormat f,
std::function<bool(const GemmArgs &)> is_supported, std::function<uint64_t(const GemmArgs &)> cycle_estimate,
- std::function<GemmCommon<Tlop, Trop, Tret> *(const GemmArgs &)> instantiate) {
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) {
GemmImplementation impl(m,n,f);
impl.is_supported=is_supported;
@@ -199,14 +199,14 @@ struct GemmImplementation<Tlop, Trop, Tret, Nothing> {
GemmImplementation(GemmMethod m, const char *n,
std::function<bool(const GemmArgs &)> is_supported, std::function<bool(const GemmArgs &)> is_recommended,
- std::function<GemmCommon<Tlop, Trop, Tret> *(const GemmArgs &)> instantiate) :
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) :
method(m), name(n), is_supported(is_supported),
cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ),
instantiate(instantiate) { }
GemmImplementation(GemmMethod m, const char *n, KernelWeightFormat kwf,
std::function<bool(const GemmArgs &)> is_supported, std::function<bool(const GemmArgs &)> is_recommended,
- std::function<GemmCommon<Tlop, Trop, Tret> *(const GemmArgs &)> instantiate) :
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) :
method(m), name(n), kernel_weight_format(kwf), is_supported(is_supported),
cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ),
instantiate(instantiate) { }
@@ -218,8 +218,8 @@ struct GemmImplementation<Tlop, Trop, Tret, Nothing> {
* A specialised version is provided for each supported combination of types.
* The end of the list is indicated by a sentinel descriptor with
* method==GemmMethod::DEFAULT. */
-template<typename Tlop, typename Trop, typename Tret, class OutputStage = Nothing>
-const GemmImplementation<Tlop, Trop, Tret, OutputStage> *gemm_implementation_list();
+template<typename Top, typename Tret, class OutputStage = Nothing>
+const GemmImplementation<Top, Tret, OutputStage> *gemm_implementation_list();
/*
* Select a GEMM implementation for the given arguments.
@@ -234,15 +234,15 @@ const GemmImplementation<Tlop, Trop, Tret, OutputStage> *gemm_implementation_lis
* this function returns false and doesn't touch the provided pointer
* reference.
*/
-template<typename Tlop, typename Trop, typename Tret, class OutputStage>
-bool find_implementation(const GemmArgs &args, const OutputStage &os, const GemmImplementation<Tlop, Trop, Tret, OutputStage> * &impl) {
- auto gemms = gemm_implementation_list<Tlop, Trop, Tret, OutputStage>();
+template<typename Top, typename Tret, class OutputStage>
+bool find_implementation(const GemmArgs &args, const OutputStage &os, const GemmImplementation<Top, Tret, OutputStage> * &impl) {
+ auto gemms = gemm_implementation_list<Top, Tret, OutputStage>();
const GemmConfig *cfg = args._cfg;
- const GemmImplementation<Tlop, Trop, Tret, OutputStage> *saved_impl = nullptr;
+ const GemmImplementation<Top, Tret, OutputStage> *saved_impl = nullptr;
uint64_t best_estimate = 0;
- for (const GemmImplementation<Tlop, Trop, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
+ for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
/* Skip if this implementation doesn't support these args. */
if (!i->do_is_supported(args, os)) {
continue;
@@ -284,17 +284,17 @@ bool find_implementation(const GemmArgs &args, const OutputStage &os, const Gemm
return false;
}
-template<typename Tlop, typename Trop, typename Tret, class OutputStage>
+template<typename Top, typename Tret, class OutputStage>
std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage &os) {
std::vector<KernelDescription> res;
/* Find out what the default implementation in so we can set the flag accordingly later. */
- const GemmImplementation<Tlop, Trop, Tret, OutputStage> *default_impl;
+ const GemmImplementation<Top, Tret, OutputStage> *default_impl;
find_implementation(args, os, default_impl);
- auto gemms = gemm_implementation_list<Tlop, Trop, Tret, OutputStage>();
+ auto gemms = gemm_implementation_list<Top, Tret, OutputStage>();
- for (const GemmImplementation<Tlop, Trop, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
+ for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
/* Check that this implementation supports the presented problem. */
if (!i->do_is_supported(args, os)) {
@@ -307,31 +307,31 @@ std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, cons
return res;
}
-template<typename Tlop, typename Trop, typename Tret, class OutputStage>
+template<typename Top, typename Tret, class OutputStage>
bool has_opt_gemm(WeightFormat &wf, const GemmArgs &args, const OutputStage &os) {
- const GemmImplementation<Tlop, Trop, Tret, OutputStage> *impl;
- const bool success = find_implementation<Tlop, Trop, Tret, OutputStage>(args, os, impl);
+ const GemmImplementation<Top, Tret, OutputStage> *impl;
+ const bool success = find_implementation<Top, Tret, OutputStage>(args, os, impl);
if (success)
- wf = UniqueGemmCommon<Tlop, Trop, Tret>(impl->do_instantiate(args, os))->get_config().weight_format;
+ wf = UniqueGemmCommon<Top, Tret>(impl->do_instantiate(args, os))->get_config().weight_format;
return success;
}
-template<typename Tlop, typename Trop, typename Tret, class OutputStage>
-UniqueGemmCommon<Tlop, Trop, Tret> gemm(const GemmArgs &args, const OutputStage &os) {
- const GemmImplementation<Tlop, Trop, Tret, OutputStage> *impl;
+template<typename Top, typename Tret, class OutputStage>
+UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage &os) {
+ const GemmImplementation<Top, Tret, OutputStage> *impl;
- if (find_implementation<Tlop, Trop, Tret, OutputStage>(args, os, impl)) {
- return UniqueGemmCommon<Tlop, Trop, Tret>(impl->do_instantiate(args, os));
+ if (find_implementation<Top, Tret, OutputStage>(args, os, impl)) {
+ return UniqueGemmCommon<Top, Tret>(impl->do_instantiate(args, os));
}
- return UniqueGemmCommon<Tlop, Trop, Tret>(nullptr);
+ return UniqueGemmCommon<Top, Tret>(nullptr);
}
-template<typename Tlop, typename Trop, typename Tret, class OutputStage>
+template<typename Top, typename Tret, class OutputStage>
KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage &os) {
- const GemmImplementation<Tlop, Trop, Tret, OutputStage> *impl;
+ const GemmImplementation<Top, Tret, OutputStage> *impl;
- if (find_implementation<Tlop, Trop, Tret>(args, os, impl)) {
+ if (find_implementation<Top, Tret>(args, os, impl)) {
return KernelDescription(impl->method, impl->name);
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index c44e7be4a3..befc1a58a3 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -32,13 +32,13 @@
namespace arm_gemm {
-static const GemmImplementation<int16_t, int16_t, int32_t> gemm_s16_methods[] = {
+static const GemmImplementation<int16_t, int32_t> gemm_s16_methods[] = {
{
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s16_8x12",
nullptr,
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int16_t, int16_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int16_t, int32_t>(args); }
},
{
GemmMethod::DEFAULT,
@@ -50,15 +50,15 @@ static const GemmImplementation<int16_t, int16_t, int32_t> gemm_s16_methods[] =
};
template<>
-const GemmImplementation<int16_t, int16_t, int32_t> *gemm_implementation_list<int16_t, int16_t, int32_t>() {
+const GemmImplementation<int16_t, int32_t> *gemm_implementation_list<int16_t, int32_t>() {
return gemm_s16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<int16_t, int16_t, int32_t> gemm<int16_t, int16_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
-template bool has_opt_gemm<int16_t, int16_t, int32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
-template KernelDescription get_gemm_method<int16_t, int16_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
-template std::vector<KernelDescription> get_compatible_kernels<int16_t, int16_t, int32_t, Nothing> (const GemmArgs &args, const Nothing &);
+template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<int16_t, int32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<int16_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<int16_t, int32_t, Nothing> (const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 464e2c6059..fedda3a47a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -54,7 +54,7 @@
namespace arm_gemm {
-static const GemmImplementation<int8_t, int8_t, int32_t> gemm_s8_methods[] = {
+static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
#ifdef ARM_COMPUTE_ENABLE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
// SME kernels
@@ -82,48 +82,48 @@ static const GemmImplementation<int8_t, int8_t, int32_t> gemm_s8_methods[] = {
[](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL, int8_t, int32_t>(args); }
},
#endif // ARM_COMPUTE_ENABLE_SME2
-GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8s32_mmla_6x4VL",
[](const GemmArgs &args) { return args._ci->has_svei8mm(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int32_t>(args); }
),
-GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_mmla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_svei8mm() && (args._Ksize>8); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>(args); }
),
-GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8s32_dot_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize>=16; },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int32_t>(args); }
),
-GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_dot_8x3VL",
[](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int32_t>(args); }
),
#endif // ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_s8s32_mmla_8x12",
[](const GemmArgs &args) { return args._ci->has_i8mm() && (args._Ksize>8); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int32_t>(args); }
),
-GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8s32_mmla_6x16",
[](const GemmArgs &args) { return args._ci->has_i8mm(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int32_t>(args); }
),
{
GemmMethod::GEMM_HYBRID,
@@ -144,29 +144,29 @@ GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
"a64_gemm_s16_8x12",
nullptr,
[](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && ((args._Msize > 28) || ((args._Msize % 8) > 4)); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int8_t, int8_t, int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int8_t, int32_t>(args); },
},
-GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8s32_dot_6x16",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int32_t>(args); }
),
-GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_8x12",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int32_t>(args); }
),
-GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_4x4",
nullptr,
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int32_t>(args); }
),
{
@@ -179,15 +179,15 @@ GemmImplementation<int8_t, int8_t, int32_t>::with_estimate(
};
template<>
-const GemmImplementation<int8_t, int8_t, int32_t> *gemm_implementation_list<int8_t, int8_t, int32_t>() {
+const GemmImplementation<int8_t, int32_t> *gemm_implementation_list<int8_t, int32_t>() {
return gemm_s8_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<int8_t, int8_t, int32_t> gemm<int8_t, int8_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
-template bool has_opt_gemm<int8_t, int8_t, int32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
-template KernelDescription get_gemm_method<int8_t, int8_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
-template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, int32_t, Nothing> (const GemmArgs &args, const Nothing &);
+template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<int8_t, int32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<int8_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int32_t, Nothing> (const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 5214a71cce..897ec9d05f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -62,12 +62,12 @@ namespace {
template<bool MergeStep, bool FixedFormat, typename OutputStage>
class kernel_and_merge {
public:
- template<typename strategy, typename Tlo, typename Tro, typename Tr, typename Tri, typename Tab>
+ template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
static void run (
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t b_stride, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias,
@@ -76,12 +76,12 @@ public:
// Run a kernel and call the separate merge step
template<>
-template<typename strategy, typename Tlo, typename Tro, typename Tr, typename Tri, typename Tab>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, false, Nothing>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
@@ -106,12 +106,12 @@ void kernel_and_merge<true, false, Nothing>::run(
// Run a fixed-format kernel and call the separate merge step
template<>
-template<typename strategy, typename Tlo, typename Tro, typename Tr, typename Tri, typename Tab>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, true, Nothing>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t b_stride, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
@@ -136,12 +136,12 @@ void kernel_and_merge<true, true, Nothing>::run(
// Run a kernel with integrated merge
template<>
-template<typename strategy, typename Tlo, typename Tro, typename Tr, typename Tri, typename Tab>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, false, Nothing>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t, Tri *,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
unsigned int n_0, unsigned int n_max, const Tr *biasptr,
const Activation &act, bool accumulate, const Nothing &, const int32_t *,
@@ -175,12 +175,12 @@ void kernel_and_merge<false, false, Nothing>::run(
// Run a kernel with integrated merge, quantizing
template<>
-template<typename strategy, typename Tlo, typename Tro, typename Tr, typename Tri, typename Tab>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, false, Requantize32>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t, Tri *,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
unsigned int n_0, unsigned int n_max, const Tr *,
const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias,
@@ -211,12 +211,12 @@ void kernel_and_merge<false, false, Requantize32>::run(
// Run a kernel and call the separate quantize step
template<>
-template<typename strategy, typename Tlo, typename Tro, typename Tr, typename Tri, typename Tab>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, false, Requantize32>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *,
const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias,
@@ -257,12 +257,12 @@ void kernel_and_merge<true, false, Requantize32>::run(
// Run a kernel with integrated merge, dequantizing to FP32
template<>
-template<typename strategy, typename Tlo, typename Tro, typename Tr, typename Tri, typename Tab>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, false, DequantizeFloat>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t, Tri *,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
unsigned int n_0, unsigned int n_max, const Tr *bias,
const Activation &act, bool accumulate, const DequantizeFloat &dq, const int32_t *col_bias,
@@ -294,12 +294,12 @@ void kernel_and_merge<false, false, DequantizeFloat>::run(
}
template<>
-template<typename strategy, typename Tlo, typename Tro, typename Tr, typename Tri, typename Tab>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, false, DequantizeFloat>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *bias,
const Activation &act, bool accumulate, const DequantizeFloat &qp, const int32_t *,
@@ -394,21 +394,21 @@ struct get_stripe_width<strategy, true> {
};
// KernelWeightFormat is a similar story.
-template<typename strategy, bool FixedFormat, typename Tro>
+template<typename strategy, bool FixedFormat, typename To>
struct get_kernel_weight_format {
static KernelWeightFormat get() {
return KernelWeightFormat::NON_FIXED;
}
};
-template<typename strategy, typename Tro>
-struct get_kernel_weight_format<strategy, true, Tro> {
+template<typename strategy, typename To>
+struct get_kernel_weight_format<strategy, true, To> {
static KernelWeightFormat get() {
KernelWeightFormat kwf = strategy::kernel_weight_format();
// If we are using a BF16 kernel to do an FP32 problem (fast mode) then we need to set the BF16 flag on the
// weight format.
- if (std::is_same<Tro, float>::value && std::is_same<typename strategy::rhs_operand_type, bfloat16>::value) {
+ if (std::is_same<To, float>::value && std::is_same<typename strategy::operand_type, bfloat16>::value) {
uint32_t kwf_i = static_cast<uint32_t>(kwf);
kwf_i |= 0x10;
kwf = static_cast<KernelWeightFormat>(kwf_i);
@@ -420,10 +420,9 @@ struct get_kernel_weight_format<strategy, true, Tro> {
} // anonymous namespace
-template<typename strategy, typename Tlo, typename Tro, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool FixedFormat=false, bool ForceThreadColumns=false, bool ForceFloatAccumulate=false>
-class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
- typedef typename strategy::lhs_operand_type Tloi;
- typedef typename strategy::rhs_operand_type Troi;
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool FixedFormat=false, bool ForceThreadColumns=false, bool ForceFloatAccumulate=false>
+class GemmInterleaved : public GemmCommon<To, Tr> {
+ typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
typedef typename accumulate_buffer_type<strategy, OutputStage, ForceFloatAccumulate>::type Tab;
@@ -454,7 +453,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
unsigned int _Mround=0;
/* Working space, pretransposed buffer, buffer manager */
- const Troi *_B_transposed=nullptr;
+ const Toi *_B_transposed=nullptr;
void *_working_space=nullptr;
Tab *_accumulation_buffer=nullptr;
@@ -466,10 +465,10 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
int32_t *col_bias = nullptr;
/* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
- const Tlo * const * const * _indirect_buf = nullptr;
+ const To * const * const * _indirect_buf = nullptr;
/* Convolver - only set up for convolution problems, so also doubles as a flag. */
- std::unique_ptr<convolver<Tlo>> _convolver = nullptr;
+ std::unique_ptr<convolver<To>> _convolver = nullptr;
unsigned int get_col_sum_size() const {
if (std::is_same<OutputStage, Requantize32>::value) {
@@ -484,7 +483,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
class blockwalker {
private:
/* Size loops, etc. based on our parent's configuration */
- const GemmInterleaved<strategy, Tlo, Tro, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns, ForceFloatAccumulate> &_parent;
+ const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns, ForceFloatAccumulate> &_parent;
/* K, X and multi parameters for current iteration. */
unsigned int _k0=0, _x0=0, _multi=0;
@@ -499,9 +498,9 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
bool _newmulti=true;
public:
- blockwalker(const GemmInterleaved<strategy, Tlo, Tro, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns, ForceFloatAccumulate> &parent) : _parent(parent) { }
+ blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns, ForceFloatAccumulate> &parent) : _parent(parent) { }
- blockwalker(const GemmInterleaved<strategy, Tlo, Tro, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns, ForceFloatAccumulate> &parent,
+ blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns, ForceFloatAccumulate> &parent,
unsigned int x_start, unsigned int x_end) : _parent(parent), _x0 (_x_start), _x_start(x_start), _x_end(x_end) { }
unsigned int xmax() {
@@ -555,7 +554,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
unsigned int k_depth = _k_block;
if (std::is_same<OutputStage, Requantize32>::value) {
- k_depth += sizeof(int32_t) / sizeof(Tloi);
+ k_depth += sizeof(int32_t) / sizeof(Toi);
}
return k_depth;
@@ -565,10 +564,10 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
size_t get_a_working_size() const {
if (_thread_columns) {
// For 2D threading: allocate a buffer of one block of rows per thread
- return ROUND_UP(sizeof(Tloi) * get_total_k_depth() * strategy::out_height() * _maxthreads);
+ return ROUND_UP(sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads);
} else {
// For 1D threaded: one of these needed, regardless of thread count. Divided according to window.
- return ROUND_UP(sizeof(Tloi) * get_total_k_depth() * _Mround * _nbatches);
+ return ROUND_UP(sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches);
}
}
@@ -693,7 +692,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
}
// Don't bother to block below this size threshold (1.25X target size)
- unsigned int scaling_threshold = ((target_bytes_per_block * 5) / 4) / sizeof(Tloi);
+ unsigned int scaling_threshold = ((target_bytes_per_block * 5) / 4) / sizeof(Toi);
if (get_ktotal(args) <= scaling_threshold) {
return get_ktotal(args);
@@ -701,7 +700,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
// Once we are blocking, this (lower) threshold determines when we should use more blocks
// NOTE: Could be that some factor-based solution would work better here.
- unsigned int max_block_size = target_bytes_per_block / sizeof(Tloi);
+ unsigned int max_block_size = target_bytes_per_block / sizeof(Toi);
unsigned int num_k_blocks = iceildiv(get_ktotal(args), max_block_size);
@@ -714,7 +713,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
// k_block: Find out how much of the larger array can be loaded into half the cache.
// This should account for associative caches.
- k_block = (L1_size / 2) / (sizeof(Tloi) * (std::max(strategy::out_width(), strategy::out_height())));
+ k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
// Needs to be (at least a single) multiple of the K unroll level.
k_block /= strategy::k_unroll();
@@ -762,14 +761,14 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
// x_block: Work out how many rows (of length k_block) will fit in the L2
// Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
const unsigned int scaled_l2_size = (L2_size * 9) / 10;
- const unsigned int k_block_area = k_block * sizeof(Tloi) * (strategy::out_width() + strategy::out_height());
+ const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
// .. if the L1 contents is bigger than the L2, just return a minimal size block.
if (k_block_area > scaled_l2_size) {
return strategy::out_width();
}
- x_block = (scaled_l2_size - k_block_area) / (sizeof(Tloi) * k_block);
+ x_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);
// Needs to be (at least a single) multiple of the kernel output width.
x_block /= strategy::out_width();
@@ -867,8 +866,8 @@ public:
const auto end_x = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize);
Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
- Tloi * const a_panel = reinterpret_cast<Tloi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
- (threadid * sizeof(Tloi) * get_total_k_depth() * strategy::out_height()));
+ Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
+ (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height()));
for (unsigned int multi=0; multi<_nmulti; multi++) {
for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
@@ -885,8 +884,8 @@ public:
// Figure out how many "K" the kernel will actually process.
unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());
- const Troi *b_ptr = FixedFormat ?
- reinterpret_cast<const Troi *>(this->_Bptr) + (multi * this->_B_multi_stride) +
+ const Toi *b_ptr = FixedFormat ?
+ reinterpret_cast<const Toi *>(this->_Bptr) + (multi * this->_B_multi_stride) +
((start_x / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
(k0 * get_stripe_width<strategy, FixedFormat>::get()) :
_B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);
@@ -900,7 +899,7 @@ public:
// Set up transposed 'A' block
{
#ifdef CYCLE_PROFILING
- auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Tloi));
+ auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Toi));
#endif
// See comment above on transform_type<> class: this extracts either 'transforms' or
// 'transforms_quantized' as appropriate.
@@ -968,10 +967,10 @@ public:
// (one per thread) first, followed by the (window-divided) A
// buffer.
// Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
- Tloi * const a_panel = reinterpret_cast<Tloi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
+ Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
- const Troi *b_panel;
+ const Toi *b_panel;
b_panel = _B_transposed;
// newkblock() is always true on the first iteration, so these will be set properly on the first loop.
@@ -990,7 +989,7 @@ public:
for (;!current.done();current.advance()) {
if (current.newkblock()) {
#ifdef CYCLE_PROFILING
- auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Tloi));
+ auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
#endif
// See comment above on transform_type<> class: this extracts either 'transforms' or
// 'transforms_quantized' as appropriate.
@@ -1026,7 +1025,7 @@ public:
// larger than the (rounded) K value.
if(std::is_same<OutputStage, Requantize32>::value) {
- a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Tloi));
+ a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi));
} else {
a_panel_stride = kern_k;
}
@@ -1034,7 +1033,7 @@ public:
// For FixedFormat cases, figure out the B pointer. The loop below moves through batches and vertically through the output so this will be the same throughout.
if (FixedFormat) {
- b_panel = reinterpret_cast<const Troi *>(this->_Bptr) + (current.multi() * this->_B_multi_stride) +
+ b_panel = reinterpret_cast<const Toi *>(this->_Bptr) + (current.multi() * this->_B_multi_stride) +
((current.x0() / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
(current.k0() * get_stripe_width<strategy, FixedFormat>::get());
}
@@ -1044,7 +1043,7 @@ public:
unsigned int first_m = (batch == batch_0) ? m_0 : 0;
unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
- const Tloi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();
+ const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();
if (first_m >= last_m)
continue;
@@ -1166,7 +1165,7 @@ public:
unsigned int x_size = roundup(_Nsize, strategy::out_width());
- return (x_size * _Ktotal * _nmulti * sizeof(Troi)) + get_col_sum_size();
+ return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
}
size_t get_B_pretranspose_window_size() const override {
@@ -1176,7 +1175,7 @@ public:
return n_blocks * k_blocks * _nmulti;
}
- void requantize_bias(void *in_buffer, const Tro *B, const int ldb, const int B_multi_stride) override {
+ void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
if (std::is_same<OutputStage, Requantize32>::value) {
col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -1196,11 +1195,11 @@ public:
return transforms.PrepareB_supports_transpose();
}
- void pretranspose_B_array(void *in_buffer, const Tro *B, const int ldb, const int B_multi_stride, const bool transposed) override {
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, const bool transposed) override {
pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, transposed, 0, get_B_pretranspose_window_size());
}
- void pretranspose_B_array_part(void *in_buffer, const Tro *B, const int ldb, const int B_multi_stride, const bool transposed, size_t start, size_t end) override {
+ void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, const bool transposed, size_t start, size_t end) override {
// Perform column sums etc as part of the last block.
if (end >= get_B_pretranspose_window_size()) {
requantize_bias(in_buffer, B, ldb, B_multi_stride);
@@ -1208,7 +1207,7 @@ public:
// Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
- Troi *buffer = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
+ Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
_B_transposed = buffer;
blockwalker current(*this);
@@ -1293,7 +1292,7 @@ public:
void set_pretransposed_B_data(void *in_buffer) override {
// Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
- _B_transposed = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
+ _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
col_bias = reinterpret_cast<int32_t *>(in_buffer);
}
@@ -1313,14 +1312,14 @@ public:
}
}
- void set_indirect_parameters(size_t string_len, const Tlo * const * const *ptr) override {
+ void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
assert(string_len == _Ksize);
_indirect_buf = ptr;
}
void set_convolution_parameters(ConvolutionParameters parms) override {
assert(parms.input_channels == _Ksize);
- _convolver = std::unique_ptr<convolver<Tlo>>(new convolver<Tlo>(parms));
+ _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
}
// Estimate cycles for given problem given provided parameters
@@ -1331,7 +1330,7 @@ public:
const PerformanceParameters &params = strategy::template get_performance_parameters<perf_type>(args._ci);
uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);
- uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * get_ktotal(args) * sizeof(Tloi);
+ uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * get_ktotal(args) * sizeof(Toi);
uint64_t merge_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * args._Msize * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
@@ -1358,7 +1357,7 @@ public:
c.inner_block_size = _k_block;
c.outer_block_size = _x_block;
c.filter = get_type_name<strategy>();
- c.weight_format = get_weight_format(get_kernel_weight_format<strategy, FixedFormat, Tro>::get(), sizeof(Tro));
+ c.weight_format = get_weight_format(get_kernel_weight_format<strategy, FixedFormat, To>::get(), sizeof(To));
return c;
}
@@ -1366,21 +1365,21 @@ public:
// Aliases for the variations
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
-using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, To, Tr, OutputStage, false>;
+using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>;
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
-using GemmInterleavedFixedFormat = GemmInterleaved<strategy, To, To, Tr, OutputStage, true, true>;
+using GemmInterleavedFixedFormat = GemmInterleaved<strategy, To, Tr, OutputStage, true, true>;
template<typename strategy, typename To, typename Tr>
-using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, To, Tr, Requantize32, false>;
+using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>;
-template<typename strategy, typename Tlo, typename Tro, typename Tr>
-using GemmInterleavedQuantized = GemmInterleaved<strategy, Tlo, Tro, Tr, Requantize32>;
+template<typename strategy, typename To, typename Tr>
+using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>;
template<typename strategy, typename To, typename Tr>
-using GemmInterleavedNoMergeDequantized = GemmInterleaved<strategy, To, To, Tr, DequantizeFloat, false>;
+using GemmInterleavedNoMergeDequantized = GemmInterleaved<strategy, To, Tr, DequantizeFloat, false>;
-template<typename strategy, typename Tlo, typename Tro, typename Tr>
-using GemmInterleavedDequantized = GemmInterleaved<strategy, Tlo, Tro, Tr, DequantizeFloat>;
+template<typename strategy, typename To, typename Tr>
+using GemmInterleavedDequantized = GemmInterleaved<strategy, To, Tr, DequantizeFloat>;
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_q8_mixed.cpp b/src/core/NEON/kernels/arm_gemm/gemm_q8_mixed.cpp
deleted file mode 100644
index a48244cb3c..0000000000
--- a/src/core/NEON/kernels/arm_gemm/gemm_q8_mixed.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "arm_gemm.hpp"
-
-#include "kernels/a64_hybrid_u8s8qa_dot_4x16.hpp"
-#include "kernels/a64_hybrid_u8s8qa_mmla_4x16.hpp"
-#include "kernels/a64_hybrid_u8s8s32_dot_6x16.hpp"
-#include "kernels/a64_hybrid_u8s8s32_mmla_6x16.hpp"
-#include "kernels/a64_interleaved_u8s8s32_mmla_8x12.hpp"
-
-#ifdef ARM_COMPUTE_ENABLE_SVE
-#include "kernels/sve_hybrid_u8s8qa_dot_4x4VL.hpp"
-#include "kernels/sve_interleaved_u8s8s32_mmla_8x3VL.hpp"
-#include "kernels/sve_hybrid_u8s8s32_mmla_6x4VL.hpp"
-#include "kernels/sve_hybrid_u8s8qa_mmla_4x4VL.hpp"
-#endif // ARM_COMPUTE_ENABLE_SVE
-
-#include "gemm_hybrid_indirect.hpp"
-#include "gemm_hybrid_quantized.hpp"
-#include "gemm_interleaved.hpp"
-#include "gemv_pretransposed.hpp"
-#include "quantize_wrapper.hpp"
-#include "utils.hpp"
-
-namespace arm_gemm {
-
-static const GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32> gemm_q8_mixed_methods[] =
-{
-#ifdef ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32>::with_estimate(
- GemmMethod::GEMM_HYBRID,
- "sve_hybrid_u8s8qa_mmla_4x4VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8s8qa_mmla_4x4VL, uint8_t, int8_t, uint8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8s8qa_mmla_4x4VL, uint8_t, int8_t, uint8_t, Requantize32>(args, qp); }
-),
-GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32>::with_estimate(
- GemmMethod::GEMM_INTERLEAVED,
- "sve_interleaved_u8s8s32_mmla_8x3VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_u8s8s32_mmla_8x3VL, uint8_t, int8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8s8s32_mmla_8x3VL, uint8_t, int8_t, uint8_t>(args, qp); }
-),
-GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32>::with_estimate(
- GemmMethod::GEMM_INTERLEAVED,
- "sve_hybrid_u8s8s32_mmla_6x4VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8s8s32_mmla_6x4VL, uint8_t, int8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8s8s32_mmla_6x4VL, uint8_t, int8_t, uint8_t, Requantize32, true>(args, qp); }
-),
-GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32>::with_estimate(
- GemmMethod::GEMM_HYBRID,
- "sve_hybrid_u8s8qa_dot_4x4VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8s8qa_dot_4x4VL, uint8_t, int8_t, uint8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8s8qa_dot_4x4VL, uint8_t, int8_t, uint8_t, Requantize32>(args, qp); }
-),
-#endif // ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32>::with_estimate(
- GemmMethod::GEMM_HYBRID,
- "a64_hybrid_u8s8qa_mmla_4x16",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8s8qa_mmla_4x16, uint8_t, int8_t, uint8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8s8qa_mmla_4x16, uint8_t, int8_t, uint8_t, Requantize32>(args, qp); }
-),
-GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32>::with_estimate(
- GemmMethod::GEMM_INTERLEAVED,
- "a64_interleaved_u8s8s32_mmla_8x12",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_interleaved_u8s8s32_mmla_8x12, uint8_t, int8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_u8s8s32_mmla_8x12, uint8_t, int8_t, uint8_t>(args, qp); }
-),
-GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32>::with_estimate(
- GemmMethod::GEMM_INTERLEAVED,
- "a64_hybrid_u8s8s32_mmla_6x16",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8s8s32_mmla_6x16, uint8_t, int8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8s8s32_mmla_6x16, uint8_t, int8_t, uint8_t, Requantize32, true>(args, qp); }
-),
-GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32>::with_estimate(
- GemmMethod::GEMM_HYBRID,
- "a64_hybrid_u8s8qa_dot_4x16",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_svei8mm() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8s8qa_dot_4x16, uint8_t, int8_t, uint8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8s8qa_dot_4x16, uint8_t, int8_t, uint8_t, Requantize32>(args, qp); }
-),
-GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32>::with_estimate(
- GemmMethod::GEMM_HYBRID,
- "a64_hybrid_u8s8s32_dot_6x16",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._ci->has_i8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8s8s32_dot_6x16, uint8_t, int8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8s8s32_dot_6x16, uint8_t, int8_t, uint8_t, Requantize32, true>(args, qp); }
-),
-{
- GemmMethod::DEFAULT,
- "",
- nullptr,
- nullptr,
- nullptr
-}
-};
-
-template<>
-const GemmImplementation<uint8_t, int8_t, uint8_t, Requantize32> *gemm_implementation_list<uint8_t, int8_t, uint8_t, Requantize32>() {
- return gemm_q8_mixed_methods;
-}
-
-template UniqueGemmCommon<uint8_t, int8_t, uint8_t> gemm<uint8_t, int8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
-template bool has_opt_gemm<uint8_t, int8_t, uint8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &);
-template KernelDescription get_gemm_method<uint8_t, int8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
-template std::vector<KernelDescription> get_compatible_kernels<uint8_t, int8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
index 18008e713e..321c97262f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
@@ -58,6 +58,7 @@
#include "gemm_hybrid_indirect.hpp"
#include "gemm_hybrid_quantized.hpp"
+#include "gemm_hybrid_quantized_inline.hpp"
#include "gemm_interleaved.hpp"
#include "gemv_pretransposed.hpp"
#include "quantize_wrapper.hpp"
@@ -65,7 +66,7 @@
namespace arm_gemm {
-static const GemmImplementation<int8_t, int8_t, int8_t, Requantize32> gemm_qint8_methods[] =
+static const GemmImplementation<int8_t, int8_t, Requantize32> gemm_qint8_methods[] =
{
#ifdef ARM_COMPUTE_ENABLE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
@@ -100,90 +101,90 @@ static const GemmImplementation<int8_t, int8_t, int8_t, Requantize32> gemm_qint8
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL, int8_t, int8_t>(args, qp); }
},
#endif // ARM_COMPUTE_ENABLE_SME2
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8qa_mmla_4x4VL",
[](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qa_mmla_4x4VL, int8_t, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_mmla_4x4VL, int8_t, int8_t, int8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qa_mmla_4x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_mmla_4x4VL, int8_t, int8_t, Requantize32>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8qs_mmla_6x4VL",
[](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_symmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qs_mmla_6x4VL, int8_t, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_mmla_6x4VL, int8_t, int8_t, int8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qs_mmla_6x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_mmla_6x4VL, int8_t, int8_t, Requantize32>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_mmla_8x3VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t, int8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_hybrid_s8s32_mmla_6x4VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int8_t, int8_t, Requantize32, true>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int8_t, Requantize32, true>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8qs_dot_6x4VL",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_symmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, int8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, Requantize32>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8qa_dot_4x4VL",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, int8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, Requantize32>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8s32_dot_6x4VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, int8_t, Requantize32, true>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, Requantize32, true>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_dot_8x3VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t, int8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t>(args, qp); }
),
#endif // ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8qa_mmla_4x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qa_mmla_4x16, int8_t, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_mmla_4x16, int8_t, int8_t, int8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qa_mmla_4x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_mmla_4x16, int8_t, int8_t, Requantize32>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8qs_mmla_6x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_symmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qs_mmla_6x16, int8_t, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_mmla_6x16, int8_t, int8_t, int8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qs_mmla_6x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_mmla_6x16, int8_t, int8_t, Requantize32>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_s8s32_mmla_8x12",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t, int8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_hybrid_s8s32_mmla_6x16",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int8_t, int8_t, Requantize32, true>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int8_t, Requantize32, true>(args, qp); }
),
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
@@ -204,42 +205,42 @@ GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
"a64_gemm_s16_8x12",
nullptr,
[](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() == CPUModel::A53 && ((args._Msize > 28) || ((args._Msize % 8) > 4)); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s16_8x12, int8_t, int8_t, int8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s16_8x12, int8_t, int8_t>(args, qp); }
},
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8qs_dot_6x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_symmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, int8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8qa_dot_4x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, int8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8s32_dot_6x16",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, int8_t, Requantize32, true>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_8x12",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t, int8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_4x4",
nullptr,
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t, int8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t>(args, qp); }
),
{
GemmMethod::QUANTIZE_WRAPPER,
@@ -258,14 +259,14 @@ GemmImplementation<int8_t, int8_t, int8_t, Requantize32>::with_estimate(
};
template<>
-const GemmImplementation<int8_t, int8_t, int8_t, Requantize32> *gemm_implementation_list<int8_t, int8_t, int8_t, Requantize32>() {
+const GemmImplementation<int8_t, int8_t, Requantize32> *gemm_implementation_list<int8_t, int8_t, Requantize32>() {
return gemm_qint8_methods;
}
-template UniqueGemmCommon<int8_t, int8_t, int8_t> gemm<int8_t, int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
-template bool has_opt_gemm<int8_t, int8_t, int8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os);
-template KernelDescription get_gemm_method<int8_t, int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
-template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
+template UniqueGemmCommon<int8_t, int8_t> gemm<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
+template bool has_opt_gemm<int8_t, int8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os);
+template KernelDescription get_gemm_method<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
index 7c182b6777..93eecf991e 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
@@ -54,13 +54,14 @@
#include "gemm_hybrid_indirect.hpp"
#include "gemm_hybrid_quantized.hpp"
+#include "gemm_hybrid_quantized_inline.hpp"
#include "gemm_interleaved.hpp"
#include "gemv_pretransposed.hpp"
#include "quantize_wrapper.hpp"
namespace arm_gemm {
-static const GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32> gemm_quint8_methods[] =
+static const GemmImplementation<uint8_t, uint8_t, Requantize32> gemm_quint8_methods[] =
{
#ifdef ARM_COMPUTE_ENABLE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
@@ -96,69 +97,69 @@ static const GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32> gemm_qu
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL, uint8_t, uint8_t>(args, qp); }
},
#endif // ARM_COMPUTE_ENABLE_SME2
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8qa_mmla_4x4VL",
[](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8qa_mmla_4x4VL, uint8_t, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_mmla_4x4VL, uint8_t, uint8_t, uint8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8qa_mmla_4x4VL, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_mmla_4x4VL, uint8_t, uint8_t, Requantize32>(args, qp); }
),
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_mmla_8x3VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t, uint8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t>(args, qp); }
),
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_hybrid_u8u32_mmla_6x4VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint8_t, Requantize32, true>(args, qp); }
),
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8qa_dot_4x4VL",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, uint8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, Requantize32>(args, qp); }
),
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8u32_dot_6x4VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, Requantize32, true>(args, qp); }
),
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_dot_8x3VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t, uint8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t>(args, qp); }
),
#endif // ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8qa_mmla_4x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8qa_mmla_4x16, uint8_t, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_mmla_4x16, uint8_t, uint8_t, uint8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8qa_mmla_4x16, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_mmla_4x16, uint8_t, uint8_t, Requantize32>(args, qp); }
),
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_u8u32_mmla_8x12",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t, uint8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t>(args, qp); }
),
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_hybrid_u8u32_mmla_6x16",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint8_t, Requantize32, true>(args, qp); }
),
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
@@ -179,35 +180,35 @@ GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
"a64_gemm_u16_8x12",
nullptr,
[](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u16_8x12, uint8_t, uint8_t, uint8_t>(args, qp); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u16_8x12, uint8_t, uint8_t>(args, qp); },
},
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8qa_dot_4x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, uint8_t, Requantize32>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, Requantize32>(args, qp); }
),
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8u32_dot_6x16",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, Requantize32, true>(args, qp); }
),
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_8x12",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t, uint8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t>(args, qp); }
),
-GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_4x4",
nullptr,
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t, uint8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t>(args, qp); }
),
{
GemmMethod::QUANTIZE_WRAPPER,
@@ -226,14 +227,14 @@ GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32>::with_estimate(
};
template<>
-const GemmImplementation<uint8_t, uint8_t, uint8_t, Requantize32> *gemm_implementation_list<uint8_t, uint8_t, uint8_t, Requantize32>() {
+const GemmImplementation<uint8_t, uint8_t, Requantize32> *gemm_implementation_list<uint8_t, uint8_t, Requantize32>() {
return gemm_quint8_methods;
}
-template UniqueGemmCommon<uint8_t, uint8_t, uint8_t> gemm<uint8_t, uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
-template bool has_opt_gemm<uint8_t, uint8_t, uint8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os);
-template KernelDescription get_gemm_method<uint8_t, uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
-template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
+template UniqueGemmCommon<uint8_t, uint8_t> gemm<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
+template bool has_opt_gemm<uint8_t, uint8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os);
+template KernelDescription get_gemm_method<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
index 1d995a87b5..38d9b763f6 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
@@ -48,7 +48,7 @@
#include <vector>
namespace arm_gemm {
-static const GemmImplementation<int8_t, int8_t, float, DequantizeFloat> gemm_s8fp32_methods[] =
+static const GemmImplementation<int8_t, float, DequantizeFloat> gemm_s8fp32_methods[] =
{
#ifdef ARM_COMPUTE_ENABLE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
@@ -76,48 +76,48 @@ static const GemmImplementation<int8_t, int8_t, float, DequantizeFloat> gemm_s8f
[](const GemmArgs &args, const DequantizeFloat &dq) { return new GemmInterleavedNoMergeDequantized<cls_sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL, int8_t, float>(args, dq); }
},
#endif // ARM_COMPUTE_ENABLE_SME2
-GemmImplementation<int8_t, int8_t, float, DequantizeFloat>::with_estimate(
+GemmImplementation<int8_t, float, DequantizeFloat>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_mmla_8x3VL",
[](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_svei8mm(); },
- [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t, float>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t, float>(args, qp); }
+ [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, float>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, float>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, float, DequantizeFloat>::with_estimate(
+GemmImplementation<int8_t, float, DequantizeFloat>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_dot_8x3VL",
[](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sve(); },
- [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t, float>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t, float>(args, qp); }
+ [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, float>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, float>(args, qp); }
),
#endif // ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<int8_t, int8_t, float, DequantizeFloat>::with_estimate(
+GemmImplementation<int8_t, float, DequantizeFloat>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_s8s32_mmla_8x12",
[](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_i8mm(); },
- [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t, float>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t, float>(args, qp); }
+ [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, float>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, float>(args, qp); }
),
{
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s16_8x12",
nullptr,
[](const GemmArgs &args, const DequantizeFloat &) { return args._ci->get_cpu_model() == CPUModel::A53 && ((args._Msize > 28) || ((args._Msize % 8) > 4)); },
- [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_gemm_s16_8x12, int8_t, int8_t, float>(args, qp); }
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_gemm_s16_8x12, int8_t, float>(args, qp); }
},
-GemmImplementation<int8_t, int8_t, float, DequantizeFloat>::with_estimate(
+GemmImplementation<int8_t, float, DequantizeFloat>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_8x12",
[](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_a64_gemm_s8_8x12, int8_t, int8_t, float>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_gemm_s8_8x12, int8_t, int8_t, float>(args, qp); }
+ [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_a64_gemm_s8_8x12, int8_t, float>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_gemm_s8_8x12, int8_t, float>(args, qp); }
),
-GemmImplementation<int8_t, int8_t, float, DequantizeFloat>::with_estimate(
+GemmImplementation<int8_t, float, DequantizeFloat>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_4x4",
nullptr,
- [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_a64_gemm_s8_4x4, int8_t, int8_t, float>::estimate_cycles<int8_t>(args); },
- [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_gemm_s8_4x4, int8_t, int8_t, float>(args, qp); }
+ [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_a64_gemm_s8_4x4, int8_t, float>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_gemm_s8_4x4, int8_t, float>(args, qp); }
),
{
GemmMethod::DEFAULT,
@@ -129,13 +129,13 @@ GemmImplementation<int8_t, int8_t, float, DequantizeFloat>::with_estimate(
};
template<>
-const GemmImplementation<int8_t, int8_t, float, DequantizeFloat> *gemm_implementation_list<int8_t, int8_t, float, DequantizeFloat>() {
+const GemmImplementation<int8_t, float, DequantizeFloat> *gemm_implementation_list<int8_t, float, DequantizeFloat>() {
return gemm_s8fp32_methods;
}
-template UniqueGemmCommon<int8_t, int8_t, float> gemm<int8_t, int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
-template KernelDescription get_gemm_method<int8_t, int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
-template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
+template UniqueGemmCommon<int8_t, float> gemm<int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
+template KernelDescription get_gemm_method<int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_u8s8fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_u8s8fp32.cpp
deleted file mode 100644
index 606b422b0b..0000000000
--- a/src/core/NEON/kernels/arm_gemm/gemm_u8s8fp32.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "arm_gemm.hpp"
-
-#include "kernels/a64_hybrid_u8s8qa_dot_4x16.hpp"
-#include "kernels/a64_hybrid_u8s8qa_mmla_4x16.hpp"
-#include "kernels/a64_hybrid_u8s8s32_dot_6x16.hpp"
-#include "kernels/a64_hybrid_u8s8s32_mmla_6x16.hpp"
-#include "kernels/a64_interleaved_u8s8s32_mmla_8x12.hpp"
-
-#ifdef ARM_COMPUTE_ENABLE_SVE
-#include "kernels/sve_hybrid_u8s8qa_dot_4x4VL.hpp"
-#include "kernels/sve_interleaved_u8s8s32_mmla_8x3VL.hpp"
-#include "kernels/sve_hybrid_u8s8s32_mmla_6x4VL.hpp"
-#include "kernels/sve_hybrid_u8s8qa_mmla_4x4VL.hpp"
-#endif // ARM_COMPUTE_ENABLE_SVE
-
-#include "gemm_hybrid_indirect.hpp"
-#include "gemm_hybrid_quantized.hpp"
-#include "gemm_interleaved.hpp"
-#include "gemv_pretransposed.hpp"
-#include "quantize_wrapper.hpp"
-#include "utils.hpp"
-
-namespace arm_gemm {
-
-static const GemmImplementation<uint8_t, int8_t, float, DequantizeFloat> gemm_u8s8fp32_methods[] =
-{
-#ifdef ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<uint8_t, int8_t, float, DequantizeFloat>::with_estimate(
- GemmMethod::GEMM_INTERLEAVED,
- "sve_interleaved_u8s8s32_mmla_8x3VL",
- [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_svei8mm(); },
- [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_sve_interleaved_u8s8s32_mmla_8x3VL, uint8_t, int8_t, float>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_sve_interleaved_u8s8s32_mmla_8x3VL, uint8_t, int8_t, float>(args, qp); }
-),
-#endif // ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<uint8_t, int8_t, float, DequantizeFloat>::with_estimate(
- GemmMethod::GEMM_INTERLEAVED,
- "a64_interleaved_u8s8s32_mmla_8x12",
- [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_i8mm(); },
- [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_a64_interleaved_u8s8s32_mmla_8x12, uint8_t, int8_t, float>::estimate_cycles<uint8_t>(args); },
- [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_interleaved_u8s8s32_mmla_8x12, uint8_t, int8_t, float>(args, qp); }
-),
-{
- GemmMethod::DEFAULT,
- "",
- nullptr,
- nullptr,
- nullptr
-}
-};
-
-template<>
-const GemmImplementation<uint8_t, int8_t, float, DequantizeFloat> *gemm_implementation_list<uint8_t, int8_t, float, DequantizeFloat>() {
- return gemm_u8s8fp32_methods;
-}
-
-template UniqueGemmCommon<uint8_t, int8_t, float> gemm<uint8_t, int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
-template bool has_opt_gemm<uint8_t, int8_t, float, DequantizeFloat>(WeightFormat &weight_format, const GemmArgs &args, const DequantizeFloat &os);
-template KernelDescription get_gemm_method<uint8_t, int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
-template std::vector<KernelDescription> get_compatible_kernels<uint8_t, int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 308452e304..44f085c183 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -32,13 +32,13 @@
namespace arm_gemm {
-static const GemmImplementation<uint16_t, uint16_t, uint32_t> gemm_u16_methods[] = {
+static const GemmImplementation<uint16_t, uint32_t> gemm_u16_methods[] = {
{
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u16_8x12",
nullptr,
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint16_t, uint16_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint16_t, uint32_t>(args); }
},
{
GemmMethod::DEFAULT,
@@ -50,15 +50,15 @@ static const GemmImplementation<uint16_t, uint16_t, uint32_t> gemm_u16_methods[]
};
template<>
-const GemmImplementation<uint16_t, uint16_t, uint32_t> *gemm_implementation_list<uint16_t, uint16_t, uint32_t>() {
+const GemmImplementation<uint16_t, uint32_t> *gemm_implementation_list<uint16_t, uint32_t>() {
return gemm_u16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<uint16_t, uint16_t, uint32_t> gemm<uint16_t, uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
-template bool has_opt_gemm<uint16_t, uint16_t, uint32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
-template KernelDescription get_gemm_method<uint16_t, uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
-template std::vector<KernelDescription> get_compatible_kernels<uint16_t, uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<uint16_t, uint32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index fb442419b7..dfacb687a8 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -46,50 +46,50 @@
namespace arm_gemm {
-static const GemmImplementation<uint8_t, uint8_t, uint32_t> gemm_u8_methods[] = {
+static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
#ifdef ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8u32_mmla_6x4VL",
[](const GemmArgs &args) { return args._ci->has_svei8mm(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint32_t>(args); }
),
-GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_mmla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_svei8mm() && (args._Ksize>8); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint32_t>(args); }
),
-GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8u32_dot_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint32_t>(args); }
),
-GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_dot_8x3VL",
[](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint32_t>(args); }
),
#endif // ARM_COMPUTE_ENABLE_SVE
-GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_u8u32_mmla_8x12",
[](const GemmArgs &args) { return args._ci->has_i8mm() && (args._Ksize>8); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint32_t>(args); }
),
-GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8u32_mmla_6x16",
[](const GemmArgs &args) { return args._ci->has_i8mm(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint32_t>(args); }
),
{
GemmMethod::GEMM_HYBRID,
@@ -110,28 +110,28 @@ GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
"a64_gemm_u16_8x12",
nullptr,
[](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint8_t, uint8_t, uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint8_t, uint32_t>(args); },
},
-GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8u32_dot_6x16",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint32_t>(args); }
),
-GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_8x12",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint32_t>(args); }
),
-GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_4x4",
nullptr,
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint32_t>(args); }
),
{
GemmMethod::DEFAULT,
@@ -143,15 +143,15 @@ GemmImplementation<uint8_t, uint8_t, uint32_t>::with_estimate(
};
template<>
-const GemmImplementation<uint8_t, uint8_t, uint32_t> *gemm_implementation_list<uint8_t, uint8_t, uint32_t>() {
+const GemmImplementation<uint8_t, uint32_t> *gemm_implementation_list<uint8_t, uint32_t>() {
return gemm_u8_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<uint8_t, uint8_t, uint32_t> gemm<uint8_t, uint8_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
-template bool has_opt_gemm<uint8_t, uint8_t, uint32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
-template KernelDescription get_gemm_method<uint8_t, uint8_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
-template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, uint32_t, Nothing> (const GemmArgs &args, const Nothing &);
+template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<uint8_t, uint32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<uint8_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint32_t, Nothing> (const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index aa03fb6aa1..ad504f2664 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm {
* efficiently as a GEMM (with M'=nbatches and nbatches'=1). This wrapper
* implements this. */
template<typename To, typename Tr>
-class GemvBatched : public GemmCommon<To, To, Tr> {
+class GemvBatched : public GemmCommon<To, Tr> {
private:
- UniqueGemmCommon<To, To, Tr> _subgemm = nullptr;
+ UniqueGemmCommon<To, Tr> _subgemm = nullptr;
public:
GemvBatched(const GemmArgs &args) {
@@ -42,7 +42,7 @@ public:
newargs._Msize = args._nbatches;
newargs._nbatches = 1;
newargs._cfg = nullptr;
- _subgemm = gemm<To,To,Tr>(newargs);
+ _subgemm = gemm<To,Tr>(newargs);
}
void set_arrays(const To *A, const int, const int A_batch_stride, const int A_multi_stride,
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 1a7c51c7a4..dbada36052 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -85,7 +85,7 @@ void run_gemv_kernel<Requantize32>::run(
//
// batches are not supported as a batched GEMV makes no sense (can be converted to a GEMM).
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
-class GemvPretransposed : public GemmCommon<To, To, Tr> {
+class GemvPretransposed : public GemmCommon<To, Tr> {
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp
index 3ad32d8a50..ffd9d4b22a 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp
@@ -75,61 +75,61 @@ void interleave_block<6, 1, VLType::None, false>(
for (;width>7;width-=8) {
__asm __volatile (
// Load up 8 elements (2 vectors) from each of 8 sources.
- "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3
- "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3
- "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3
- "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3
- "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
- "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3
- "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
- "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
- "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
+ "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3
+ "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3
+ "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3
+ "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3
+ "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
+ "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3
+ "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
+ "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
+ "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
ASM_PREFETCH("[%[inptr0], #128]")
- "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
+ "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
// Store first elements
- "VST1.32 {d0-d1}, [%[outptr]]!\n"
- "VST1.32 {d16}, [%[outptr]]!\n"
+ "VST1.32 {d0-d1}, [%[outptr]]!\n"
+ "VST1.32 {d16}, [%[outptr]]!\n"
- "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
+ "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
// Store second elements
- "VST1.32 {d4-d5}, [%[outptr]]!\n"
- "VZIP.32 q1, q5\n"
+ "VST1.32 {d4-d5}, [%[outptr]]!\n"
+ "VZIP.32 q1, q5\n"
ASM_PREFETCH("[%[inptr1], #128]")
- "VST1.32 {d17}, [%[outptr]]!\n"
- "VZIP.32 q3, q7\n"
+ "VST1.32 {d17}, [%[outptr]]!\n"
+ "VZIP.32 q3, q7\n"
// Store third elements
- "VZIP.32 q9, q11\n"
- "VST1.32 {d8-d9}, [%[outptr]]!\n"
- "VZIP.32 q1, q3\n"
+ "VZIP.32 q9, q11\n"
+ "VST1.32 {d8-d9}, [%[outptr]]!\n"
+ "VZIP.32 q1, q3\n"
ASM_PREFETCH("[%[inptr2], #128]")
- "VST1.32 {d20}, [%[outptr]]!\n"
+ "VST1.32 {d20}, [%[outptr]]!\n"
// Store fourth elements
- "VZIP.32 q5, q7\n"
- "VST1.32 {d12-d13}, [%[outptr]]!\n"
+ "VZIP.32 q5, q7\n"
+ "VST1.32 {d12-d13}, [%[outptr]]!\n"
ASM_PREFETCH("[%[inptr3], #128]")
- "VST1.32 {d21}, [%[outptr]]!\n"
+ "VST1.32 {d21}, [%[outptr]]!\n"
// Fifth
- "VST1.32 {d2-d3}, [%[outptr]]!\n"
+ "VST1.32 {d2-d3}, [%[outptr]]!\n"
ASM_PREFETCH("[%[inptr4], #128]")
- "VST1.32 {d18}, [%[outptr]]!\n"
+ "VST1.32 {d18}, [%[outptr]]!\n"
// Sixth
- "VST1.32 {d6-d7}, [%[outptr]]!\n"
+ "VST1.32 {d6-d7}, [%[outptr]]!\n"
ASM_PREFETCH("[%[inptr5], #128]")
- "VST1.32 {d19}, [%[outptr]]!\n"
+ "VST1.32 {d19}, [%[outptr]]!\n"
// Seventh
- "VST1.32 {d10-d11}, [%[outptr]]!\n"
- "VST1.32 {d22}, [%[outptr]]!\n"
+ "VST1.32 {d10-d11}, [%[outptr]]!\n"
+ "VST1.32 {d22}, [%[outptr]]!\n"
// Eighth
- "VST1.32 {d14-d15}, [%[outptr]]!\n"
- "VST1.32 {d23}, [%[outptr]]!\n"
+ "VST1.32 {d14-d15}, [%[outptr]]!\n"
+ "VST1.32 {d23}, [%[outptr]]!\n"
: [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
[inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
index 4d065300ae..d5a41a332d 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
@@ -34,9 +34,9 @@ void interleave_block<4, 16, VLType::None, false>(
"ldr x23, [%x[in], #0x0]\n"
"ldr x22, [%x[in], #0x8]\n"
"cmp %x[height], #0x4\n"
+ "add x23, x23, %x[row_offset]\n"
"ldr x21, [%x[in], #0x10]\n"
"ldr x20, [%x[in], #0x18]\n"
- "add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
"add x20, x20, %x[row_offset]\n"
@@ -60,12 +60,12 @@ void interleave_block<4, 16, VLType::None, false>(
"ldr q19, [x23], #0x10\n"
"ldr q18, [x22], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
+ "cmp %x[width], #0x10\n"
"ldr q17, [x21], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "cmp %x[width], #0x10\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x23, #0x70]\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "str q19, [%x[out_ptr], #0x0]\n"
"str q18, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x21, #0x70]\n"
"prfm pldl1keep, [x20, #0x70]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
index 1cd6523c76..35de179ed4 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset], LSL #1\n"
"add x27, x27, %x[row_offset], LSL #1\n"
"add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
"add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,52 +79,52 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q23, [x28], #0x10\n"
- "ldr q25, [x27], #0x10\n"
+ "ldr q25, [x28], #0x10\n"
+ "ldr q27, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
- "ldr q22, [x26], #0x10\n"
- "ldr q21, [x25], #0x10\n"
"cmp %x[width], #0x8\n"
- "ldr q19, [x24], #0x10\n"
- "ldr q18, [x23], #0x10\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
+ "ldr q21, [x24], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip1 v23.8h, v25.8h, v21.8h\n"
+ "zip1 v22.8h, v27.8h, v20.8h\n"
"ldr q17, [x22], #0x10\n"
"ldr q16, [x21], #0x10\n"
+ "zip1 v19.8h, v26.8h, v17.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
+ "zip2 v25.8h, v25.8h, v21.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip1 v20.8h, v23.8h, v19.8h\n"
- "zip1 v24.8h, v25.8h, v18.8h\n"
- "zip2 v23.8h, v23.8h, v19.8h\n"
- "zip2 v25.8h, v25.8h, v18.8h\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v19.8h, v22.8h, v17.8h\n"
- "zip1 v18.8h, v21.8h, v16.8h\n"
+ "zip1 v24.8h, v23.8h, v19.8h\n"
+ "zip1 v17.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip2 v22.8h, v22.8h, v17.8h\n"
- "zip2 v17.8h, v21.8h, v16.8h\n"
+ "zip2 v23.8h, v23.8h, v19.8h\n"
+ "zip2 v19.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip1 v16.8h, v24.8h, v18.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip2 v19.8h, v24.8h, v18.8h\n"
- "zip1 v24.8h, v23.8h, v22.8h\n"
- "zip1 v18.8h, v25.8h, v17.8h\n"
- "zip2 v23.8h, v23.8h, v22.8h\n"
- "zip2 v22.8h, v25.8h, v17.8h\n"
- "zip1 v17.8h, v21.8h, v16.8h\n"
- "zip2 v16.8h, v21.8h, v16.8h\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip1 v19.8h, v24.8h, v18.8h\n"
- "zip2 v18.8h, v24.8h, v18.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v22.8h, v25.8h, v21.8h\n"
+ "zip1 v18.8h, v20.8h, v16.8h\n"
+ "zip2 v21.8h, v25.8h, v21.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "zip1 v16.8h, v24.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v24.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip1 v17.8h, v23.8h, v22.8h\n"
- "zip2 v16.8h, v23.8h, v22.8h\n"
- "str q21, [%x[out_ptr], #0x20]\n"
- "str q20, [%x[out_ptr], #0x30]\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip1 v19.8h, v22.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v18.8h\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v16.8h, v21.8h, v20.8h\n"
"str q19, [%x[out_ptr], #0x40]\n"
"str q18, [%x[out_ptr], #0x50]\n"
"str q17, [%x[out_ptr], #0x60]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
index 81c758f498..59981e9979 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset], LSL #1\n"
"add x27, x27, %x[row_offset], LSL #1\n"
"add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
"add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,53 +79,53 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d25, [x28], #0x8\n"
- "ldr d24, [x27], #0x8\n"
+ "ldr d27, [x28], #0x8\n"
+ "ldr d26, [x27], #0x8\n"
+ "fcvtl v27.4s, v27.4h\n"
+ "fcvtl v26.4s, v26.4h\n"
+ "ldr d22, [x26], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "fcvtl v22.4s, v22.4h\n"
+ "fcvtl v21.4s, v21.4h\n"
+ "ldr d20, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "fcvtl v20.4s, v20.4h\n"
+ "fcvtl v25.4s, v25.4h\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "fcvtl v19.4s, v19.4h\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "zip1 v24.4s, v27.4s, v22.4s\n"
+ "zip1 v23.4s, v26.4s, v21.4s\n"
"subs %x[width], %x[width], #0x4\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
"cmp %x[width], #0x4\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d16, [x21], #0x8\n"
- "fcvtl v25.4s, v25.4h\n"
- "fcvtl v24.4s, v24.4h\n"
- "fcvtl v18.4s, v18.4h\n"
- "fcvtl v17.4s, v17.4h\n"
+ "zip1 v18.4s, v20.4s, v19.4s\n"
+ "zip1 v17.4s, v25.4s, v16.4s\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "fcvtl v23.4s, v23.4h\n"
- "fcvtl v22.4s, v22.4h\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "zip2 v21.4s, v26.4s, v21.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "fcvtl v21.4s, v21.4h\n"
- "fcvtl v16.4s, v16.4h\n"
+ "zip2 v20.4s, v20.4s, v19.4s\n"
+ "zip2 v19.4s, v25.4s, v16.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v20.4s, v25.4s, v18.4s\n"
- "zip1 v19.4s, v24.4s, v17.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v26.4s, v25.4s, v18.4s\n"
- "zip2 v25.4s, v24.4s, v17.4s\n"
- "zip1 v18.4s, v23.4s, v21.4s\n"
- "zip1 v17.4s, v22.4s, v16.4s\n"
- "zip2 v24.4s, v23.4s, v21.4s\n"
- "zip2 v23.4s, v22.4s, v16.4s\n"
- "zip1 v16.4s, v20.4s, v19.4s\n"
- "zip2 v22.4s, v20.4s, v19.4s\n"
- "zip1 v21.4s, v18.4s, v17.4s\n"
- "zip2 v20.4s, v18.4s, v17.4s\n"
- "zip1 v19.4s, v26.4s, v25.4s\n"
- "zip1 v18.4s, v24.4s, v23.4s\n"
+ "zip1 v16.4s, v24.4s, v23.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v17.4s, v26.4s, v25.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"zip2 v16.4s, v24.4s, v23.4s\n"
- "str q21, [%x[out_ptr], #0x10]\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "str q19, [%x[out_ptr], #0x40]\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ "zip1 v16.4s, v22.4s, v21.4s\n"
+ "str q17, [%x[out_ptr], #0x30]\n"
+ "zip1 v18.4s, v20.4s, v19.4s\n"
+ "zip2 v17.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
"str q18, [%x[out_ptr], #0x50]\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
@@ -134,76 +134,76 @@ void interleave_block<8, 1, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr s29, [x28], #0x4\n"
- "ldr s28, [x27], #0x4\n"
+ "ldr s28, [x28], #0x4\n"
+ "ldr s27, [x27], #0x4\n"
"mov x20, #0x2\n"
- "ldr s27, [x26], #0x4\n"
- "ldr s26, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s26, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s24, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v29.h }[2], [x28]\n"
- "ld1 { v28.h }[2], [x27]\n"
+ "ld1 { v28.h }[2], [x28]\n"
+ "ld1 { v27.h }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v27.h }[2], [x26]\n"
- "ld1 { v26.h }[2], [x25]\n"
- "ld1 { v25.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "ld1 { v23.h }[2], [x22]\n"
- "ld1 { v22.h }[2], [x21]\n"
+ "ld1 { v26.h }[2], [x26]\n"
+ "ld1 { v25.h }[2], [x25]\n"
+ "ld1 { v24.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v21.h }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr h29, [x28, #0x0]\n"
- "ldr h28, [x27, #0x0]\n"
+ "ldr h28, [x28, #0x0]\n"
+ "ldr h27, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h27, [x26, #0x0]\n"
- "ldr h26, [x25, #0x0]\n"
- "ldr h25, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "ldr h23, [x22, #0x0]\n"
- "ldr h22, [x21, #0x0]\n"
+ "ldr h26, [x26, #0x0]\n"
+ "ldr h25, [x25, #0x0]\n"
+ "ldr h24, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "ldr h21, [x21, #0x0]\n"
"5:" // Odd load end
- "fcvtl v29.4s, v29.4h\n"
"fcvtl v28.4s, v28.4h\n"
- "subs x20, x20, #0x1\n"
"fcvtl v27.4s, v27.4h\n"
+ "subs x20, x20, #0x1\n"
"fcvtl v26.4s, v26.4h\n"
"fcvtl v25.4s, v25.4h\n"
"fcvtl v24.4s, v24.4h\n"
"fcvtl v23.4s, v23.4h\n"
"fcvtl v22.4s, v22.4h\n"
- "zip1 v21.4s, v29.4s, v27.4s\n"
+ "fcvtl v21.4s, v21.4h\n"
"zip1 v20.4s, v28.4s, v26.4s\n"
- "zip1 v19.4s, v25.4s, v23.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
"zip1 v18.4s, v24.4s, v22.4s\n"
- "zip1 v17.4s, v21.4s, v20.4s\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
"subs x20, x20, #0x1\n"
- "zip2 v17.4s, v21.4s, v20.4s\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v19.4s, v29.4s, v27.4s\n"
- "zip2 v17.4s, v28.4s, v26.4s\n"
- "zip2 v18.4s, v25.4s, v23.4s\n"
- "zip2 v16.4s, v24.4s, v22.4s\n"
- "zip1 v17.4s, v19.4s, v17.4s\n"
- "zip1 v16.4s, v18.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.4s, v28.4s, v26.4s\n"
+ "zip2 v16.4s, v27.4s, v25.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
index 521b3bf8f5..9eeabfa9eb 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset], LSL #2\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset], LSL #2\n"
"add x27, x27, %x[row_offset], LSL #2\n"
"add x26, x26, %x[row_offset], LSL #2\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #2\n"
"add x24, x24, %x[row_offset], LSL #2\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #2\n"
"add x22, x22, %x[row_offset], LSL #2\n"
"add x21, x21, %x[row_offset], LSL #2\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,46 +79,46 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q25, [x28], #0x10\n"
- "ldr q24, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q18, [x27], #0x10\n"
"subs %x[width], %x[width], #0x4\n"
- "ldr q18, [x26], #0x10\n"
- "ldr q17, [x25], #0x10\n"
"cmp %x[width], #0x4\n"
- "ldr q23, [x24], #0x10\n"
- "ldr q22, [x23], #0x10\n"
- "ldr q21, [x22], #0x10\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.4s, v20.4s, v17.4s\n"
+ "zip1 v24.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip2 v22.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x22], #0x10\n"
"ldr q16, [x21], #0x10\n"
- "zip1 v20.4s, v25.4s, v18.4s\n"
- "zip1 v19.4s, v24.4s, v17.4s\n"
- "zip2 v26.4s, v25.4s, v18.4s\n"
- "zip2 v25.4s, v24.4s, v17.4s\n"
+ "zip1 v20.4s, v19.4s, v18.4s\n"
+ "zip1 v17.4s, v23.4s, v16.4s\n"
+ "zip2 v19.4s, v19.4s, v18.4s\n"
+ "zip2 v18.4s, v23.4s, v16.4s\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v18.4s, v23.4s, v21.4s\n"
- "zip1 v17.4s, v22.4s, v16.4s\n"
- "zip2 v24.4s, v23.4s, v21.4s\n"
- "zip2 v23.4s, v22.4s, v16.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip1 v16.4s, v20.4s, v19.4s\n"
- "zip2 v22.4s, v20.4s, v19.4s\n"
- "zip1 v21.4s, v18.4s, v17.4s\n"
- "zip2 v20.4s, v18.4s, v17.4s\n"
- "zip1 v19.4s, v26.4s, v25.4s\n"
- "zip1 v18.4s, v24.4s, v23.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v17.4s, v26.4s, v25.4s\n"
- "zip2 v16.4s, v24.4s, v23.4s\n"
- "str q21, [%x[out_ptr], #0x10]\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "str q19, [%x[out_ptr], #0x40]\n"
- "str q18, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v16.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
@@ -126,68 +126,68 @@ void interleave_block<8, 1, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr d29, [x28], #0x8\n"
- "ldr d28, [x27], #0x8\n"
+ "ldr d28, [x28], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
"mov x20, #0x2\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v29.s }[2], [x28]\n"
- "ld1 { v28.s }[2], [x27]\n"
+ "ld1 { v28.s }[2], [x28]\n"
+ "ld1 { v27.s }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v27.s }[2], [x26]\n"
- "ld1 { v26.s }[2], [x25]\n"
- "ld1 { v25.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v25.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr s29, [x28, #0x0]\n"
- "ldr s28, [x27, #0x0]\n"
+ "ldr s28, [x28, #0x0]\n"
+ "ldr s27, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr s27, [x26, #0x0]\n"
- "ldr s26, [x25, #0x0]\n"
- "ldr s25, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s25, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
"5:" // Odd load end
- "zip1 v21.4s, v29.4s, v27.4s\n"
"zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
"subs x20, x20, #0x1\n"
- "zip1 v19.4s, v25.4s, v23.4s\n"
"zip1 v18.4s, v24.4s, v22.4s\n"
- "zip1 v17.4s, v21.4s, v20.4s\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
"subs x20, x20, #0x1\n"
- "zip2 v17.4s, v21.4s, v20.4s\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v19.4s, v29.4s, v27.4s\n"
- "zip2 v17.4s, v28.4s, v26.4s\n"
- "zip2 v18.4s, v25.4s, v23.4s\n"
- "zip2 v16.4s, v24.4s, v22.4s\n"
- "zip1 v17.4s, v19.4s, v17.4s\n"
- "zip1 v16.4s, v18.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.4s, v28.4s, v26.4s\n"
+ "zip2 v16.4s, v27.4s, v25.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
index 8f67a21d05..27b3335694 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset], LSL #1\n"
"add x27, x27, %x[row_offset], LSL #1\n"
"add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
"add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,52 +79,52 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q23, [x28], #0x10\n"
- "ldr q25, [x27], #0x10\n"
+ "ldr q25, [x28], #0x10\n"
+ "ldr q27, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
- "ldr q22, [x26], #0x10\n"
- "ldr q21, [x25], #0x10\n"
"cmp %x[width], #0x8\n"
- "ldr q19, [x24], #0x10\n"
- "ldr q18, [x23], #0x10\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
+ "ldr q21, [x24], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip1 v23.8h, v25.8h, v21.8h\n"
+ "zip1 v22.8h, v27.8h, v20.8h\n"
"ldr q17, [x22], #0x10\n"
"ldr q16, [x21], #0x10\n"
+ "zip1 v19.8h, v26.8h, v17.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
+ "zip2 v25.8h, v25.8h, v21.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip1 v20.8h, v23.8h, v19.8h\n"
- "zip1 v24.8h, v25.8h, v18.8h\n"
- "zip2 v23.8h, v23.8h, v19.8h\n"
- "zip2 v25.8h, v25.8h, v18.8h\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v19.8h, v22.8h, v17.8h\n"
- "zip1 v18.8h, v21.8h, v16.8h\n"
+ "zip1 v24.8h, v23.8h, v19.8h\n"
+ "zip1 v17.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip2 v22.8h, v22.8h, v17.8h\n"
- "zip2 v17.8h, v21.8h, v16.8h\n"
+ "zip2 v23.8h, v23.8h, v19.8h\n"
+ "zip2 v19.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip1 v16.8h, v24.8h, v18.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip2 v19.8h, v24.8h, v18.8h\n"
- "zip1 v24.8h, v23.8h, v22.8h\n"
- "zip1 v18.8h, v25.8h, v17.8h\n"
- "zip2 v23.8h, v23.8h, v22.8h\n"
- "zip2 v22.8h, v25.8h, v17.8h\n"
- "zip1 v17.8h, v21.8h, v16.8h\n"
- "zip2 v16.8h, v21.8h, v16.8h\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip1 v19.8h, v24.8h, v18.8h\n"
- "zip2 v18.8h, v24.8h, v18.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v22.8h, v25.8h, v21.8h\n"
+ "zip1 v18.8h, v20.8h, v16.8h\n"
+ "zip2 v21.8h, v25.8h, v21.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "zip1 v16.8h, v24.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v24.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip1 v17.8h, v23.8h, v22.8h\n"
- "zip2 v16.8h, v23.8h, v22.8h\n"
- "str q21, [%x[out_ptr], #0x20]\n"
- "str q20, [%x[out_ptr], #0x30]\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip1 v19.8h, v22.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v18.8h\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v16.8h, v21.8h, v20.8h\n"
"str q19, [%x[out_ptr], #0x40]\n"
"str q18, [%x[out_ptr], #0x50]\n"
"str q17, [%x[out_ptr], #0x60]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
index 0a76fa812e..4c90691edc 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset]\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset]\n"
"add x27, x27, %x[row_offset]\n"
"add x26, x26, %x[row_offset]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
"add x24, x24, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,60 +79,60 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d23, [x28], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "subs %x[width], %x[width], #0x8\n"
- "ldr d22, [x26], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "cmp %x[width], #0x8\n"
- "ldr d19, [x24], #0x8\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d16, [x21], #0x8\n"
- "sshll v23.8h, v23.8b, #0x0\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
"sshll v25.8h, v25.8b, #0x0\n"
- "sshll v22.8h, v22.8b, #0x0\n"
+ "sshll v27.8h, v27.8b, #0x0\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "sshll v26.8h, v26.8b, #0x0\n"
+ "sshll v24.8h, v24.8b, #0x0\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"sshll v21.8h, v21.8b, #0x0\n"
+ "sshll v20.8h, v20.8b, #0x0\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "sshll v17.8h, v17.8b, #0x0\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "zip1 v23.8h, v25.8h, v21.8h\n"
+ "zip1 v22.8h, v26.8h, v17.8h\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "cmp %x[width], #0x8\n"
+ "zip1 v19.8h, v27.8h, v20.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "sshll v19.8h, v19.8b, #0x0\n"
- "sshll v18.8h, v18.8b, #0x0\n"
+ "zip2 v25.8h, v25.8h, v21.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "sshll v17.8h, v17.8b, #0x0\n"
- "sshll v16.8h, v16.8b, #0x0\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v24.8h, v23.8h, v22.8h\n"
+ "zip1 v17.8h, v19.8h, v18.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip1 v20.8h, v23.8h, v19.8h\n"
- "zip1 v24.8h, v25.8h, v18.8h\n"
- "zip2 v23.8h, v23.8h, v19.8h\n"
- "zip2 v25.8h, v25.8h, v18.8h\n"
- "zip1 v19.8h, v22.8h, v17.8h\n"
- "zip1 v18.8h, v21.8h, v16.8h\n"
- "zip2 v22.8h, v22.8h, v17.8h\n"
- "zip2 v17.8h, v21.8h, v16.8h\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip1 v16.8h, v24.8h, v18.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip2 v19.8h, v24.8h, v18.8h\n"
- "zip1 v24.8h, v23.8h, v22.8h\n"
- "zip1 v18.8h, v25.8h, v17.8h\n"
"zip2 v23.8h, v23.8h, v22.8h\n"
- "zip2 v22.8h, v25.8h, v17.8h\n"
- "zip1 v17.8h, v21.8h, v16.8h\n"
- "zip2 v16.8h, v21.8h, v16.8h\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip1 v19.8h, v24.8h, v18.8h\n"
- "zip2 v18.8h, v24.8h, v18.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.8h, v19.8h, v18.8h\n"
+ "zip1 v22.8h, v25.8h, v21.8h\n"
+ "zip1 v18.8h, v20.8h, v16.8h\n"
+ "zip2 v21.8h, v25.8h, v21.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "zip1 v16.8h, v24.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v24.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip1 v17.8h, v23.8h, v22.8h\n"
- "zip2 v16.8h, v23.8h, v22.8h\n"
- "str q21, [%x[out_ptr], #0x20]\n"
- "str q20, [%x[out_ptr], #0x30]\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip1 v19.8h, v22.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v18.8h\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v16.8h, v21.8h, v20.8h\n"
"str q19, [%x[out_ptr], #0x40]\n"
"str q18, [%x[out_ptr], #0x50]\n"
"str q17, [%x[out_ptr], #0x60]\n"
@@ -227,11 +227,11 @@ void interleave_block<8, 1, VLType::None, false>(
"sshll v24.8h, v24.8b, #0x0\n"
"sshll v23.8h, v23.8b, #0x0\n"
"zip1 v22.8h, v30.8h, v26.8h\n"
- "zip1 v21.8h, v29.8h, v25.8h\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
"zip1 v19.8h, v27.8h, v23.8h\n"
- "zip1 v18.8h, v22.8h, v20.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
@@ -241,8 +241,8 @@ void interleave_block<8, 1, VLType::None, false>(
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v22.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
"subs x20, x20, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
index be6e8980f6..8901908140 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset]\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset]\n"
"add x27, x27, %x[row_offset]\n"
"add x26, x26, %x[row_offset]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
"add x24, x24, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,60 +79,60 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d23, [x28], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "subs %x[width], %x[width], #0x8\n"
- "ldr d22, [x26], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "cmp %x[width], #0x8\n"
- "ldr d19, [x24], #0x8\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d16, [x21], #0x8\n"
- "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "ushll v22.8h, v22.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"ushll v21.8h, v21.8b, #0x0\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "zip1 v23.8h, v25.8h, v21.8h\n"
+ "zip1 v22.8h, v26.8h, v17.8h\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "cmp %x[width], #0x8\n"
+ "zip1 v19.8h, v27.8h, v20.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "ushll v18.8h, v18.8b, #0x0\n"
+ "zip2 v25.8h, v25.8h, v21.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ushll v17.8h, v17.8b, #0x0\n"
- "ushll v16.8h, v16.8b, #0x0\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v24.8h, v23.8h, v22.8h\n"
+ "zip1 v17.8h, v19.8h, v18.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip1 v20.8h, v23.8h, v19.8h\n"
- "zip1 v24.8h, v25.8h, v18.8h\n"
- "zip2 v23.8h, v23.8h, v19.8h\n"
- "zip2 v25.8h, v25.8h, v18.8h\n"
- "zip1 v19.8h, v22.8h, v17.8h\n"
- "zip1 v18.8h, v21.8h, v16.8h\n"
- "zip2 v22.8h, v22.8h, v17.8h\n"
- "zip2 v17.8h, v21.8h, v16.8h\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip1 v16.8h, v24.8h, v18.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip2 v19.8h, v24.8h, v18.8h\n"
- "zip1 v24.8h, v23.8h, v22.8h\n"
- "zip1 v18.8h, v25.8h, v17.8h\n"
"zip2 v23.8h, v23.8h, v22.8h\n"
- "zip2 v22.8h, v25.8h, v17.8h\n"
- "zip1 v17.8h, v21.8h, v16.8h\n"
- "zip2 v16.8h, v21.8h, v16.8h\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip1 v19.8h, v24.8h, v18.8h\n"
- "zip2 v18.8h, v24.8h, v18.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.8h, v19.8h, v18.8h\n"
+ "zip1 v22.8h, v25.8h, v21.8h\n"
+ "zip1 v18.8h, v20.8h, v16.8h\n"
+ "zip2 v21.8h, v25.8h, v21.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "zip1 v16.8h, v24.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v24.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip1 v17.8h, v23.8h, v22.8h\n"
- "zip2 v16.8h, v23.8h, v22.8h\n"
- "str q21, [%x[out_ptr], #0x20]\n"
- "str q20, [%x[out_ptr], #0x30]\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip1 v19.8h, v22.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v18.8h\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v16.8h, v21.8h, v20.8h\n"
"str q19, [%x[out_ptr], #0x40]\n"
"str q18, [%x[out_ptr], #0x50]\n"
"str q17, [%x[out_ptr], #0x60]\n"
@@ -227,11 +227,11 @@ void interleave_block<8, 1, VLType::None, false>(
"ushll v24.8h, v24.8b, #0x0\n"
"ushll v23.8h, v23.8b, #0x0\n"
"zip1 v22.8h, v30.8h, v26.8h\n"
- "zip1 v21.8h, v29.8h, v25.8h\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
"zip1 v19.8h, v27.8h, v23.8h\n"
- "zip1 v18.8h, v22.8h, v20.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
@@ -241,8 +241,8 @@ void interleave_block<8, 1, VLType::None, false>(
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v22.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
"subs x20, x20, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
index f034b2b45c..db610427a8 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 2, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset], LSL #1\n"
"add x27, x27, %x[row_offset], LSL #1\n"
"add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
"add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,46 +79,46 @@ void interleave_block<8, 2, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q25, [x28], #0x10\n"
- "ldr q24, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q18, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
- "ldr q18, [x26], #0x10\n"
- "ldr q17, [x25], #0x10\n"
"cmp %x[width], #0x8\n"
- "ldr q23, [x24], #0x10\n"
- "ldr q22, [x23], #0x10\n"
- "ldr q21, [x22], #0x10\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.4s, v20.4s, v17.4s\n"
+ "zip1 v24.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip2 v22.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x22], #0x10\n"
"ldr q16, [x21], #0x10\n"
- "zip1 v20.4s, v25.4s, v18.4s\n"
- "zip1 v19.4s, v24.4s, v17.4s\n"
- "zip2 v26.4s, v25.4s, v18.4s\n"
- "zip2 v25.4s, v24.4s, v17.4s\n"
+ "zip1 v20.4s, v19.4s, v18.4s\n"
+ "zip1 v17.4s, v23.4s, v16.4s\n"
+ "zip2 v19.4s, v19.4s, v18.4s\n"
+ "zip2 v18.4s, v23.4s, v16.4s\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v18.4s, v23.4s, v21.4s\n"
- "zip1 v17.4s, v22.4s, v16.4s\n"
- "zip2 v24.4s, v23.4s, v21.4s\n"
- "zip2 v23.4s, v22.4s, v16.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip1 v16.4s, v20.4s, v19.4s\n"
- "zip2 v22.4s, v20.4s, v19.4s\n"
- "zip1 v21.4s, v18.4s, v17.4s\n"
- "zip2 v20.4s, v18.4s, v17.4s\n"
- "zip1 v19.4s, v26.4s, v25.4s\n"
- "zip1 v18.4s, v24.4s, v23.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v17.4s, v26.4s, v25.4s\n"
- "zip2 v16.4s, v24.4s, v23.4s\n"
- "str q21, [%x[out_ptr], #0x10]\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "str q19, [%x[out_ptr], #0x40]\n"
- "str q18, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v16.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
@@ -126,119 +126,119 @@ void interleave_block<8, 2, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr d29, [x28], #0x8\n"
- "ldr d28, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d28, [x28], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v29.s }[2], [x28], #0x4\n"
- "ld1 { v28.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x28], #0x4\n"
+ "ld1 { v27.s }[2], [x27], #0x4\n"
"mov x20, #0x3\n"
- "ld1 { v27.s }[2], [x26], #0x4\n"
- "ld1 { v26.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
- "ld1 { v23.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v24.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v21.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v29.h }[6], [x28]\n"
- "ld1 { v28.h }[6], [x27]\n"
+ "ld1 { v28.h }[6], [x28]\n"
+ "ld1 { v27.h }[6], [x27]\n"
"mov x20, #0x4\n"
- "ld1 { v27.h }[6], [x26]\n"
- "ld1 { v26.h }[6], [x25]\n"
- "ld1 { v25.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
- "ld1 { v23.h }[6], [x22]\n"
- "ld1 { v22.h }[6], [x21]\n"
+ "ld1 { v26.h }[6], [x26]\n"
+ "ld1 { v25.h }[6], [x25]\n"
+ "ld1 { v24.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v22.h }[6], [x22]\n"
+ "ld1 { v21.h }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x20, #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v29.h }[4], [x28]\n"
- "ld1 { v28.h }[4], [x27]\n"
+ "ld1 { v28.h }[4], [x28]\n"
+ "ld1 { v27.h }[4], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v27.h }[4], [x26]\n"
- "ld1 { v26.h }[4], [x25]\n"
- "ld1 { v25.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
- "ld1 { v23.h }[4], [x22]\n"
- "ld1 { v22.h }[4], [x21]\n"
+ "ld1 { v26.h }[4], [x26]\n"
+ "ld1 { v25.h }[4], [x25]\n"
+ "ld1 { v24.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v22.h }[4], [x22]\n"
+ "ld1 { v21.h }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr s29, [x28], #0x4\n"
- "ldr s28, [x27], #0x4\n"
+ "ldr s28, [x28], #0x4\n"
+ "ldr s27, [x27], #0x4\n"
"mov x20, #0x1\n"
- "ldr s27, [x26], #0x4\n"
- "ldr s26, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s26, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s24, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v29.h }[2], [x28]\n"
- "ld1 { v28.h }[2], [x27]\n"
+ "ld1 { v28.h }[2], [x28]\n"
+ "ld1 { v27.h }[2], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v27.h }[2], [x26]\n"
- "ld1 { v26.h }[2], [x25]\n"
- "ld1 { v25.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "ld1 { v23.h }[2], [x22]\n"
- "ld1 { v22.h }[2], [x21]\n"
+ "ld1 { v26.h }[2], [x26]\n"
+ "ld1 { v25.h }[2], [x25]\n"
+ "ld1 { v24.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v21.h }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr h29, [x28, #0x0]\n"
- "ldr h28, [x27, #0x0]\n"
+ "ldr h28, [x28, #0x0]\n"
+ "ldr h27, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h27, [x26, #0x0]\n"
- "ldr h26, [x25, #0x0]\n"
- "ldr h25, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "ldr h23, [x22, #0x0]\n"
- "ldr h22, [x21, #0x0]\n"
+ "ldr h26, [x26, #0x0]\n"
+ "ldr h25, [x25, #0x0]\n"
+ "ldr h24, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "ldr h21, [x21, #0x0]\n"
"7:" // Odd load end
- "zip1 v21.4s, v29.4s, v27.4s\n"
"zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
"subs x20, x20, #0x1\n"
- "zip1 v19.4s, v25.4s, v23.4s\n"
"zip1 v18.4s, v24.4s, v22.4s\n"
- "zip1 v17.4s, v21.4s, v20.4s\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v17.4s, v21.4s, v20.4s\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
- "zip2 v21.4s, v29.4s, v27.4s\n"
"zip2 v20.4s, v28.4s, v26.4s\n"
+ "zip2 v19.4s, v27.4s, v25.4s\n"
"subs x20, x20, #0x1\n"
- "zip2 v19.4s, v25.4s, v23.4s\n"
"zip2 v18.4s, v24.4s, v22.4s\n"
- "zip1 v17.4s, v21.4s, v20.4s\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
- "zip2 v17.4s, v21.4s, v20.4s\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"8:" // Odds skip
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
index a57810ce20..b3a52451e8 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 2, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset], LSL #2\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset], LSL #2\n"
"add x27, x27, %x[row_offset], LSL #2\n"
"add x26, x26, %x[row_offset], LSL #2\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #2\n"
"add x24, x24, %x[row_offset], LSL #2\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #2\n"
"add x22, x22, %x[row_offset], LSL #2\n"
"add x21, x21, %x[row_offset], LSL #2\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,100 +79,100 @@ void interleave_block<8, 2, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q21, [x28], #0x10\n"
- "ldr q16, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
"subs %x[width], %x[width], #0x4\n"
- "ldr q20, [x26], #0x10\n"
- "ldr q19, [x25], #0x10\n"
"cmp %x[width], #0x4\n"
- "ldr q24, [x24], #0x10\n"
- "ldr q18, [x23], #0x10\n"
- "ldr q23, [x22], #0x10\n"
- "ldr q22, [x21], #0x10\n"
- "zip1 v17.2d, v21.2d, v16.2d\n"
- "zip2 v21.2d, v21.2d, v16.2d\n"
+ "ldr q25, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
+ "zip1 v18.2d, v25.2d, v24.2d\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip1 v17.2d, v23.2d, v22.2d\n"
+ "zip2 v21.2d, v20.2d, v19.2d\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q19, [x21], #0x10\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip1 v16.2d, v20.2d, v19.2d\n"
- "zip2 v20.2d, v20.2d, v19.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v18.2d, v25.2d, v24.2d\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v19.2d, v24.2d, v18.2d\n"
- "zip2 v18.2d, v24.2d, v18.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.2d, v23.2d, v22.2d\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip1 v17.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v16.2d, v20.2d, v19.2d\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v16.2d, v23.2d, v22.2d\n"
- "str q19, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
"str q21, [%x[out_ptr], #0x40]\n"
- "str q20, [%x[out_ptr], #0x50]\n"
- "str q18, [%x[out_ptr], #0x60]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr d27, [x28], #0x8\n"
- "ldr d26, [x27], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d24, [x27], #0x8\n"
"mov x20, #0x1\n"
- "ldr d25, [x26], #0x8\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
+ "ldr d23, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d18, [x21], #0x8\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v27.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
+ "ld1 { v25.s }[2], [x28]\n"
+ "ld1 { v24.s }[2], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v25.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v23.s }[2], [x26]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v18.s }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr s27, [x28, #0x0]\n"
- "ldr s26, [x27, #0x0]\n"
+ "ldr s25, [x28, #0x0]\n"
+ "ldr s24, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr s25, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
+ "ldr s23, [x26, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s21, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s18, [x21, #0x0]\n"
"5:" // Odd load end
"subs x20, x20, #0x1\n"
- "zip1 v19.2d, v27.2d, v26.2d\n"
- "zip1 v18.2d, v25.2d, v24.2d\n"
- "zip1 v17.2d, v23.2d, v22.2d\n"
- "zip1 v16.2d, v21.2d, v20.2d\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip1 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v21.2d, v20.2d\n"
+ "zip1 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 6f\n"
- "zip2 v19.2d, v27.2d, v26.2d\n"
- "zip2 v18.2d, v25.2d, v24.2d\n"
- "zip2 v17.2d, v23.2d, v22.2d\n"
- "zip2 v16.2d, v21.2d, v20.2d\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.2d, v21.2d, v20.2d\n"
+ "zip2 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"6:" // Odds skip
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
index edc1375b02..33639c201e 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 4, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset], LSL #1\n"
"add x27, x27, %x[row_offset], LSL #1\n"
"add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
"add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,142 +79,142 @@ void interleave_block<8, 4, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q21, [x28], #0x10\n"
- "ldr q16, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
- "ldr q20, [x26], #0x10\n"
- "ldr q19, [x25], #0x10\n"
"cmp %x[width], #0x8\n"
- "ldr q24, [x24], #0x10\n"
- "ldr q18, [x23], #0x10\n"
- "ldr q23, [x22], #0x10\n"
- "ldr q22, [x21], #0x10\n"
- "zip1 v17.2d, v21.2d, v16.2d\n"
- "zip2 v21.2d, v21.2d, v16.2d\n"
+ "ldr q25, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
+ "zip1 v18.2d, v25.2d, v24.2d\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip1 v17.2d, v23.2d, v22.2d\n"
+ "zip2 v21.2d, v20.2d, v19.2d\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q19, [x21], #0x10\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip1 v16.2d, v20.2d, v19.2d\n"
- "zip2 v20.2d, v20.2d, v19.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v18.2d, v25.2d, v24.2d\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v19.2d, v24.2d, v18.2d\n"
- "zip2 v18.2d, v24.2d, v18.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.2d, v23.2d, v22.2d\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip1 v17.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v16.2d, v20.2d, v19.2d\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v16.2d, v23.2d, v22.2d\n"
- "str q19, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
"str q21, [%x[out_ptr], #0x40]\n"
- "str q20, [%x[out_ptr], #0x50]\n"
- "str q18, [%x[out_ptr], #0x60]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr d27, [x28], #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d24, [x27], #0x8\n"
+ "ldr d23, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d18, [x21], #0x8\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v27.s }[2], [x28], #0x4\n"
- "ld1 { v26.s }[2], [x27], #0x4\n"
+ "ld1 { v25.s }[2], [x28], #0x4\n"
+ "ld1 { v24.s }[2], [x27], #0x4\n"
"mov x20, #0x2\n"
- "ld1 { v25.s }[2], [x26], #0x4\n"
- "ld1 { v24.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v21.s }[2], [x22], #0x4\n"
- "ld1 { v20.s }[2], [x21], #0x4\n"
+ "ld1 { v23.s }[2], [x26], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n"
+ "ld1 { v18.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v27.h }[6], [x28]\n"
- "ld1 { v26.h }[6], [x27]\n"
- "ld1 { v25.h }[6], [x26]\n"
- "ld1 { v24.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
- "ld1 { v22.h }[6], [x23]\n"
- "ld1 { v21.h }[6], [x22]\n"
- "ld1 { v20.h }[6], [x21]\n"
+ "ld1 { v25.h }[6], [x28]\n"
+ "ld1 { v24.h }[6], [x27]\n"
+ "ld1 { v23.h }[6], [x26]\n"
+ "ld1 { v22.h }[6], [x25]\n"
+ "ld1 { v21.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v19.h }[6], [x22]\n"
+ "ld1 { v18.h }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x20, #0x1\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v27.h }[4], [x28]\n"
- "ld1 { v26.h }[4], [x27]\n"
+ "ld1 { v25.h }[4], [x28]\n"
+ "ld1 { v24.h }[4], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v25.h }[4], [x26]\n"
- "ld1 { v24.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
- "ld1 { v22.h }[4], [x23]\n"
- "ld1 { v21.h }[4], [x22]\n"
- "ld1 { v20.h }[4], [x21]\n"
+ "ld1 { v23.h }[4], [x26]\n"
+ "ld1 { v22.h }[4], [x25]\n"
+ "ld1 { v21.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v19.h }[4], [x22]\n"
+ "ld1 { v18.h }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr s27, [x28], #0x4\n"
- "ldr s26, [x27], #0x4\n"
+ "ldr s25, [x28], #0x4\n"
+ "ldr s24, [x27], #0x4\n"
"mov x20, #0x1\n"
- "ldr s25, [x26], #0x4\n"
- "ldr s24, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s20, [x21], #0x4\n"
+ "ldr s23, [x26], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s18, [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v27.h }[2], [x28]\n"
- "ld1 { v26.h }[2], [x27]\n"
- "ld1 { v25.h }[2], [x26]\n"
- "ld1 { v24.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
- "ld1 { v22.h }[2], [x23]\n"
- "ld1 { v21.h }[2], [x22]\n"
- "ld1 { v20.h }[2], [x21]\n"
+ "ld1 { v25.h }[2], [x28]\n"
+ "ld1 { v24.h }[2], [x27]\n"
+ "ld1 { v23.h }[2], [x26]\n"
+ "ld1 { v22.h }[2], [x25]\n"
+ "ld1 { v21.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v19.h }[2], [x22]\n"
+ "ld1 { v18.h }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr h27, [x28, #0x0]\n"
- "ldr h26, [x27, #0x0]\n"
+ "ldr h25, [x28, #0x0]\n"
+ "ldr h24, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h25, [x26, #0x0]\n"
- "ldr h24, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
- "ldr h22, [x23, #0x0]\n"
- "ldr h21, [x22, #0x0]\n"
- "ldr h20, [x21, #0x0]\n"
+ "ldr h23, [x26, #0x0]\n"
+ "ldr h22, [x25, #0x0]\n"
+ "ldr h21, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h19, [x22, #0x0]\n"
+ "ldr h18, [x21, #0x0]\n"
"7:" // Odd load end
"subs x20, x20, #0x1\n"
- "zip1 v19.2d, v27.2d, v26.2d\n"
- "zip1 v18.2d, v25.2d, v24.2d\n"
- "zip1 v17.2d, v23.2d, v22.2d\n"
- "zip1 v16.2d, v21.2d, v20.2d\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip1 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v21.2d, v20.2d\n"
+ "zip1 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 8f\n"
- "zip2 v19.2d, v27.2d, v26.2d\n"
- "zip2 v18.2d, v25.2d, v24.2d\n"
- "zip2 v17.2d, v23.2d, v22.2d\n"
- "zip2 v16.2d, v21.2d, v20.2d\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.2d, v21.2d, v20.2d\n"
+ "zip2 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"8:" // Odds skip
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
index ef1493b605..3044cfde48 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 4, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset], LSL #2\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset], LSL #2\n"
"add x27, x27, %x[row_offset], LSL #2\n"
"add x26, x26, %x[row_offset], LSL #2\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #2\n"
"add x24, x24, %x[row_offset], LSL #2\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #2\n"
"add x22, x22, %x[row_offset], LSL #2\n"
"add x21, x21, %x[row_offset], LSL #2\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,36 +79,36 @@ void interleave_block<8, 4, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q19, [x28], #0x10\n"
- "ldr q18, [x26], #0x10\n"
- "subs %x[width], %x[width], #0x4\n"
+ "ldr q17, [x28], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ ".inst 0x0ea16a37 // bfcvtn v23.4h, v17.4s\n"
+ ".inst 0x0ea16a16 // bfcvtn v22.4h, v16.4s\n"
"ldr q17, [x24], #0x10\n"
"ldr q16, [x22], #0x10\n"
+ ".inst 0x0ea16a35 // bfcvtn v21.4h, v17.4s\n"
+ ".inst 0x0ea16a14 // bfcvtn v20.4h, v16.4s\n"
+ "ldr q19, [x27], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "subs %x[width], %x[width], #0x4\n"
"cmp %x[width], #0x4\n"
- "ldr q23, [x27], #0x10\n"
- "ldr q22, [x25], #0x10\n"
- "ldr q21, [x23], #0x10\n"
- "ldr q20, [x21], #0x10\n"
- ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
- ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
- ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
- ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ ".inst 0x4ea16a77 // bfcvtn2 v23.8h, v19.4s\n"
+ ".inst 0x4ea16a56 // bfcvtn2 v22.8h, v18.4s\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
+ ".inst 0x4ea16a35 // bfcvtn2 v21.8h, v17.4s\n"
+ ".inst 0x4ea16a14 // bfcvtn2 v20.8h, v16.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- ".inst 0x4ea16af3 // bfcvtn2 v19.8h, v23.4s\n"
- ".inst 0x4ea16ad2 // bfcvtn2 v18.8h, v22.4s\n"
+ "str q22, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- ".inst 0x4ea16ab1 // bfcvtn2 v17.8h, v21.4s\n"
- ".inst 0x4ea16a90 // bfcvtn2 v16.8h, v20.4s\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "str q17, [%x[out_ptr], #0x20]\n"
- "str q16, [%x[out_ptr], #0x30]\n"
+ "str q21, [%x[out_ptr], #0x20]\n"
+ "str q20, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"bge 2b\n"
"3:" // Main loop skip
@@ -150,9 +150,9 @@ void interleave_block<8, 4, VLType::None, false>(
".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
".inst 0x4ea16af3 // bfcvtn2 v19.8h, v23.4s\n"
".inst 0x4ea16ad2 // bfcvtn2 v18.8h, v22.4s\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
".inst 0x4ea16ab1 // bfcvtn2 v17.8h, v21.4s\n"
".inst 0x4ea16a90 // bfcvtn2 v16.8h, v20.4s\n"
- "str q19, [%x[out_ptr], #0x0]\n"
"str q18, [%x[out_ptr], #0x10]\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
index ad213db3e5..8bb9898861 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 4, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset]\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset]\n"
"add x27, x27, %x[row_offset]\n"
"add x26, x26, %x[row_offset]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
"add x24, x24, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,46 +79,46 @@ void interleave_block<8, 4, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q25, [x28], #0x10\n"
- "ldr q24, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q18, [x27], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
- "ldr q18, [x26], #0x10\n"
- "ldr q17, [x25], #0x10\n"
"cmp %x[width], #0x10\n"
- "ldr q23, [x24], #0x10\n"
- "ldr q22, [x23], #0x10\n"
- "ldr q21, [x22], #0x10\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.4s, v20.4s, v17.4s\n"
+ "zip1 v24.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip2 v22.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x22], #0x10\n"
"ldr q16, [x21], #0x10\n"
- "zip1 v20.4s, v25.4s, v18.4s\n"
- "zip1 v19.4s, v24.4s, v17.4s\n"
- "zip2 v26.4s, v25.4s, v18.4s\n"
- "zip2 v25.4s, v24.4s, v17.4s\n"
+ "zip1 v20.4s, v19.4s, v18.4s\n"
+ "zip1 v17.4s, v23.4s, v16.4s\n"
+ "zip2 v19.4s, v19.4s, v18.4s\n"
+ "zip2 v18.4s, v23.4s, v16.4s\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v18.4s, v23.4s, v21.4s\n"
- "zip1 v17.4s, v22.4s, v16.4s\n"
- "zip2 v24.4s, v23.4s, v21.4s\n"
- "zip2 v23.4s, v22.4s, v16.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip1 v16.4s, v20.4s, v19.4s\n"
- "zip2 v22.4s, v20.4s, v19.4s\n"
- "zip1 v21.4s, v18.4s, v17.4s\n"
- "zip2 v20.4s, v18.4s, v17.4s\n"
- "zip1 v19.4s, v26.4s, v25.4s\n"
- "zip1 v18.4s, v24.4s, v23.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v17.4s, v26.4s, v25.4s\n"
- "zip2 v16.4s, v24.4s, v23.4s\n"
- "str q21, [%x[out_ptr], #0x10]\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "str q19, [%x[out_ptr], #0x40]\n"
- "str q18, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v16.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
@@ -126,203 +126,203 @@ void interleave_block<8, 4, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 12f\n"
"tbz %x[width], #3, 7f\n"
- "ldr d29, [x28], #0x8\n"
- "ldr d28, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d28, [x28], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
"tbz %x[width], #2, 5f\n"
- "ld1 { v29.s }[2], [x28], #0x4\n"
- "ld1 { v28.s }[2], [x27], #0x4\n"
- "ld1 { v27.s }[2], [x26], #0x4\n"
- "ld1 { v26.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
- "ld1 { v23.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v28.s }[2], [x28], #0x4\n"
+ "ld1 { v27.s }[2], [x27], #0x4\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v24.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v21.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v29.h }[6], [x28], #0x2\n"
- "ld1 { v28.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x28], #0x2\n"
+ "ld1 { v27.h }[6], [x27], #0x2\n"
"mov x20, #0x4\n"
- "ld1 { v27.h }[6], [x26], #0x2\n"
- "ld1 { v26.h }[6], [x25], #0x2\n"
- "ld1 { v25.h }[6], [x24], #0x2\n"
- "ld1 { v24.h }[6], [x23], #0x2\n"
- "ld1 { v23.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v26.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v24.h }[6], [x24], #0x2\n"
+ "ld1 { v23.h }[6], [x23], #0x2\n"
+ "ld1 { v22.h }[6], [x22], #0x2\n"
+ "ld1 { v21.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v29.b }[14], [x28]\n"
- "ld1 { v28.b }[14], [x27]\n"
- "ld1 { v27.b }[14], [x26]\n"
- "ld1 { v26.b }[14], [x25]\n"
- "ld1 { v25.b }[14], [x24]\n"
- "ld1 { v24.b }[14], [x23]\n"
- "ld1 { v23.b }[14], [x22]\n"
- "ld1 { v22.b }[14], [x21]\n"
+ "ld1 { v28.b }[14], [x28]\n"
+ "ld1 { v27.b }[14], [x27]\n"
+ "ld1 { v26.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v24.b }[14], [x24]\n"
+ "ld1 { v23.b }[14], [x23]\n"
+ "ld1 { v22.b }[14], [x22]\n"
+ "ld1 { v21.b }[14], [x21]\n"
"b 11f\n"
"4:" // odd_loads_1_12
"mov x20, #0x3\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v29.b }[12], [x28]\n"
- "ld1 { v28.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x28]\n"
+ "ld1 { v27.b }[12], [x27]\n"
"mov x20, #0x4\n"
- "ld1 { v27.b }[12], [x26]\n"
- "ld1 { v26.b }[12], [x25]\n"
- "ld1 { v25.b }[12], [x24]\n"
- "ld1 { v24.b }[12], [x23]\n"
- "ld1 { v23.b }[12], [x22]\n"
- "ld1 { v22.b }[12], [x21]\n"
+ "ld1 { v26.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v24.b }[12], [x24]\n"
+ "ld1 { v23.b }[12], [x23]\n"
+ "ld1 { v22.b }[12], [x22]\n"
+ "ld1 { v21.b }[12], [x21]\n"
"b 11f\n"
"5:" // odd_loads_2_8
"tbz %x[width], #1, 6f\n"
- "ld1 { v29.h }[4], [x28], #0x2\n"
- "ld1 { v28.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x28], #0x2\n"
+ "ld1 { v27.h }[4], [x27], #0x2\n"
"mov x20, #0x3\n"
- "ld1 { v27.h }[4], [x26], #0x2\n"
- "ld1 { v26.h }[4], [x25], #0x2\n"
- "ld1 { v25.h }[4], [x24], #0x2\n"
- "ld1 { v24.h }[4], [x23], #0x2\n"
- "ld1 { v23.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v26.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v24.h }[4], [x24], #0x2\n"
+ "ld1 { v23.h }[4], [x23], #0x2\n"
+ "ld1 { v22.h }[4], [x22], #0x2\n"
+ "ld1 { v21.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v29.b }[10], [x28]\n"
- "ld1 { v28.b }[10], [x27]\n"
- "ld1 { v27.b }[10], [x26]\n"
- "ld1 { v26.b }[10], [x25]\n"
- "ld1 { v25.b }[10], [x24]\n"
- "ld1 { v24.b }[10], [x23]\n"
- "ld1 { v23.b }[10], [x22]\n"
- "ld1 { v22.b }[10], [x21]\n"
+ "ld1 { v28.b }[10], [x28]\n"
+ "ld1 { v27.b }[10], [x27]\n"
+ "ld1 { v26.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v24.b }[10], [x24]\n"
+ "ld1 { v23.b }[10], [x23]\n"
+ "ld1 { v22.b }[10], [x22]\n"
+ "ld1 { v21.b }[10], [x21]\n"
"b 11f\n"
"6:" // odd_loads_1_8
"mov x20, #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v29.b }[8], [x28]\n"
- "ld1 { v28.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x28]\n"
+ "ld1 { v27.b }[8], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v27.b }[8], [x26]\n"
- "ld1 { v26.b }[8], [x25]\n"
- "ld1 { v25.b }[8], [x24]\n"
- "ld1 { v24.b }[8], [x23]\n"
- "ld1 { v23.b }[8], [x22]\n"
- "ld1 { v22.b }[8], [x21]\n"
+ "ld1 { v26.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v24.b }[8], [x24]\n"
+ "ld1 { v23.b }[8], [x23]\n"
+ "ld1 { v22.b }[8], [x22]\n"
+ "ld1 { v21.b }[8], [x21]\n"
"b 11f\n"
"7:" // odd_loads_4_0
"tbz %x[width], #2, 9f\n"
- "ldr s29, [x28], #0x4\n"
- "ldr s28, [x27], #0x4\n"
- "ldr s27, [x26], #0x4\n"
- "ldr s26, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s28, [x28], #0x4\n"
+ "ldr s27, [x27], #0x4\n"
+ "ldr s26, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s24, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #1, 8f\n"
- "ld1 { v29.h }[2], [x28], #0x2\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x28], #0x2\n"
+ "ld1 { v27.h }[2], [x27], #0x2\n"
"mov x20, #0x2\n"
- "ld1 { v27.h }[2], [x26], #0x2\n"
- "ld1 { v26.h }[2], [x25], #0x2\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v24.h }[2], [x24], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v22.h }[2], [x22], #0x2\n"
+ "ld1 { v21.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v29.b }[6], [x28]\n"
- "ld1 { v28.b }[6], [x27]\n"
- "ld1 { v27.b }[6], [x26]\n"
- "ld1 { v26.b }[6], [x25]\n"
- "ld1 { v25.b }[6], [x24]\n"
- "ld1 { v24.b }[6], [x23]\n"
- "ld1 { v23.b }[6], [x22]\n"
- "ld1 { v22.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x28]\n"
+ "ld1 { v27.b }[6], [x27]\n"
+ "ld1 { v26.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v24.b }[6], [x24]\n"
+ "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v22.b }[6], [x22]\n"
+ "ld1 { v21.b }[6], [x21]\n"
"b 11f\n"
"8:" // odd_loads_1_4
"mov x20, #0x1\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v29.b }[4], [x28]\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x28]\n"
+ "ld1 { v27.b }[4], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v27.b }[4], [x26]\n"
- "ld1 { v26.b }[4], [x25]\n"
- "ld1 { v25.b }[4], [x24]\n"
- "ld1 { v24.b }[4], [x23]\n"
- "ld1 { v23.b }[4], [x22]\n"
- "ld1 { v22.b }[4], [x21]\n"
+ "ld1 { v26.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v24.b }[4], [x24]\n"
+ "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v22.b }[4], [x22]\n"
+ "ld1 { v21.b }[4], [x21]\n"
"b 11f\n"
"9:" // odd_loads_2_0
"tbz %x[width], #1, 10f\n"
- "ldr h29, [x28], #0x2\n"
- "ldr h28, [x27], #0x2\n"
+ "ldr h28, [x28], #0x2\n"
+ "ldr h27, [x27], #0x2\n"
"mov x20, #0x1\n"
- "ldr h27, [x26], #0x2\n"
- "ldr h26, [x25], #0x2\n"
- "ldr h25, [x24], #0x2\n"
- "ldr h24, [x23], #0x2\n"
- "ldr h23, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h26, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h24, [x24], #0x2\n"
+ "ldr h23, [x23], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
+ "ldr h21, [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v29.b }[2], [x28]\n"
- "ld1 { v28.b }[2], [x27]\n"
- "ld1 { v27.b }[2], [x26]\n"
- "ld1 { v26.b }[2], [x25]\n"
- "ld1 { v25.b }[2], [x24]\n"
- "ld1 { v24.b }[2], [x23]\n"
- "ld1 { v23.b }[2], [x22]\n"
- "ld1 { v22.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x28]\n"
+ "ld1 { v27.b }[2], [x27]\n"
+ "ld1 { v26.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v24.b }[2], [x24]\n"
+ "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v22.b }[2], [x22]\n"
+ "ld1 { v21.b }[2], [x21]\n"
"b 11f\n"
"10:" // odd_loads_1_0
- "ldr b29, [x28, #0x0]\n"
- "ldr b28, [x27, #0x0]\n"
+ "ldr b28, [x28, #0x0]\n"
+ "ldr b27, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr b27, [x26, #0x0]\n"
- "ldr b26, [x25, #0x0]\n"
- "ldr b25, [x24, #0x0]\n"
- "ldr b24, [x23, #0x0]\n"
- "ldr b23, [x22, #0x0]\n"
- "ldr b22, [x21, #0x0]\n"
+ "ldr b26, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b24, [x24, #0x0]\n"
+ "ldr b23, [x23, #0x0]\n"
+ "ldr b22, [x22, #0x0]\n"
+ "ldr b21, [x21, #0x0]\n"
"11:" // Odd load end
- "zip1 v21.4s, v29.4s, v27.4s\n"
"zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
"subs x20, x20, #0x1\n"
- "zip1 v19.4s, v25.4s, v23.4s\n"
"zip1 v18.4s, v24.4s, v22.4s\n"
- "zip1 v17.4s, v21.4s, v20.4s\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
"subs x20, x20, #0x1\n"
- "zip2 v17.4s, v21.4s, v20.4s\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
- "zip2 v21.4s, v29.4s, v27.4s\n"
"zip2 v20.4s, v28.4s, v26.4s\n"
+ "zip2 v19.4s, v27.4s, v25.4s\n"
"subs x20, x20, #0x1\n"
- "zip2 v19.4s, v25.4s, v23.4s\n"
"zip2 v18.4s, v24.4s, v22.4s\n"
- "zip1 v17.4s, v21.4s, v20.4s\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
- "zip2 v17.4s, v21.4s, v20.4s\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"12:" // Odds skip
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
index 28d2196ade..6e1efa3814 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
@@ -34,29 +34,29 @@ void interleave_block<8, 8, VLType::None, false>(
"ldr x28, [%x[in], #0x0]\n"
"ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset]\n"
"ldr x26, [%x[in], #0x10]\n"
"ldr x25, [%x[in], #0x18]\n"
- "ldr x24, [%x[in], #0x20]\n"
- "ldr x23, [%x[in], #0x28]\n"
- "ldr x22, [%x[in], #0x30]\n"
- "ldr x21, [%x[in], #0x38]\n"
- "add x28, x28, %x[row_offset]\n"
"add x27, x27, %x[row_offset]\n"
"add x26, x26, %x[row_offset]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
"add x24, x24, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
"beq 1f\n"
"cmp %x[height], #0x2\n"
- "mov x21, x28\n"
"csel x27, x27, x28, GE\n"
"csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
"csel x25, x25, x28, GE\n"
"csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
"csel x23, x23, x28, GE\n"
"csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
@@ -79,226 +79,226 @@ void interleave_block<8, 8, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q21, [x28], #0x10\n"
- "ldr q16, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
- "ldr q20, [x26], #0x10\n"
- "ldr q19, [x25], #0x10\n"
"cmp %x[width], #0x10\n"
- "ldr q24, [x24], #0x10\n"
- "ldr q18, [x23], #0x10\n"
- "ldr q23, [x22], #0x10\n"
- "ldr q22, [x21], #0x10\n"
- "zip1 v17.2d, v21.2d, v16.2d\n"
- "zip2 v21.2d, v21.2d, v16.2d\n"
+ "ldr q25, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
+ "zip1 v18.2d, v25.2d, v24.2d\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip1 v17.2d, v23.2d, v22.2d\n"
+ "zip2 v21.2d, v20.2d, v19.2d\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q19, [x21], #0x10\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip1 v16.2d, v20.2d, v19.2d\n"
- "zip2 v20.2d, v20.2d, v19.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v18.2d, v25.2d, v24.2d\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v19.2d, v24.2d, v18.2d\n"
- "zip2 v18.2d, v24.2d, v18.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.2d, v23.2d, v22.2d\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip1 v17.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v16.2d, v20.2d, v19.2d\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v16.2d, v23.2d, v22.2d\n"
- "str q19, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
"str q21, [%x[out_ptr], #0x40]\n"
- "str q20, [%x[out_ptr], #0x50]\n"
- "str q18, [%x[out_ptr], #0x60]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 12f\n"
"tbz %x[width], #3, 7f\n"
- "ldr d27, [x28], #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d24, [x27], #0x8\n"
+ "ldr d23, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d18, [x21], #0x8\n"
"tbz %x[width], #2, 5f\n"
- "ld1 { v27.s }[2], [x28], #0x4\n"
- "ld1 { v26.s }[2], [x27], #0x4\n"
- "ld1 { v25.s }[2], [x26], #0x4\n"
- "ld1 { v24.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v21.s }[2], [x22], #0x4\n"
- "ld1 { v20.s }[2], [x21], #0x4\n"
+ "ld1 { v25.s }[2], [x28], #0x4\n"
+ "ld1 { v24.s }[2], [x27], #0x4\n"
+ "ld1 { v23.s }[2], [x26], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n"
+ "ld1 { v18.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v27.h }[6], [x28], #0x2\n"
- "ld1 { v26.h }[6], [x27], #0x2\n"
+ "ld1 { v25.h }[6], [x28], #0x2\n"
+ "ld1 { v24.h }[6], [x27], #0x2\n"
"mov x20, #0x2\n"
- "ld1 { v25.h }[6], [x26], #0x2\n"
- "ld1 { v24.h }[6], [x25], #0x2\n"
- "ld1 { v23.h }[6], [x24], #0x2\n"
- "ld1 { v22.h }[6], [x23], #0x2\n"
- "ld1 { v21.h }[6], [x22], #0x2\n"
- "ld1 { v20.h }[6], [x21], #0x2\n"
+ "ld1 { v23.h }[6], [x26], #0x2\n"
+ "ld1 { v22.h }[6], [x25], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v19.h }[6], [x22], #0x2\n"
+ "ld1 { v18.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[14], [x28]\n"
- "ld1 { v26.b }[14], [x27]\n"
- "ld1 { v25.b }[14], [x26]\n"
- "ld1 { v24.b }[14], [x25]\n"
- "ld1 { v23.b }[14], [x24]\n"
- "ld1 { v22.b }[14], [x23]\n"
- "ld1 { v21.b }[14], [x22]\n"
- "ld1 { v20.b }[14], [x21]\n"
+ "ld1 { v25.b }[14], [x28]\n"
+ "ld1 { v24.b }[14], [x27]\n"
+ "ld1 { v23.b }[14], [x26]\n"
+ "ld1 { v22.b }[14], [x25]\n"
+ "ld1 { v21.b }[14], [x24]\n"
+ "ld1 { v20.b }[14], [x23]\n"
+ "ld1 { v19.b }[14], [x22]\n"
+ "ld1 { v18.b }[14], [x21]\n"
"b 11f\n"
"4:" // odd_loads_1_12
"mov x20, #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[12], [x28]\n"
- "ld1 { v26.b }[12], [x27]\n"
- "ld1 { v25.b }[12], [x26]\n"
- "ld1 { v24.b }[12], [x25]\n"
- "ld1 { v23.b }[12], [x24]\n"
- "ld1 { v22.b }[12], [x23]\n"
- "ld1 { v21.b }[12], [x22]\n"
- "ld1 { v20.b }[12], [x21]\n"
+ "ld1 { v25.b }[12], [x28]\n"
+ "ld1 { v24.b }[12], [x27]\n"
+ "ld1 { v23.b }[12], [x26]\n"
+ "ld1 { v22.b }[12], [x25]\n"
+ "ld1 { v21.b }[12], [x24]\n"
+ "ld1 { v20.b }[12], [x23]\n"
+ "ld1 { v19.b }[12], [x22]\n"
+ "ld1 { v18.b }[12], [x21]\n"
"b 11f\n"
"5:" // odd_loads_2_8
"tbz %x[width], #1, 6f\n"
- "ld1 { v27.h }[4], [x28], #0x2\n"
- "ld1 { v26.h }[4], [x27], #0x2\n"
+ "ld1 { v25.h }[4], [x28], #0x2\n"
+ "ld1 { v24.h }[4], [x27], #0x2\n"
"mov x20, #0x2\n"
- "ld1 { v25.h }[4], [x26], #0x2\n"
- "ld1 { v24.h }[4], [x25], #0x2\n"
- "ld1 { v23.h }[4], [x24], #0x2\n"
- "ld1 { v22.h }[4], [x23], #0x2\n"
- "ld1 { v21.h }[4], [x22], #0x2\n"
- "ld1 { v20.h }[4], [x21], #0x2\n"
+ "ld1 { v23.h }[4], [x26], #0x2\n"
+ "ld1 { v22.h }[4], [x25], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v19.h }[4], [x22], #0x2\n"
+ "ld1 { v18.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[10], [x28]\n"
- "ld1 { v26.b }[10], [x27]\n"
- "ld1 { v25.b }[10], [x26]\n"
- "ld1 { v24.b }[10], [x25]\n"
- "ld1 { v23.b }[10], [x24]\n"
- "ld1 { v22.b }[10], [x23]\n"
- "ld1 { v21.b }[10], [x22]\n"
- "ld1 { v20.b }[10], [x21]\n"
+ "ld1 { v25.b }[10], [x28]\n"
+ "ld1 { v24.b }[10], [x27]\n"
+ "ld1 { v23.b }[10], [x26]\n"
+ "ld1 { v22.b }[10], [x25]\n"
+ "ld1 { v21.b }[10], [x24]\n"
+ "ld1 { v20.b }[10], [x23]\n"
+ "ld1 { v19.b }[10], [x22]\n"
+ "ld1 { v18.b }[10], [x21]\n"
"b 11f\n"
"6:" // odd_loads_1_8
"mov x20, #0x1\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[8], [x28]\n"
- "ld1 { v26.b }[8], [x27]\n"
+ "ld1 { v25.b }[8], [x28]\n"
+ "ld1 { v24.b }[8], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v25.b }[8], [x26]\n"
- "ld1 { v24.b }[8], [x25]\n"
- "ld1 { v23.b }[8], [x24]\n"
- "ld1 { v22.b }[8], [x23]\n"
- "ld1 { v21.b }[8], [x22]\n"
- "ld1 { v20.b }[8], [x21]\n"
+ "ld1 { v23.b }[8], [x26]\n"
+ "ld1 { v22.b }[8], [x25]\n"
+ "ld1 { v21.b }[8], [x24]\n"
+ "ld1 { v20.b }[8], [x23]\n"
+ "ld1 { v19.b }[8], [x22]\n"
+ "ld1 { v18.b }[8], [x21]\n"
"b 11f\n"
"7:" // odd_loads_4_0
"tbz %x[width], #2, 9f\n"
- "ldr s27, [x28], #0x4\n"
- "ldr s26, [x27], #0x4\n"
- "ldr s25, [x26], #0x4\n"
- "ldr s24, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s20, [x21], #0x4\n"
+ "ldr s25, [x28], #0x4\n"
+ "ldr s24, [x27], #0x4\n"
+ "ldr s23, [x26], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s18, [x21], #0x4\n"
"tbz %x[width], #1, 8f\n"
- "ld1 { v27.h }[2], [x28], #0x2\n"
- "ld1 { v26.h }[2], [x27], #0x2\n"
+ "ld1 { v25.h }[2], [x28], #0x2\n"
+ "ld1 { v24.h }[2], [x27], #0x2\n"
"mov x20, #0x1\n"
- "ld1 { v25.h }[2], [x26], #0x2\n"
- "ld1 { v24.h }[2], [x25], #0x2\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
- "ld1 { v22.h }[2], [x23], #0x2\n"
- "ld1 { v21.h }[2], [x22], #0x2\n"
- "ld1 { v20.h }[2], [x21], #0x2\n"
+ "ld1 { v23.h }[2], [x26], #0x2\n"
+ "ld1 { v22.h }[2], [x25], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v18.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[6], [x28]\n"
- "ld1 { v26.b }[6], [x27]\n"
- "ld1 { v25.b }[6], [x26]\n"
- "ld1 { v24.b }[6], [x25]\n"
- "ld1 { v23.b }[6], [x24]\n"
- "ld1 { v22.b }[6], [x23]\n"
- "ld1 { v21.b }[6], [x22]\n"
- "ld1 { v20.b }[6], [x21]\n"
+ "ld1 { v25.b }[6], [x28]\n"
+ "ld1 { v24.b }[6], [x27]\n"
+ "ld1 { v23.b }[6], [x26]\n"
+ "ld1 { v22.b }[6], [x25]\n"
+ "ld1 { v21.b }[6], [x24]\n"
+ "ld1 { v20.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v18.b }[6], [x21]\n"
"b 11f\n"
"8:" // odd_loads_1_4
"mov x20, #0x1\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[4], [x28]\n"
- "ld1 { v26.b }[4], [x27]\n"
- "ld1 { v25.b }[4], [x26]\n"
- "ld1 { v24.b }[4], [x25]\n"
- "ld1 { v23.b }[4], [x24]\n"
- "ld1 { v22.b }[4], [x23]\n"
- "ld1 { v21.b }[4], [x22]\n"
- "ld1 { v20.b }[4], [x21]\n"
+ "ld1 { v25.b }[4], [x28]\n"
+ "ld1 { v24.b }[4], [x27]\n"
+ "ld1 { v23.b }[4], [x26]\n"
+ "ld1 { v22.b }[4], [x25]\n"
+ "ld1 { v21.b }[4], [x24]\n"
+ "ld1 { v20.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v18.b }[4], [x21]\n"
"b 11f\n"
"9:" // odd_loads_2_0
"tbz %x[width], #1, 10f\n"
- "ldr h27, [x28], #0x2\n"
- "ldr h26, [x27], #0x2\n"
+ "ldr h25, [x28], #0x2\n"
+ "ldr h24, [x27], #0x2\n"
"mov x20, #0x1\n"
- "ldr h25, [x26], #0x2\n"
- "ldr h24, [x25], #0x2\n"
- "ldr h23, [x24], #0x2\n"
- "ldr h22, [x23], #0x2\n"
- "ldr h21, [x22], #0x2\n"
- "ldr h20, [x21], #0x2\n"
+ "ldr h23, [x26], #0x2\n"
+ "ldr h22, [x25], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h19, [x22], #0x2\n"
+ "ldr h18, [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[2], [x28]\n"
- "ld1 { v26.b }[2], [x27]\n"
- "ld1 { v25.b }[2], [x26]\n"
- "ld1 { v24.b }[2], [x25]\n"
- "ld1 { v23.b }[2], [x24]\n"
- "ld1 { v22.b }[2], [x23]\n"
- "ld1 { v21.b }[2], [x22]\n"
- "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v25.b }[2], [x28]\n"
+ "ld1 { v24.b }[2], [x27]\n"
+ "ld1 { v23.b }[2], [x26]\n"
+ "ld1 { v22.b }[2], [x25]\n"
+ "ld1 { v21.b }[2], [x24]\n"
+ "ld1 { v20.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v18.b }[2], [x21]\n"
"b 11f\n"
"10:" // odd_loads_1_0
- "ldr b27, [x28, #0x0]\n"
- "ldr b26, [x27, #0x0]\n"
+ "ldr b25, [x28, #0x0]\n"
+ "ldr b24, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr b25, [x26, #0x0]\n"
- "ldr b24, [x25, #0x0]\n"
- "ldr b23, [x24, #0x0]\n"
- "ldr b22, [x23, #0x0]\n"
- "ldr b21, [x22, #0x0]\n"
- "ldr b20, [x21, #0x0]\n"
+ "ldr b23, [x26, #0x0]\n"
+ "ldr b22, [x25, #0x0]\n"
+ "ldr b21, [x24, #0x0]\n"
+ "ldr b20, [x23, #0x0]\n"
+ "ldr b19, [x22, #0x0]\n"
+ "ldr b18, [x21, #0x0]\n"
"11:" // Odd load end
"subs x20, x20, #0x1\n"
- "zip1 v19.2d, v27.2d, v26.2d\n"
- "zip1 v18.2d, v25.2d, v24.2d\n"
- "zip1 v17.2d, v23.2d, v22.2d\n"
- "zip1 v16.2d, v21.2d, v20.2d\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip1 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v21.2d, v20.2d\n"
+ "zip1 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 12f\n"
- "zip2 v19.2d, v27.2d, v26.2d\n"
- "zip2 v18.2d, v25.2d, v24.2d\n"
- "zip2 v17.2d, v23.2d, v22.2d\n"
- "zip2 v16.2d, v21.2d, v20.2d\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.2d, v21.2d, v20.2d\n"
+ "zip2 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"12:" // Odds skip
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
index ff171984e7..8ed20a1e48 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
@@ -32,103 +32,103 @@ void interleave_block<1, 2, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "sub x28, %x[width], #0x1\n"
- "mov x27, #0x0\n"
"cntw x22, ALL, MUL #2\n"
+ "sub x28, %x[width], #0x1\n"
"cntw x21, ALL, MUL #2\n"
"sub x20, x22, #0x1\n"
"whilelt p10.s, XZR, %x[height]\n"
"add x28, x28, x21\n"
- "ands x26, %x[width], x20\n"
+ "ands x27, %x[width], x20\n"
"udiv x28, x28, x21\n"
- "csel x26, x26, x22, NE\n"
+ "csel x27, x27, x22, NE\n"
+ "mov x26, #0x0\n"
"and x25, x28, #0x1\n"
"sub x28, x28, #0x1\n"
- "add x26, x26, #0x1\n"
+ "add x27, x27, #0x1\n"
"mov x20, %x[width]\n"
"ptrue p0.b\n"
"mov x24, %x[outptr_raw]\n"
"mov x23, %x[row_offset]\n"
"cntw x22\n"
"lsr x28, x28, #0x1\n"
- "lsr x26, x26, #0x1\n"
+ "lsr x27, x27, #0x1\n"
"mov x12, #0x0\n"
- ".inst 0x25b44771 // whilelt pn9.s, x27, x20, VLx2\n"
+ ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n"
"mov x21, %x[in]\n"
"1:" // Width loop: Preamble: Loop
"ldr x20, [x21], #0x8\n"
".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n"
- ".inst 0xa0174294 // ld1w { z20.s-z21.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
- ".inst 0xc160e294 // bfcvt z20.h, { z20.s-z21.s }\n"
- ".inst 0xc0800280 // mova za0h.s[x12], p0/M, z20.s\n"
+ ".inst 0xa0174286 // ld1w { z6.s-z7.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
+ ".inst 0xc160e0c6 // bfcvt z6.h, { z6.s-z7.s }\n"
+ ".inst 0xc08000c0 // mova za0h.s[x12], p0/M, z6.s\n"
"add x12, x12, #0x1\n"
"cmp x12, x22\n"
"blt 1b\n"
"incw x23, ALL, MUL #2\n"
- "incw x27, ALL, MUL #2\n"
+ "incw x26, ALL, MUL #2\n"
"cbz x28, 5f\n"
"2:" // Width loop
"mov x20, %x[width]\n"
"mov x12, #0x0\n"
- ".inst 0x25b44771 // whilelt pn9.s, x27, x20, VLx2\n"
+ ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n"
"mov x21, %x[in]\n"
"3:" // Width loop: Odd: Loop
"ldr x20, [x21], #0x8\n"
".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n"
- ".inst 0xc0828018 // mova z24.s, p0/M, za0v.s[x12]\n"
- ".inst 0xa0174294 // ld1w { z20.s-z21.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
- "st1w { z24.s }, p0, [x24]\n"
- "addvl x24, x24, #1\n"
- ".inst 0xc160e294 // bfcvt z20.h, { z20.s-z21.s }\n"
- ".inst 0xc0800288 // mova za2h.s[x12], p0/M, z20.s\n"
+ ".inst 0xa017429e // ld1w { z30.s-z31.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
+ ".inst 0xc160e3de // bfcvt z30.h, { z30.s-z31.s }\n"
+ ".inst 0xc08003c8 // mova za2h.s[x12], p0/M, z30.s\n"
+ ".inst 0xc082800f // mova z15.s, p0/M, za0v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x22\n"
+ "st1w { z15.s }, p0, [x24]\n"
+ "addvl x24, x24, #1\n"
"blt 3b\n"
- "incw x27, ALL, MUL #2\n"
+ "incw x26, ALL, MUL #2\n"
"mov x20, %x[width]\n"
"incw x23, ALL, MUL #2\n"
"mov x12, #0x0\n"
- ".inst 0x25b44771 // whilelt pn9.s, x27, x20, VLx2\n"
+ ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n"
"mov x21, %x[in]\n"
"4:" // Width loop: Even: Loop
"ldr x20, [x21], #0x8\n"
".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n"
- ".inst 0xc082810c // mova z12.s, p0/M, za2v.s[x12]\n"
- ".inst 0xa0174284 // ld1w { z4.s-z5.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
- "st1w { z12.s }, p0, [x24]\n"
- "addvl x24, x24, #1\n"
- ".inst 0xc160e084 // bfcvt z4.h, { z4.s-z5.s }\n"
- ".inst 0xc0800080 // mova za0h.s[x12], p0/M, z4.s\n"
+ ".inst 0xa0174298 // ld1w { z24.s-z25.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
+ ".inst 0xc160e318 // bfcvt z24.h, { z24.s-z25.s }\n"
+ ".inst 0xc0800300 // mova za0h.s[x12], p0/M, z24.s\n"
+ ".inst 0xc0828110 // mova z16.s, p0/M, za2v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x22\n"
+ "st1w { z16.s }, p0, [x24]\n"
+ "addvl x24, x24, #1\n"
"blt 4b\n"
"subs x28, x28, #0x1\n"
"incw x23, ALL, MUL #2\n"
- "incw x27, ALL, MUL #2\n"
+ "incw x26, ALL, MUL #2\n"
"bgt 2b\n"
"5:" // Width loop: Tails
"cbnz x25, 8f\n"
"mov x20, %x[width]\n"
"mov x12, #0x0\n"
- ".inst 0x25b44771 // whilelt pn9.s, x27, x20, VLx2\n"
+ ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n"
"mov x21, %x[in]\n"
"6:" // Width loop: Tails: Even: Odd: Loop
"ldr x20, [x21], #0x8\n"
".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n"
- ".inst 0xc0828010 // mova z16.s, p0/M, za0v.s[x12]\n"
".inst 0xa017428e // ld1w { z14.s-z15.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
- "st1w { z16.s }, p0, [x24]\n"
- "addvl x24, x24, #1\n"
".inst 0xc160e1ce // bfcvt z14.h, { z14.s-z15.s }\n"
".inst 0xc08001c8 // mova za2h.s[x12], p0/M, z14.s\n"
+ ".inst 0xc0828010 // mova z16.s, p0/M, za0v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x22\n"
+ "st1w { z16.s }, p0, [x24]\n"
+ "addvl x24, x24, #1\n"
"blt 6b\n"
"mov x12, #0x0\n"
"7:" // Width loop: Tails: Even: Even: Loop
".inst 0xc0828110 // mova z16.s, p0/M, za2v.s[x12]\n"
"add x12, x12, #0x1\n"
- "cmp x12, x26\n"
+ "cmp x12, x27\n"
"st1w { z16.s }, p0, [x24]\n"
"addvl x24, x24, #1\n"
"blt 7b\n"
@@ -138,7 +138,7 @@ void interleave_block<1, 2, VLType::SME, false>(
"9:" // Width loop: Tails: Odd: Loop
".inst 0xc0828010 // mova z16.s, p0/M, za0v.s[x12]\n"
"add x12, x12, #0x1\n"
- "cmp x12, x26\n"
+ "cmp x12, x27\n"
"st1w { z16.s }, p0, [x24]\n"
"addvl x24, x24, #1\n"
"blt 9b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
index 6d3601d165..aaa01039cf 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
@@ -32,126 +32,126 @@ void interleave_block<2, 2, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "sub x10, %x[width], #0x1\n"
- "mov x9, #0x0\n"
"cntw x22, ALL, MUL #2\n"
- "cntw x28\n"
+ "cntw x9\n"
+ "sub x28, %x[width], #0x1\n"
"cntw x21, ALL, MUL #2\n"
"sub x20, x22, #0x1\n"
".inst 0x25207815 // ptrue pn13.b\n"
"whilelt p12.s, XZR, %x[height]\n"
- "whilelt p11.s, x28, %x[height]\n"
- "add x10, x10, x21\n"
+ "whilelt p11.s, x9, %x[height]\n"
+ "add x28, x28, x21\n"
"ands x27, %x[width], x20\n"
- "udiv x10, x10, x21\n"
+ "udiv x28, x28, x21\n"
"csel x27, x27, x22, NE\n"
- "and x26, x10, #0x1\n"
- "sub x10, x10, #0x1\n"
+ "mov x26, #0x0\n"
+ "and x25, x28, #0x1\n"
+ "sub x28, x28, #0x1\n"
"add x27, x27, #0x1\n"
"mov x20, %x[width]\n"
- "mov x25, %x[in]\n"
+ "mov x24, %x[in]\n"
"ptrue p0.b\n"
- "mov x24, %x[outptr_raw]\n"
- "mov x23, %x[row_offset]\n"
- "lsr x10, x10, #0x1\n"
+ "mov x23, %x[outptr_raw]\n"
+ "mov x22, %x[row_offset]\n"
+ "lsr x28, x28, #0x1\n"
"lsr x27, x27, #0x1\n"
"mov x12, #0x0\n"
- ".inst 0x25b44532 // whilelt pn10.s, x9, x20, VLx2\n"
- "add x22, x25, x28, LSL #3\n"
+ ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n"
+ "add x21, x24, x9, LSL #3\n"
"1:" // Width loop: Preamble: Loop
- "ldr x21, [x25], #0x8\n"
+ "ldr x20, [x24], #0x8\n"
".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n"
".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n"
- "ldr x20, [x22], #0x8\n"
- ".inst 0xa01746b4 // ld1w { z20.s-z21.s }, pn9.s/Z, [x21, x23, LSL #2]\n"
- ".inst 0xa017428c // ld1w { z12.s-z13.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
- ".inst 0xc160e294 // bfcvt z20.h, { z20.s-z21.s }\n"
- ".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n"
- ".inst 0xc0800280 // mova za0h.s[x12], p0/M, z20.s\n"
- ".inst 0xc0800184 // mova za1h.s[x12], p0/M, z12.s\n"
+ ".inst 0xa0164698 // ld1w { z24.s-z25.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa0164296 // ld1w { z22.s-z23.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e318 // bfcvt z24.h, { z24.s-z25.s }\n"
+ ".inst 0xc160e2d6 // bfcvt z22.h, { z22.s-z23.s }\n"
+ ".inst 0xc0800300 // mova za0h.s[x12], p0/M, z24.s\n"
+ ".inst 0xc08002c4 // mova za1h.s[x12], p0/M, z22.s\n"
"add x12, x12, #0x1\n"
- "cmp x12, x28\n"
+ "cmp x12, x9\n"
"blt 1b\n"
- "incw x23, ALL, MUL #2\n"
- "incw x9, ALL, MUL #2\n"
- "cbz x10, 5f\n"
+ "incw x22, ALL, MUL #2\n"
+ "incw x26, ALL, MUL #2\n"
+ "cbz x28, 5f\n"
"2:" // Width loop
"mov x20, %x[width]\n"
- "mov x25, %x[in]\n"
+ "mov x24, %x[in]\n"
"mov x12, #0x0\n"
- ".inst 0x25b44532 // whilelt pn10.s, x9, x20, VLx2\n"
- "add x22, x25, x28, LSL #3\n"
+ ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n"
+ "add x21, x24, x9, LSL #3\n"
"3:" // Width loop: Odd: Loop
- "ldr x21, [x25], #0x8\n"
+ "ldr x20, [x24], #0x8\n"
".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n"
".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n"
- ".inst 0xc0828007 // mova z7.s, p0/M, za0v.s[x12]\n"
- "ldr x20, [x22], #0x8\n"
- ".inst 0xc082808f // mova z15.s, p0/M, za1v.s[x12]\n"
- ".inst 0xa01746b6 // ld1w { z22.s-z23.s }, pn9.s/Z, [x21, x23, LSL #2]\n"
- ".inst 0xa017429a // ld1w { z26.s-z27.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
- ".inst 0xa1605707 // st1w { z7.s, z15.s }, pn13.b, [x24]\n"
- "addvl x24, x24, #2\n"
+ ".inst 0xa0164696 // ld1w { z22.s-z23.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa016428a // ld1w { z10.s-z11.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
".inst 0xc160e2d6 // bfcvt z22.h, { z22.s-z23.s }\n"
- ".inst 0xc160e35a // bfcvt z26.h, { z26.s-z27.s }\n"
+ ".inst 0xc160e14a // bfcvt z10.h, { z10.s-z11.s }\n"
".inst 0xc08002c8 // mova za2h.s[x12], p0/M, z22.s\n"
- ".inst 0xc080034c // mova za3h.s[x12], p0/M, z26.s\n"
+ ".inst 0xc080014c // mova za3h.s[x12], p0/M, z10.s\n"
+ ".inst 0xc0828008 // mova z8.s, p0/M, za0v.s[x12]\n"
+ ".inst 0xc0828089 // mova z9.s, p0/M, za1v.s[x12]\n"
"add x12, x12, #0x1\n"
- "cmp x12, x28\n"
+ "cmp x12, x9\n"
+ ".inst 0xa06056e8 // st1w { z8.s-z9.s }, pn13.b, [x23]\n"
+ "addvl x23, x23, #2\n"
"blt 3b\n"
- "incw x9, ALL, MUL #2\n"
+ "incw x26, ALL, MUL #2\n"
"mov x20, %x[width]\n"
- "mov x25, %x[in]\n"
- "incw x23, ALL, MUL #2\n"
+ "mov x24, %x[in]\n"
+ "incw x22, ALL, MUL #2\n"
"mov x12, #0x0\n"
- ".inst 0x25b44532 // whilelt pn10.s, x9, x20, VLx2\n"
- "add x22, x25, x28, LSL #3\n"
+ ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n"
+ "add x21, x24, x9, LSL #3\n"
"4:" // Width loop: Even: Loop
- "ldr x21, [x25], #0x8\n"
+ "ldr x20, [x24], #0x8\n"
".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n"
".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n"
- ".inst 0xc0828108 // mova z8.s, p0/M, za2v.s[x12]\n"
- "ldr x20, [x22], #0x8\n"
- ".inst 0xc0828189 // mova z9.s, p0/M, za3v.s[x12]\n"
- ".inst 0xa01746ae // ld1w { z14.s-z15.s }, pn9.s/Z, [x21, x23, LSL #2]\n"
- ".inst 0xa017428c // ld1w { z12.s-z13.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
- ".inst 0xa0605708 // st1w { z8.s-z9.s }, pn13.b, [x24]\n"
- "addvl x24, x24, #2\n"
- ".inst 0xc160e1ce // bfcvt z14.h, { z14.s-z15.s }\n"
- ".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n"
- ".inst 0xc08001c0 // mova za0h.s[x12], p0/M, z14.s\n"
- ".inst 0xc0800184 // mova za1h.s[x12], p0/M, z12.s\n"
+ ".inst 0xa016469a // ld1w { z26.s-z27.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa016429e // ld1w { z30.s-z31.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e35a // bfcvt z26.h, { z26.s-z27.s }\n"
+ ".inst 0xc160e3de // bfcvt z30.h, { z30.s-z31.s }\n"
+ ".inst 0xc0800340 // mova za0h.s[x12], p0/M, z26.s\n"
+ ".inst 0xc08003c4 // mova za1h.s[x12], p0/M, z30.s\n"
+ ".inst 0xc0828106 // mova z6.s, p0/M, za2v.s[x12]\n"
+ ".inst 0xc082818e // mova z14.s, p0/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
- "cmp x12, x28\n"
+ "cmp x12, x9\n"
+ ".inst 0xa16056e6 // st1w { z6.s, z14.s }, pn13.b, [x23]\n"
+ "addvl x23, x23, #2\n"
"blt 4b\n"
- "subs x10, x10, #0x1\n"
- "incw x23, ALL, MUL #2\n"
- "incw x9, ALL, MUL #2\n"
+ "subs x28, x28, #0x1\n"
+ "incw x22, ALL, MUL #2\n"
+ "incw x26, ALL, MUL #2\n"
"bgt 2b\n"
"5:" // Width loop: Tails
- "cbnz x26, 8f\n"
+ "cbnz x25, 8f\n"
"mov x20, %x[width]\n"
- "mov x25, %x[in]\n"
+ "mov x24, %x[in]\n"
"mov x12, #0x0\n"
- ".inst 0x25b44532 // whilelt pn10.s, x9, x20, VLx2\n"
- "add x22, x25, x28, LSL #3\n"
+ ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n"
+ "add x21, x24, x9, LSL #3\n"
"6:" // Width loop: Tails: Even: Odd: Loop
- "ldr x21, [x25], #0x8\n"
+ "ldr x20, [x24], #0x8\n"
".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n"
".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n"
- ".inst 0xc0828003 // mova z3.s, p0/M, za0v.s[x12]\n"
- "ldr x20, [x22], #0x8\n"
- ".inst 0xc082808b // mova z11.s, p0/M, za1v.s[x12]\n"
- ".inst 0xa01746ac // ld1w { z12.s-z13.s }, pn9.s/Z, [x21, x23, LSL #2]\n"
- ".inst 0xa017428e // ld1w { z14.s-z15.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
- ".inst 0xa1605703 // st1w { z3.s, z11.s }, pn13.b, [x24]\n"
- "addvl x24, x24, #2\n"
+ ".inst 0xa016468c // ld1w { z12.s-z13.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa016428e // ld1w { z14.s-z15.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n"
".inst 0xc160e1ce // bfcvt z14.h, { z14.s-z15.s }\n"
".inst 0xc0800188 // mova za2h.s[x12], p0/M, z12.s\n"
".inst 0xc08001cc // mova za3h.s[x12], p0/M, z14.s\n"
+ ".inst 0xc0828007 // mova z7.s, p0/M, za0v.s[x12]\n"
+ ".inst 0xc082808f // mova z15.s, p0/M, za1v.s[x12]\n"
"add x12, x12, #0x1\n"
- "cmp x12, x28\n"
+ "cmp x12, x9\n"
+ ".inst 0xa16056e7 // st1w { z7.s, z15.s }, pn13.b, [x23]\n"
+ "addvl x23, x23, #2\n"
"blt 6b\n"
"mov x12, #0x0\n"
"7:" // Width loop: Tails: Even: Even: Loop
@@ -159,8 +159,8 @@ void interleave_block<2, 2, VLType::SME, false>(
".inst 0xc082818f // mova z15.s, p0/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x27\n"
- ".inst 0xa060570e // st1w { z14.s-z15.s }, pn13.b, [x24]\n"
- "addvl x24, x24, #2\n"
+ ".inst 0xa06056ee // st1w { z14.s-z15.s }, pn13.b, [x23]\n"
+ "addvl x23, x23, #2\n"
"blt 7b\n"
"b 10f\n"
"8:" // Width loop: Tails: Odd
@@ -170,15 +170,15 @@ void interleave_block<2, 2, VLType::SME, false>(
".inst 0xc0828095 // mova z21.s, p0/M, za1v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x27\n"
- ".inst 0xa0605714 // st1w { z20.s-z21.s }, pn13.b, [x24]\n"
- "addvl x24, x24, #2\n"
+ ".inst 0xa06056f4 // st1w { z20.s-z21.s }, pn13.b, [x23]\n"
+ "addvl x23, x23, #2\n"
"blt 9b\n"
"10:" // End
- "mov %x[outptr_raw], x24\n"
+ "mov %x[outptr_raw], x23\n"
".inst 0xd503467f // SMSTOP\n"
: [outptr_raw] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
index a8187f78e8..83a7f62693 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
@@ -32,63 +32,63 @@ void interleave_block<4, 2, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "sub x14, %x[width], #0x1\n"
- "mov x13, %x[in]\n"
"cntw x23, ALL, MUL #2\n"
- "cntw x11\n"
+ "cntw x10\n"
"cntw x22, ALL, MUL #2\n"
"cntw x20, ALL, MUL #3\n"
"sub x21, x23, #0x1\n"
".inst 0x25207817 // ptrue pn15.b\n"
- "whilelt p2.s, XZR, %x[height]\n"
- "whilelt p1.s, x11, %x[height]\n"
- "whilelt p14.s, x22, %x[height]\n"
- "whilelt p13.s, x20, %x[height]\n"
+ "whilelt p1.s, XZR, %x[height]\n"
+ "whilelt p14.s, x10, %x[height]\n"
+ "whilelt p13.s, x22, %x[height]\n"
+ "whilelt p12.s, x20, %x[height]\n"
+ "sub x9, %x[width], #0x1\n"
"cntw x20, ALL, MUL #2\n"
- "ands x10, %x[width], x21\n"
- "add x14, x14, x20\n"
- "csel x10, x10, x23, NE\n"
- "add x9, x13, x11, LSL #3\n"
- "mov x28, #0x0\n"
- "udiv x14, x14, x20\n"
- "add x10, x10, #0x1\n"
+ "ands x28, %x[width], x21\n"
+ "mov x27, %x[in]\n"
+ "add x9, x9, x20\n"
+ "csel x28, x28, x23, NE\n"
+ "add x26, x27, x10, LSL #3\n"
+ "mov x25, #0x0\n"
+ "udiv x9, x9, x20\n"
+ "add x28, x28, #0x1\n"
"mov x20, %x[width]\n"
- "add x27, x9, x11, LSL #3\n"
+ "add x24, x26, x10, LSL #3\n"
"ptrue p0.b\n"
- "mov x26, %x[outptr_raw]\n"
- "mov x25, %x[row_offset]\n"
- "sub x14, x14, #0x1\n"
- "lsr x10, x10, #0x1\n"
+ "mov x23, %x[outptr_raw]\n"
+ "mov x22, %x[row_offset]\n"
+ "sub x9, x9, #0x1\n"
+ "lsr x28, x28, #0x1\n"
"mov x12, #0x0\n"
- ".inst 0x25b44794 // whilelt pn12.s, x28, x20, VLx2\n"
- "add x24, x27, x11, LSL #3\n"
+ ".inst 0x25b44733 // whilelt pn11.s, x25, x20, VLx2\n"
+ "add x21, x24, x10, LSL #3\n"
"1:" // Width loop: Preamble: Loop
- "ldr x23, [x13], #0x8\n"
- ".inst 0x2530704b // psel p11.s, p12.s/Z, p2.s[w12]\n"
- ".inst 0x2530702a // psel p10.s, p12.s/Z, p1.s[w12]\n"
- "ldr x22, [x9], #0x8\n"
- ".inst 0x253071c9 // psel p9.s, p12.s/Z, p14.s[w12]\n"
- ".inst 0x253071a8 // psel p8.s, p12.s/Z, p13.s[w12]\n"
- "ldr x21, [x27], #0x8\n"
+ "ldr x20, [x27], #0x8\n"
+ ".inst 0x25306c28 // psel p8.s, p11.s/Z, p1.s[w12]\n"
+ ".inst 0x25306dca // psel p10.s, p11.s/Z, p14.s[w12]\n"
+ ".inst 0xa0164298 // ld1w { z24.s-z25.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0x25306da9 // psel p9.s, p11.s/Z, p13.s[w12]\n"
+ ".inst 0x25306d88 // psel p8.s, p11.s/Z, p12.s[w12]\n"
+ ".inst 0xa0164a82 // ld1w { z2.s-z3.s }, pn10.s/Z, [x20, x22, LSL #2]\n"
"ldr x20, [x24], #0x8\n"
- ".inst 0xa0194eea // ld1w { z10.s-z11.s }, pn11.s/Z, [x23, x25, LSL #2]\n"
- ".inst 0xa0194ada // ld1w { z26.s-z27.s }, pn10.s/Z, [x22, x25, LSL #2]\n"
- ".inst 0xa01946be // ld1w { z30.s-z31.s }, pn9.s/Z, [x21, x25, LSL #2]\n"
- ".inst 0xa019428c // ld1w { z12.s-z13.s }, pn8.s/Z, [x20, x25, LSL #2]\n"
+ ".inst 0xa016468a // ld1w { z10.s-z11.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e318 // bfcvt z24.h, { z24.s-z25.s }\n"
+ ".inst 0xc160e042 // bfcvt z2.h, { z2.s-z3.s }\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa016428c // ld1w { z12.s-z13.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
".inst 0xc160e14a // bfcvt z10.h, { z10.s-z11.s }\n"
- ".inst 0xc160e35a // bfcvt z26.h, { z26.s-z27.s }\n"
- ".inst 0xc0800140 // mova za0h.s[x12], p0/M, z10.s\n"
- ".inst 0xc160e3de // bfcvt z30.h, { z30.s-z31.s }\n"
- ".inst 0xc0800344 // mova za1h.s[x12], p0/M, z26.s\n"
".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n"
- ".inst 0xc08003c8 // mova za2h.s[x12], p0/M, z30.s\n"
+ ".inst 0xc0800300 // mova za0h.s[x12], p0/M, z24.s\n"
+ ".inst 0xc0800044 // mova za1h.s[x12], p0/M, z2.s\n"
+ ".inst 0xc0800148 // mova za2h.s[x12], p0/M, z10.s\n"
".inst 0xc080018c // mova za3h.s[x12], p0/M, z12.s\n"
"add x12, x12, #0x1\n"
- "cmp x12, x11\n"
+ "cmp x12, x10\n"
"blt 1b\n"
+ "incw x22, ALL, MUL #2\n"
"incw x25, ALL, MUL #2\n"
- "incw x28, ALL, MUL #2\n"
- "cbz x14, 5f\n"
+ "cbz x9, 5f\n"
"2:" // Width loop
"mov x12, #0x0\n"
"3:" // Width loop: Store: Loop
@@ -97,44 +97,44 @@ void interleave_block<4, 2, VLType::SME, false>(
".inst 0xc0828119 // mova z25.s, p0/M, za2v.s[x12]\n"
".inst 0xc082819d // mova z29.s, p0/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
- "cmp x12, x11\n"
- ".inst 0xa160df51 // st1w { z17.s, z21.s, z25.s, z29.s }, pn15.b, [x26]\n"
- "addvl x26, x26, #4\n"
+ "cmp x12, x10\n"
+ ".inst 0xa160def1 // st1w { z17.s, z21.s, z25.s, z29.s }, pn15.b, [x23]\n"
+ "addvl x23, x23, #4\n"
"blt 3b\n"
- "mov x13, %x[in]\n"
+ "mov x27, %x[in]\n"
+ "add x26, x27, x10, LSL #3\n"
"mov x20, %x[width]\n"
- "add x9, x13, x11, LSL #3\n"
+ "add x24, x26, x10, LSL #3\n"
"mov x12, #0x0\n"
- "add x27, x9, x11, LSL #3\n"
- ".inst 0x25b44794 // whilelt pn12.s, x28, x20, VLx2\n"
- "add x24, x27, x11, LSL #3\n"
+ ".inst 0x25b44733 // whilelt pn11.s, x25, x20, VLx2\n"
+ "add x21, x24, x10, LSL #3\n"
"4:" // Width loop: Load: Loop
- "ldr x23, [x13], #0x8\n"
- ".inst 0x2530704b // psel p11.s, p12.s/Z, p2.s[w12]\n"
- ".inst 0x2530702a // psel p10.s, p12.s/Z, p1.s[w12]\n"
- "ldr x22, [x9], #0x8\n"
- ".inst 0x253071c9 // psel p9.s, p12.s/Z, p14.s[w12]\n"
- ".inst 0x253071a8 // psel p8.s, p12.s/Z, p13.s[w12]\n"
- "ldr x21, [x27], #0x8\n"
+ "ldr x20, [x27], #0x8\n"
+ ".inst 0x25306c28 // psel p8.s, p11.s/Z, p1.s[w12]\n"
+ ".inst 0x25306dca // psel p10.s, p11.s/Z, p14.s[w12]\n"
+ ".inst 0xa016428c // ld1w { z12.s-z13.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0x25306da9 // psel p9.s, p11.s/Z, p13.s[w12]\n"
+ ".inst 0x25306d88 // psel p8.s, p11.s/Z, p12.s[w12]\n"
+ ".inst 0xa0164a8e // ld1w { z14.s-z15.s }, pn10.s/Z, [x20, x22, LSL #2]\n"
"ldr x20, [x24], #0x8\n"
- ".inst 0xa0194eec // ld1w { z12.s-z13.s }, pn11.s/Z, [x23, x25, LSL #2]\n"
- ".inst 0xa0194ace // ld1w { z14.s-z15.s }, pn10.s/Z, [x22, x25, LSL #2]\n"
- ".inst 0xa01946b2 // ld1w { z18.s-z19.s }, pn9.s/Z, [x21, x25, LSL #2]\n"
- ".inst 0xa019429e // ld1w { z30.s-z31.s }, pn8.s/Z, [x20, x25, LSL #2]\n"
+ ".inst 0xa0164692 // ld1w { z18.s-z19.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n"
".inst 0xc160e1ce // bfcvt z14.h, { z14.s-z15.s }\n"
- ".inst 0xc0800180 // mova za0h.s[x12], p0/M, z12.s\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa016429e // ld1w { z30.s-z31.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
".inst 0xc160e252 // bfcvt z18.h, { z18.s-z19.s }\n"
- ".inst 0xc08001c4 // mova za1h.s[x12], p0/M, z14.s\n"
".inst 0xc160e3de // bfcvt z30.h, { z30.s-z31.s }\n"
+ ".inst 0xc0800180 // mova za0h.s[x12], p0/M, z12.s\n"
+ ".inst 0xc08001c4 // mova za1h.s[x12], p0/M, z14.s\n"
".inst 0xc0800248 // mova za2h.s[x12], p0/M, z18.s\n"
".inst 0xc08003cc // mova za3h.s[x12], p0/M, z30.s\n"
"add x12, x12, #0x1\n"
- "cmp x12, x11\n"
+ "cmp x12, x10\n"
"blt 4b\n"
- "subs x14, x14, #0x1\n"
+ "subs x9, x9, #0x1\n"
+ "incw x22, ALL, MUL #2\n"
"incw x25, ALL, MUL #2\n"
- "incw x28, ALL, MUL #2\n"
"bgt 2b\n"
"5:" // Width loop: Tails
"mov x12, #0x0\n"
@@ -144,16 +144,16 @@ void interleave_block<4, 2, VLType::SME, false>(
".inst 0xc0828119 // mova z25.s, p0/M, za2v.s[x12]\n"
".inst 0xc082819d // mova z29.s, p0/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
- "cmp x12, x10\n"
- ".inst 0xa160df51 // st1w { z17.s, z21.s, z25.s, z29.s }, pn15.b, [x26]\n"
- "addvl x26, x26, #4\n"
+ "cmp x12, x28\n"
+ ".inst 0xa160def1 // st1w { z17.s, z21.s, z25.s, z29.s }, pn15.b, [x23]\n"
+ "addvl x23, x23, #4\n"
"blt 6b\n"
"7:" // End
- "mov %x[outptr_raw], x26\n"
+ "mov %x[outptr_raw], x23\n"
".inst 0xd503467f // SMSTOP\n"
: [outptr_raw] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
index bc9f68ed72..45f660fec1 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
@@ -33,25 +33,25 @@ void interleave_block<1, 1, VLType::SME, false>(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"mov x21, %x[width]\n"
- "mov x20, %x[width]\n"
"inch x21\n"
"cnth x11\n"
"sub x21, x21, #0x1\n"
- "sub x10, x11, #0x1\n"
"udiv x21, x21, x11\n" // n_passes = ceildiv(width, VL<T>)
- "ands x10, x20, x10\n"
+ "mov x20, %x[width]\n"
+ "sub x10, x11, #0x1\n"
"sub x9, x21, #0x1\n"
+ "ands x10, x20, x10\n"
"sub x28, x11, #0x2\n"
"lsl x20, %x[height], #0x1\n" // height * 2
"mov x27, #0x0\n"
"mov x26, %x[in]\n"
"lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x25, [x26, #0x0]\n"
+ "and x24, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"csel x10, x10, x11, NE\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x23, [x26, #0x8]\n"
"ptrue p11.h\n"
"whilelt p10.h, XZR, x20\n"
- "ldr x23, [x26, #0x8]\n"
"mov x22, %x[row_offset]\n"
"mov x21, %x[out]\n"
"whilelt p9.h, x27, %x[width]\n"
@@ -60,119 +60,119 @@ void interleave_block<1, 1, VLType::SME, false>(
"mov x12, #0x0\n"
"cbz x28, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25286141 // psel p1.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n"
- ".inst 0xe0560700 // ld1h { za0h.h[x12] }, p1/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x25, [x26, #0x0]\n"
".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n"
"add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "cmp x12, x28\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25286141 // psel p1.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n"
"mov x26, %x[in]\n"
- "inch x27\n"
- ".inst 0xe0560700 // ld1h { za0h.h[x12] }, p1/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "inch x22\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "inch x22\n"
+ "inch x27\n"
"cbz x9, 8f\n"
"mov x20, x9\n"
"3:" // K loop: Main loop
"whilelt p8.h, x27, %x[width]\n"
- "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
"cbz x28, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x25296143 // psel p3.h, p8.h/Z, p10.h[w13]\n"
- ".inst 0x25396142 // psel p2.h, p8.h/Z, p10.h[w13, #1]\n"
- ".inst 0x25296d21 // psel p1.h, p11.h/Z, p9.h[w13]\n"
- ".inst 0x25396d20 // psel p0.h, p11.h/Z, p9.h[w13, #1]\n"
- ".inst 0xe0562f08 // ld1h { za1h.h[x13] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0562ae9 // ld1h { za1h.h[x13, #1] }, p2/Z, [x23, x22, LSL #1]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
"ldr x23, [x26, #0x8]\n"
- ".inst 0xe07fa6a0 // st1h { za0v.h[x13] }, p1/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
"add x26, x26, #0x10\n"
- ".inst 0xe06ba2a1 // st1h { za0v.h[x13, #1] }, p0/Z, [x21, x11, LSL #1]\n"
- "add x13, x13, #0x2\n"
"addvl x21, x21, #2\n"
- "cmp x13, x28\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x25296143 // psel p3.h, p8.h/Z, p10.h[w13]\n"
- ".inst 0x25396142 // psel p2.h, p8.h/Z, p10.h[w13, #1]\n"
- ".inst 0x25296d21 // psel p1.h, p11.h/Z, p9.h[w13]\n"
- ".inst 0x25396d20 // psel p0.h, p11.h/Z, p9.h[w13, #1]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
"whilelt p9.h, x27, %x[width]\n"
- ".inst 0xe0562f08 // ld1h { za1h.h[x13] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
"inch x27\n"
- "mov x12, #0x0\n"
- ".inst 0xe0562ae9 // ld1h { za1h.h[x13, #1] }, p2/Z, [x23, x22, LSL #1]\n"
- "ldr x23, [x26, #0x8]\n"
- ".inst 0xe07fa6a0 // st1h { za0v.h[x13] }, p1/Z, [x21, XZR, LSL #1]\n"
"add x26, x26, #0x10\n"
- ".inst 0xe06ba2a1 // st1h { za0v.h[x13, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
"addvl x21, x21, #2\n"
"inch x22\n"
"whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
"cbz x28, 7f\n"
"6:" // K loop: Main loop: Second: Loop
- ".inst 0x25286143 // psel p3.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0x25386142 // psel p2.h, p8.h/Z, p10.h[w12, #1]\n"
- ".inst 0x25286d21 // psel p1.h, p11.h/Z, p9.h[w12]\n"
- ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
- ".inst 0xe0560f00 // ld1h { za0h.h[x12] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0560ae1 // ld1h { za0h.h[x12, #1] }, p2/Z, [x23, x22, LSL #1]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
"ldr x23, [x26, #0x8]\n"
- ".inst 0xe07f86a8 // st1h { za1v.h[x12] }, p1/Z, [x21, XZR, LSL #1]\n"
- "add x26, x26, #0x10\n"
+ ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
"add x12, x12, #0x2\n"
- "addvl x21, x21, #2\n"
"cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
- ".inst 0x25286143 // psel p3.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0x25386142 // psel p2.h, p8.h/Z, p10.h[w12, #1]\n"
- ".inst 0x25286d21 // psel p1.h, p11.h/Z, p9.h[w12]\n"
- ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
"whilelt p9.h, x27, %x[width]\n"
- ".inst 0xe0560f00 // ld1h { za0h.h[x12] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
"subs x20, x20, #0x1\n"
- "inch x27\n"
- ".inst 0xe0560ae1 // ld1h { za0h.h[x12, #1] }, p2/Z, [x23, x22, LSL #1]\n"
- "ldr x23, [x26, #0x8]\n"
- ".inst 0xe07f86a8 // st1h { za1v.h[x12] }, p1/Z, [x21, XZR, LSL #1]\n"
"add x26, x26, #0x10\n"
".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
"addvl x21, x21, #2\n"
+ "inch x27\n"
"inch x22\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x25, 11f\n"
+ "cbnz x24, 11f\n"
"mov x26, %x[in]\n"
"whilelt p8.h, x27, %x[width]\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25286d21 // psel p1.h, p11.h/Z, p9.h[w12]\n"
- ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0xe07f86a0 // st1h { za0v.h[x12] }, p1/Z, [x21, XZR, LSL #1]\n"
- "addvl x21, x21, #1\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
"ldr x20, [x26, #0x0]\n"
- "add x26, x26, #0x8\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
".inst 0xe0560288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]\n"
"add x12, x12, #0x1\n"
"cmp x12, x11\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
"blt 9b\n"
"whilelt p9.h, x27, %x[width]\n"
"whilelt p8.h, x27, %x[width]\n"
@@ -181,8 +181,8 @@ void interleave_block<1, 1, VLType::SME, false>(
".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
"blt 10b\n"
"whilelt p8.h, x27, %x[width]\n"
"b 13f\n"
@@ -192,15 +192,15 @@ void interleave_block<1, 1, VLType::SME, false>(
".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
"blt 12b\n"
"13:" // K loop: End
"mov %x[out], x21\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
index 793bc80524..ce7192afe6 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
@@ -32,28 +32,28 @@ void interleave_block<1, 2, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x22, %x[width]\n"
+ "cnth x22\n"
"mov x21, %x[width]\n"
- "cnth x20\n"
- "inch x22\n"
- "sub x11, x20, #0x1\n"
- "sub x22, x22, #0x1\n"
- "ands x11, x21, x11\n"
+ "inch x21\n"
+ "mov x20, %x[width]\n"
+ "sub x11, x22, #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "ands x11, x20, x11\n"
"cntw x10\n"
- "udiv x22, x22, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x11, x11, x20, NE\n"
- "sub x9, x22, #0x1\n"
+ "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x11, x11, x22, NE\n"
+ "sub x9, x21, #0x1\n"
"add x11, x11, #0x1\n"
"sub x28, x10, #0x2\n"
"lsl x20, %x[height], #0x1\n" // height * 2
"mov x27, #0x0\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
"lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x25, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
- "ldr x24, [x26, #0x0]\n"
+ "and x24, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x23, [x26, #0x8]\n"
"lsr x11, x11, #0x1\n"
"ptrue p11.s\n"
- "ldr x23, [x26, #0x8]\n"
"whilelt p10.h, XZR, x20\n"
"mov x22, %x[row_offset]\n"
"mov x21, %x[out]\n"
@@ -63,124 +63,124 @@ void interleave_block<1, 2, VLType::SME, false>(
"mov x12, #0x0\n"
"cbz x28, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25286141 // psel p1.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n"
- ".inst 0xe0560700 // ld1h { za0h.h[x12] }, p1/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x25, [x26, #0x0]\n"
".inst 0xe05602e2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
"add x12, x12, #0x4\n"
+ "cmp x12, x28, LSL #1\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "cmp x12, x28, LSL #1\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25286141 // psel p1.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n"
"mov x26, %x[in]\n"
- "inch x27\n"
- ".inst 0xe0560700 // ld1h { za0h.h[x12] }, p1/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
".inst 0xe05602e2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "inch x22\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "inch x22\n"
+ "inch x27\n"
"cbz x9, 8f\n"
"mov x20, x9\n"
"3:" // K loop: Main loop
"whilelt p8.h, x27, %x[width]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- "mov x14, #0x0\n"
"cbz x28, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x25386143 // psel p3.h, p8.h/Z, p10.h[w12, #1]\n"
- ".inst 0x25786142 // psel p2.h, p8.h/Z, p10.h[w12, #3]\n"
- ".inst 0x252a6d21 // psel p1.h, p11.h/Z, p9.h[w14]\n"
- ".inst 0x253a6d20 // psel p0.h, p11.h/Z, p9.h[w14, #1]\n"
- ".inst 0xe0560f01 // ld1h { za0h.h[x12, #1] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0560ae3 // ld1h { za0h.h[x12, #3] }, p2/Z, [x23, x22, LSL #1]\n"
+ ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
+ ".inst 0xe0562321 // ld1h { za0h.h[x13, #1] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25796141 // psel p1.h, p8.h/Z, p10.h[w13, #3]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe05626e3 // ld1h { za0h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n"
"ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
"add x26, x26, #0x10\n"
- "add x12, x12, #0x4\n"
- ".inst 0xe0bfc6a0 // st1w { za0v.s[x14] }, p1/Z, [x21, XZR, LSL #2]\n"
- ".inst 0xe0aac2a1 // st1w { za0v.s[x14, #1] }, p0/Z, [x21, x10, LSL #2]\n"
- "add x14, x14, #0x2\n"
"addvl x21, x21, #2\n"
- "cmp x14, x28\n"
+ "add x13, x13, #0x4\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x25386143 // psel p3.h, p8.h/Z, p10.h[w12, #1]\n"
- ".inst 0x25786142 // psel p2.h, p8.h/Z, p10.h[w12, #3]\n"
- ".inst 0x252a6d21 // psel p1.h, p11.h/Z, p9.h[w14]\n"
- ".inst 0x253a6d20 // psel p0.h, p11.h/Z, p9.h[w14, #1]\n"
+ ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
+ ".inst 0xe0562321 // ld1h { za0h.h[x13, #1] }, p0/Z, [x25, x22, LSL #1]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25796141 // psel p1.h, p8.h/Z, p10.h[w13, #3]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe05626e3 // ld1h { za0h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"whilelt p9.h, x27, %x[width]\n"
- ".inst 0xe0560f01 // ld1h { za0h.h[x12, #1] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
"inch x27\n"
- "mov x13, #0x0\n"
- ".inst 0xe0560ae3 // ld1h { za0h.h[x12, #3] }, p2/Z, [x23, x22, LSL #1]\n"
- "ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
+ ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "addvl x21, x21, #2\n"
"inch x22\n"
- ".inst 0xe0bfc6a0 // st1w { za0v.s[x14] }, p1/Z, [x21, XZR, LSL #2]\n"
"whilelt p8.h, x27, %x[width]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- ".inst 0xe0aac2a1 // st1w { za0v.s[x14, #1] }, p0/Z, [x21, x10, LSL #2]\n"
- "addvl x21, x21, #2\n"
"cbz x28, 7f\n"
"6:" // K loop: Main loop: Second: Loop
- ".inst 0x25296143 // psel p3.h, p8.h/Z, p10.h[w13]\n"
- ".inst 0x25696142 // psel p2.h, p8.h/Z, p10.h[w13, #2]\n"
- ".inst 0x25286d21 // psel p1.h, p11.h/Z, p9.h[w12]\n"
- ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
- ".inst 0xe0562f00 // ld1h { za0h.h[x13] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0562ae2 // ld1h { za0h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n"
+ ".inst 0x25296140 // psel p0.h, p8.h/Z, p10.h[w13]\n"
+ ".inst 0xe0562320 // ld1h { za0h.h[x13] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25696141 // psel p1.h, p8.h/Z, p10.h[w13, #2]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe05626e2 // ld1h { za0h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n"
"ldr x23, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- "add x13, x13, #0x4\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
"add x12, x12, #0x2\n"
- "addvl x21, x21, #2\n"
"cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x4\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
- ".inst 0x25296143 // psel p3.h, p8.h/Z, p10.h[w13]\n"
- ".inst 0x25696142 // psel p2.h, p8.h/Z, p10.h[w13, #2]\n"
- ".inst 0x25286d21 // psel p1.h, p11.h/Z, p9.h[w12]\n"
- ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ ".inst 0x25296140 // psel p0.h, p8.h/Z, p10.h[w13]\n"
+ ".inst 0xe0562320 // ld1h { za0h.h[x13] }, p0/Z, [x25, x22, LSL #1]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25696141 // psel p1.h, p8.h/Z, p10.h[w13, #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe05626e2 // ld1h { za0h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"whilelt p9.h, x27, %x[width]\n"
- ".inst 0xe0562f00 // ld1h { za0h.h[x13] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
"subs x20, x20, #0x1\n"
- "inch x27\n"
- ".inst 0xe0562ae2 // ld1h { za0h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n"
- "ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "inch x22\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
"addvl x21, x21, #2\n"
+ "inch x27\n"
+ "inch x22\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x25, 11f\n"
+ "cbnz x24, 11f\n"
"mov x26, %x[in]\n"
"whilelt p8.h, x27, %x[width]\n"
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
- ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
- "add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"ldr x20, [x26, #0x0]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
"cmp x12, x10\n"
- "add x26, x26, #0x8\n"
".inst 0xe0562281 // ld1h { za0h.h[x13, #1] }, p0/Z, [x20, x22, LSL #1]\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
"add x13, x13, #0x2\n"
"blt 9b\n"
"whilelt p9.h, x27, %x[width]\n"
@@ -189,11 +189,11 @@ void interleave_block<1, 2, VLType::SME, false>(
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- "add x20, x20, #0x2\n"
".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x11\n"
+ "addvl x21, x21, #1\n"
+ "add x20, x20, #0x2\n"
"blt 10b\n"
"whilelt p8.h, x27, %x[width]\n"
"b 13f\n"
@@ -203,15 +203,15 @@ void interleave_block<1, 2, VLType::SME, false>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x11\n"
+ "addvl x21, x21, #1\n"
"blt 12b\n"
"13:" // K loop: End
"mov %x[out], x21\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_fp16_fp16.hpp
index 0e0e4e462c..982cfa6d40 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_fp16_fp16.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(__ARM_FEATURE_SVE)
template <>
void interleave_block<1, 2, VLType::SME, false>(
@@ -63,11 +63,11 @@ void interleave_block<1, 2, VLType::SME, false>(
"mov x12, #0x0\n"
"cbz x28, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25286141 // psel p1.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n"
- ".inst 0xe0560700 // ld1h { za0h.h[x12] }, p1/Z, [x24, x22, LSL #1]\n"
+ ".inst 0x25286143 // psel p3.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0x25686142 // psel p2.h, p8.h/Z, p10.h[w12, #2]\n"
+ ".inst 0xe0560f00 // ld1h { za0h.h[x12] }, p3/Z, [x24, x22, LSL #1]\n"
"ldr x24, [x26, #0x0]\n"
- ".inst 0xe05602e2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+ ".inst 0xe0560ae2 // ld1h { za0h.h[x12, #2] }, p2/Z, [x23, x22, LSL #1]\n"
"add x12, x12, #0x4\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
@@ -88,20 +88,20 @@ void interleave_block<1, 2, VLType::SME, false>(
"mov x20, x9\n"
"3:" // K loop: Main loop
"whilelt p8.h, x27, %x[width]\n"
- "mov x12, #0x0\n"
+ "mov x15, #0x0\n"
"mov x14, #0x0\n"
"cbz x28, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x25386143 // psel p3.h, p8.h/Z, p10.h[w12, #1]\n"
- ".inst 0x25786142 // psel p2.h, p8.h/Z, p10.h[w12, #3]\n"
+ ".inst 0x253b6143 // psel p3.h, p8.h/Z, p10.h[w15, #1]\n"
+ ".inst 0x257b6142 // psel p2.h, p8.h/Z, p10.h[w15, #3]\n"
".inst 0x252a6d21 // psel p1.h, p11.h/Z, p9.h[w14]\n"
".inst 0x253a6d20 // psel p0.h, p11.h/Z, p9.h[w14, #1]\n"
- ".inst 0xe0560f01 // ld1h { za0h.h[x12, #1] }, p3/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0566f01 // ld1h { za0h.h[x15, #1] }, p3/Z, [x24, x22, LSL #1]\n"
"ldr x24, [x26, #0x0]\n"
- ".inst 0xe0560ae3 // ld1h { za0h.h[x12, #3] }, p2/Z, [x23, x22, LSL #1]\n"
+ ".inst 0xe0566ae3 // ld1h { za0h.h[x15, #3] }, p2/Z, [x23, x22, LSL #1]\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "add x12, x12, #0x4\n"
+ "add x15, x15, #0x4\n"
".inst 0xe0bfc6a0 // st1w { za0v.s[x14] }, p1/Z, [x21, XZR, LSL #2]\n"
".inst 0xe0aac2a1 // st1w { za0v.s[x14, #1] }, p0/Z, [x21, x10, LSL #2]\n"
"add x14, x14, #0x2\n"
@@ -109,23 +109,23 @@ void interleave_block<1, 2, VLType::SME, false>(
"cmp x14, x28\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x25386143 // psel p3.h, p8.h/Z, p10.h[w12, #1]\n"
- ".inst 0x25786142 // psel p2.h, p8.h/Z, p10.h[w12, #3]\n"
+ ".inst 0x253b6143 // psel p3.h, p8.h/Z, p10.h[w15, #1]\n"
+ ".inst 0x257b6142 // psel p2.h, p8.h/Z, p10.h[w15, #3]\n"
".inst 0x252a6d21 // psel p1.h, p11.h/Z, p9.h[w14]\n"
".inst 0x253a6d20 // psel p0.h, p11.h/Z, p9.h[w14, #1]\n"
"mov x26, %x[in]\n"
"whilelt p9.h, x27, %x[width]\n"
- ".inst 0xe0560f01 // ld1h { za0h.h[x12, #1] }, p3/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0566f01 // ld1h { za0h.h[x15, #1] }, p3/Z, [x24, x22, LSL #1]\n"
"ldr x24, [x26, #0x0]\n"
"inch x27\n"
"mov x13, #0x0\n"
- ".inst 0xe0560ae3 // ld1h { za0h.h[x12, #3] }, p2/Z, [x23, x22, LSL #1]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xe0566ae3 // ld1h { za0h.h[x15, #3] }, p2/Z, [x23, x22, LSL #1]\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
"inch x22\n"
".inst 0xe0bfc6a0 // st1w { za0v.s[x14] }, p1/Z, [x21, XZR, LSL #2]\n"
- "whilelt p8.h, x27, %x[width]\n"
- "mov x12, #0x0\n"
".inst 0xe0aac2a1 // st1w { za0v.s[x14, #1] }, p0/Z, [x21, x10, LSL #2]\n"
"addvl x21, x21, #2\n"
"cbz x28, 7f\n"
@@ -172,15 +172,15 @@ void interleave_block<1, 2, VLType::SME, false>(
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
- ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25396143 // psel p3.h, p8.h/Z, p10.h[w13, #1]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x24, [x26, #0x0]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
- "ldr x20, [x26, #0x0]\n"
- "cmp x12, x10\n"
"add x26, x26, #0x8\n"
- ".inst 0xe0562281 // ld1h { za0h.h[x13, #1] }, p0/Z, [x20, x22, LSL #1]\n"
+ "cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ ".inst 0xe0562f01 // ld1h { za0h.h[x13, #1] }, p3/Z, [x24, x22, LSL #1]\n"
"add x13, x13, #0x2\n"
"blt 9b\n"
"whilelt p9.h, x27, %x[width]\n"
@@ -195,7 +195,7 @@ void interleave_block<1, 2, VLType::SME, false>(
"addvl x21, x21, #1\n"
"cmp x12, x11\n"
"blt 10b\n"
- "whilelt p8.h, x27, %x[width]\n"
+ "whilelt p9.h, x27, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -203,16 +203,16 @@ void interleave_block<1, 2, VLType::SME, false>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x11\n"
+ "addvl x21, x21, #1\n"
"blt 12b\n"
"13:" // K loop: End
"mov %x[out], x21\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
-#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
index 558a5d7637..79cd668a84 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
@@ -32,16 +32,16 @@ void interleave_block<1, 4, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
"mov x23, %x[width]\n"
- "mov x21, %x[width]\n"
- "cntb x20\n"
"incb x23\n"
- "sub x10, x20, #0x1\n"
+ "mov x20, %x[width]\n"
+ "sub x10, x21, #0x1\n"
"cntw x9\n"
"sub x23, x23, #0x1\n"
- "ands x10, x21, x10\n"
- "udiv x23, x23, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x10, x10, x20, NE\n"
+ "ands x10, x20, x10\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x10, x10, x21, NE\n"
"lsl x22, %x[height], #0x1\n" // height * 2
"lsl x21, x9, #0x1\n"
"sub x20, x23, #0x1\n"
@@ -52,12 +52,12 @@ void interleave_block<1, 4, VLType::SME, false>(
"mov x27, #0x0\n"
"mov x26, %x[in]\n"
"lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x25, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x25, [x26, #0x0]\n"
+ "and x24, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"lsr x10, x10, #0x2\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x23, [x26, #0x8]\n"
"ptrue p11.s\n"
"zip1 p10.b, p9.b, p8.b\n"
- "ldr x23, [x26, #0x8]\n"
"mov x22, %x[row_offset]\n"
"mov x21, %x[out]\n"
"whilelt p9.b, x27, %x[width]\n"
@@ -66,124 +66,124 @@ void interleave_block<1, 4, VLType::SME, false>(
"mov x12, #0x0\n"
"cbz x28, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25246141 // psel p1.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n"
".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
- ".inst 0xe0160700 // ld1b { za0h.b[x12] }, p1/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x25, [x26, #0x0]\n"
".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n"
"add x12, x12, #0x8\n"
+ "cmp x12, x28, LSL #2\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "cmp x12, x28, LSL #2\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25246141 // psel p1.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n"
".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
"mov x26, %x[in]\n"
- "incb x27\n"
- ".inst 0xe0160700 // ld1b { za0h.b[x12] }, p1/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "incb x22\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "incb x22\n"
+ "incb x27\n"
"cbz x20, 8f\n"
"mov x20, x20\n"
"3:" // K loop: Main loop
"whilelt p8.b, x27, %x[width]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- "mov x14, #0x0\n"
"cbz x28, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x25346143 // psel p3.b, p8.b/Z, p10.b[w12, #2]\n"
- ".inst 0x25746142 // psel p2.b, p8.b/Z, p10.b[w12, #6]\n"
- ".inst 0x25266d21 // psel p1.b, p11.b/Z, p9.b[w14]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
- ".inst 0xe0160f02 // ld1b { za0h.b[x12, #2] }, p3/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0160ae6 // ld1b { za0h.b[x12, #6] }, p2/Z, [x23, x22]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n"
"ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
"add x26, x26, #0x10\n"
- "add x12, x12, #0x8\n"
- ".inst 0xe0bfc6a0 // st1w { za0v.s[x14] }, p1/Z, [x21, XZR, LSL #2]\n"
- ".inst 0xe0a9c2a1 // st1w { za0v.s[x14, #1] }, p0/Z, [x21, x9, LSL #2]\n"
- "add x14, x14, #0x2\n"
"addvl x21, x21, #2\n"
- "cmp x14, x28\n"
+ "add x13, x13, #0x8\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x25346143 // psel p3.b, p8.b/Z, p10.b[w12, #2]\n"
- ".inst 0x25746142 // psel p2.b, p8.b/Z, p10.b[w12, #6]\n"
- ".inst 0x25266d21 // psel p1.b, p11.b/Z, p9.b[w14]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"whilelt p9.b, x27, %x[width]\n"
- ".inst 0xe0160f02 // ld1b { za0h.b[x12, #2] }, p3/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
"incb x27\n"
- "mov x13, #0x0\n"
- ".inst 0xe0160ae6 // ld1b { za0h.b[x12, #6] }, p2/Z, [x23, x22]\n"
- "ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
+ ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "addvl x21, x21, #2\n"
"incb x22\n"
- ".inst 0xe0bfc6a0 // st1w { za0v.s[x14] }, p1/Z, [x21, XZR, LSL #2]\n"
"whilelt p8.b, x27, %x[width]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- ".inst 0xe0a9c2a1 // st1w { za0v.s[x14, #1] }, p0/Z, [x21, x9, LSL #2]\n"
- "addvl x21, x21, #2\n"
"cbz x28, 7f\n"
"6:" // K loop: Main loop: Second: Loop
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x25656142 // psel p2.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x25246d21 // psel p1.b, p11.b/Z, p9.b[w12]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
- ".inst 0xe0162f00 // ld1b { za0h.b[x13] }, p3/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0162ae4 // ld1b { za0h.b[x13, #4] }, p2/Z, [x23, x22]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n"
"ldr x23, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- "add x13, x13, #0x8\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
"add x12, x12, #0x2\n"
- "addvl x21, x21, #2\n"
"cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x8\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x25656142 // psel p2.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x25246d21 // psel p1.b, p11.b/Z, p9.b[w12]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"whilelt p9.b, x27, %x[width]\n"
- ".inst 0xe0162f00 // ld1b { za0h.b[x13] }, p3/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
"subs x20, x20, #0x1\n"
- "incb x27\n"
- ".inst 0xe0162ae4 // ld1b { za0h.b[x13, #4] }, p2/Z, [x23, x22]\n"
- "ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "incb x22\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
"addvl x21, x21, #2\n"
+ "incb x27\n"
+ "incb x22\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x25, 11f\n"
+ "cbnz x24, 11f\n"
"mov x26, %x[in]\n"
"whilelt p8.b, x27, %x[width]\n"
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
- "add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"ldr x20, [x26, #0x0]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
"cmp x12, x9\n"
- "add x26, x26, #0x8\n"
".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
"add x13, x13, #0x4\n"
"blt 9b\n"
"whilelt p9.b, x27, %x[width]\n"
@@ -192,11 +192,11 @@ void interleave_block<1, 4, VLType::SME, false>(
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- "add x20, x20, #0x4\n"
".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ "add x20, x20, #0x4\n"
"blt 10b\n"
"whilelt p8.b, x27, %x[width]\n"
"b 13f\n"
@@ -206,15 +206,15 @@ void interleave_block<1, 4, VLType::SME, false>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
"blt 12b\n"
"13:" // K loop: End
"mov %x[out], x21\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
index ba8be81ade..fe98bd86b5 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
@@ -32,18 +32,18 @@ void interleave_block<1, 4, VLType::SME, true>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
"mov x23, %x[width]\n"
- "mov x21, %x[width]\n"
- "cntb x20\n"
- "incb x23\n"
"mov z18.b, #0x1\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
"mov z17.s, #0x0\n"
- "sub x10, x20, #0x1\n"
+ "sub x10, x21, #0x1\n"
"cntw x9\n"
"sub x23, x23, #0x1\n"
- "ands x10, x21, x10\n"
- "udiv x23, x23, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x10, x10, x20, NE\n"
+ "ands x10, x20, x10\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x10, x10, x21, NE\n"
"lsl x22, %x[height], #0x1\n" // height * 2
"lsl x21, x9, #0x1\n"
"sub x20, x23, #0x1\n"
@@ -51,7 +51,7 @@ void interleave_block<1, 4, VLType::SME, true>(
"whilelt p9.b, XZR, x22\n"
"whilelt p8.b, x21, x22\n"
"mov x28, #0x0\n"
- "ptrue p4.b\n"
+ "ptrue p2.b\n"
"lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
"and x27, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"lsr x10, x10, #0x2\n"
@@ -64,132 +64,132 @@ void interleave_block<1, 4, VLType::SME, true>(
"whilelt p8.b, x28, %x[width]\n"
"cbnz %x[first], 1f\n"
"addvl x24, x24, #-1\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
+ "ld1w { z17.s }, p2/Z, [x24]\n"
"1:" // K loop: Load row sums: End
"mov x23, %x[in]\n"
- "mov x12, #0x0\n"
"ldr x22, [x23, #0x0]\n"
+ "mov x12, #0x0\n"
"ldr x21, [x23, #0x8]\n"
"add x23, x23, #0x10\n"
"cbz x26, 3f\n"
"2:" // K loop: Charge: Loop
- ".inst 0x25246141 // psel p1.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n"
".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
- ".inst 0xe01906c0 // ld1b { za0h.b[x12] }, p1/Z, [x22, x25]\n"
"ldr x22, [x23, #0x0]\n"
".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n"
"add x12, x12, #0x8\n"
+ "cmp x12, x26, LSL #2\n"
"ldr x21, [x23, #0x8]\n"
"add x23, x23, #0x10\n"
- "cmp x12, x26, LSL #2\n"
"blt 2b\n"
"3:" // K loop: Charge: End
- ".inst 0x25246141 // psel p1.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n"
".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
"mov x23, %x[in]\n"
- "incb x28\n"
- ".inst 0xe01906c0 // ld1b { za0h.b[x12] }, p1/Z, [x22, x25]\n"
- "ldr x22, [x23, #0x0]\n"
".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n"
+ "ldr x22, [x23, #0x0]\n"
+ "incb x25\n"
"ldr x21, [x23, #0x8]\n"
"add x23, x23, #0x10\n"
- "incb x25\n"
+ "incb x28\n"
"cbz x20, 9f\n"
"mov x20, x20\n"
"4:" // K loop: Main loop
"whilelt p8.b, x28, %x[width]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- "mov x14, #0x0\n"
"cbz x26, 6f\n"
"5:" // K loop: Main loop: First: Loop
- ".inst 0x25346143 // psel p3.b, p8.b/Z, p10.b[w12, #2]\n"
- ".inst 0x25746142 // psel p2.b, p8.b/Z, p10.b[w12, #6]\n"
- ".inst 0x25266d21 // psel p1.b, p11.b/Z, p9.b[w14]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
- ".inst 0xe0190ec2 // ld1b { za0h.b[x12, #2] }, p3/Z, [x22, x25]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
"ldr x22, [x23, #0x0]\n"
- ".inst 0xe0190aa6 // ld1b { za0h.b[x12, #6] }, p2/Z, [x21, x25]\n"
+ ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ "sdot z17.s, z16.b, z18.b\n"
"ldr x21, [x23, #0x8]\n"
- "add x23, x23, #0x10\n"
- "add x12, x12, #0x8\n"
- ".inst 0xc082d010 // mova z16.s, p4/M, za0v.s[x14]\n"
- ".inst 0xe0bfc700 // st1w { za0v.s[x14] }, p1/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n"
+ ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x26\n"
"sdot z17.s, z16.b, z18.b\n"
- ".inst 0xe0a9c301 // st1w { za0v.s[x14, #1] }, p0/Z, [x24, x9, LSL #2]\n"
- ".inst 0xc082d030 // mova z16.s, p4/M, za0v.s[x14, #1]\n"
- "add x14, x14, #0x2\n"
- "cmp x14, x26\n"
+ "add x23, x23, #0x10\n"
"addvl x24, x24, #2\n"
- "sdot z17.s, z16.b, z18.b\n"
+ "add x13, x13, #0x8\n"
"blt 5b\n"
"6:" // K loop: Main loop: First: Tail
- ".inst 0x25346143 // psel p3.b, p8.b/Z, p10.b[w12, #2]\n"
- ".inst 0x25746142 // psel p2.b, p8.b/Z, p10.b[w12, #6]\n"
- ".inst 0x25266d21 // psel p1.b, p11.b/Z, p9.b[w14]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ "sdot z17.s, z16.b, z18.b\n"
"mov x23, %x[in]\n"
- "whilelt p9.b, x28, %x[width]\n"
- ".inst 0xe0190ec2 // ld1b { za0h.b[x12, #2] }, p3/Z, [x22, x25]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x23, #0x0]\n"
- "incb x28\n"
- "mov x13, #0x0\n"
- ".inst 0xe0190aa6 // ld1b { za0h.b[x12, #6] }, p2/Z, [x21, x25]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n"
"ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8700 // st1w { za0v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "incb x28\n"
"add x23, x23, #0x10\n"
+ ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #2\n"
"incb x25\n"
- ".inst 0xc082d010 // mova z16.s, p4/M, za0v.s[x14]\n"
- ".inst 0xe0bfc700 // st1w { za0v.s[x14] }, p1/Z, [x24, XZR, LSL #2]\n"
"whilelt p8.b, x28, %x[width]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- "sdot z17.s, z16.b, z18.b\n"
- ".inst 0xc082d030 // mova z16.s, p4/M, za0v.s[x14, #1]\n"
- ".inst 0xe0a9c301 // st1w { za0v.s[x14, #1] }, p0/Z, [x24, x9, LSL #2]\n"
- "addvl x24, x24, #2\n"
- "sdot z17.s, z16.b, z18.b\n"
"cbz x26, 8f\n"
"7:" // K loop: Main loop: Second: Loop
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x25656142 // psel p2.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x25246d21 // psel p1.b, p11.b/Z, p9.b[w12]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
- ".inst 0xe0192ec0 // ld1b { za0h.b[x13] }, p3/Z, [x22, x25]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
"ldr x22, [x23, #0x0]\n"
- ".inst 0xe0192aa4 // ld1b { za0h.b[x13, #4] }, p2/Z, [x21, x25]\n"
- "ldr x21, [x23, #0x8]\n"
- "add x23, x23, #0x10\n"
- "add x13, x13, #0x8\n"
- ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
- ".inst 0xe0bf8708 // st1w { za2v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
"sdot z17.s, z16.b, z18.b\n"
+ "ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n"
".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
- ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
"add x12, x12, #0x2\n"
"cmp x12, x26\n"
- "addvl x24, x24, #2\n"
"sdot z17.s, z16.b, z18.b\n"
+ "add x23, x23, #0x10\n"
+ "addvl x24, x24, #2\n"
+ "add x13, x13, #0x8\n"
"blt 7b\n"
"8:" // K loop: Main loop: Second: Tail
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x25656142 // psel p2.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x25246d21 // psel p1.b, p11.b/Z, p9.b[w12]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ "sdot z17.s, z16.b, z18.b\n"
"mov x23, %x[in]\n"
- "whilelt p9.b, x28, %x[width]\n"
- ".inst 0xe0192ec0 // ld1b { za0h.b[x13] }, p3/Z, [x22, x25]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x23, #0x0]\n"
- "subs x20, x20, #0x1\n"
- "incb x28\n"
- ".inst 0xe0192aa4 // ld1b { za0h.b[x13, #4] }, p2/Z, [x21, x25]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n"
"ldr x21, [x23, #0x8]\n"
- "add x23, x23, #0x10\n"
- "incb x25\n"
- ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
".inst 0xe0bf8708 // st1w { za2v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
- "sdot z17.s, z16.b, z18.b\n"
- ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ "add x23, x23, #0x10\n"
".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
- "addvl x24, x24, #2\n"
"sdot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #2\n"
+ "incb x28\n"
+ "incb x25\n"
"bgt 4b\n"
"9:" // K loop: Tails
"cbnz x27, 12f\n"
@@ -198,17 +198,17 @@ void interleave_block<1, 4, VLType::SME, true>(
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: First
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ "ldr x20, [x23, #0x0]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ "add x12, x12, #0x1\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
"sdot z17.s, z16.b, z18.b\n"
- ".inst 0xe0bf8700 // st1w { za0v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
- "add x12, x12, #0x1\n"
- "addvl x24, x24, #1\n"
- "ldr x20, [x23, #0x0]\n"
+ ".inst 0xe0192282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n"
"cmp x12, x9\n"
"add x23, x23, #0x8\n"
- ".inst 0xe0192282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n"
+ "addvl x24, x24, #1\n"
"add x13, x13, #0x4\n"
"blt 10b\n"
"whilelt p9.b, x28, %x[width]\n"
@@ -217,13 +217,13 @@ void interleave_block<1, 4, VLType::SME, true>(
"mov x12, #0x0\n"
"11:" // K loop: Tails: Even: Second
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
- "add x20, x20, #0x4\n"
- "sdot z17.s, z16.b, z18.b\n"
".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
"add x12, x12, #0x1\n"
- "addvl x24, x24, #1\n"
"cmp x12, x10\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #1\n"
+ "add x20, x20, #0x4\n"
"blt 11b\n"
"whilelt p8.b, x28, %x[width]\n"
"b 14f\n"
@@ -231,21 +231,21 @@ void interleave_block<1, 4, VLType::SME, true>(
"mov x12, #0x0\n"
"13:" // K loop: Tails: Odd: Loop
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
- "sdot z17.s, z16.b, z18.b\n"
".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
"add x12, x12, #0x1\n"
- "addvl x24, x24, #1\n"
"cmp x12, x10\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #1\n"
"blt 13b\n"
"14:" // K loop: End
- "st1w { z17.s }, p4, [x24]\n"
+ "st1w { z17.s }, p2, [x24]\n"
"addvl x24, x24, #1\n"
- ".inst 0xd503467f // SMSTOP\n"
"mov %x[out], x24\n"
+ ".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
index 6d1c1a207f..bc7d013798 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
@@ -32,16 +32,16 @@ void interleave_block<1, 4, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
"mov x23, %x[width]\n"
- "mov x21, %x[width]\n"
- "cntb x20\n"
"incb x23\n"
- "sub x10, x20, #0x1\n"
+ "mov x20, %x[width]\n"
+ "sub x10, x21, #0x1\n"
"cntw x9\n"
"sub x23, x23, #0x1\n"
- "ands x10, x21, x10\n"
- "udiv x23, x23, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x10, x10, x20, NE\n"
+ "ands x10, x20, x10\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x10, x10, x21, NE\n"
"lsl x22, %x[height], #0x1\n" // height * 2
"lsl x21, x9, #0x1\n"
"sub x20, x23, #0x1\n"
@@ -52,12 +52,12 @@ void interleave_block<1, 4, VLType::SME, false>(
"mov x27, #0x0\n"
"mov x26, %x[in]\n"
"lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x25, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x25, [x26, #0x0]\n"
+ "and x24, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"lsr x10, x10, #0x2\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x23, [x26, #0x8]\n"
"ptrue p11.s\n"
"zip1 p10.b, p9.b, p8.b\n"
- "ldr x23, [x26, #0x8]\n"
"mov x22, %x[row_offset]\n"
"mov x21, %x[out]\n"
"whilelt p9.b, x27, %x[width]\n"
@@ -66,124 +66,124 @@ void interleave_block<1, 4, VLType::SME, false>(
"mov x12, #0x0\n"
"cbz x28, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25246141 // psel p1.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n"
".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
- ".inst 0xe0160700 // ld1b { za0h.b[x12] }, p1/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x25, [x26, #0x0]\n"
".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n"
"add x12, x12, #0x8\n"
+ "cmp x12, x28, LSL #2\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "cmp x12, x28, LSL #2\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25246141 // psel p1.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n"
".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
"mov x26, %x[in]\n"
- "incb x27\n"
- ".inst 0xe0160700 // ld1b { za0h.b[x12] }, p1/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "incb x22\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "incb x22\n"
+ "incb x27\n"
"cbz x20, 8f\n"
"mov x20, x20\n"
"3:" // K loop: Main loop
"whilelt p8.b, x27, %x[width]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- "mov x14, #0x0\n"
"cbz x28, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x25346143 // psel p3.b, p8.b/Z, p10.b[w12, #2]\n"
- ".inst 0x25746142 // psel p2.b, p8.b/Z, p10.b[w12, #6]\n"
- ".inst 0x25266d21 // psel p1.b, p11.b/Z, p9.b[w14]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
- ".inst 0xe0160f02 // ld1b { za0h.b[x12, #2] }, p3/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0160ae6 // ld1b { za0h.b[x12, #6] }, p2/Z, [x23, x22]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n"
"ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
"add x26, x26, #0x10\n"
- "add x12, x12, #0x8\n"
- ".inst 0xe0bfc6a0 // st1w { za0v.s[x14] }, p1/Z, [x21, XZR, LSL #2]\n"
- ".inst 0xe0a9c2a1 // st1w { za0v.s[x14, #1] }, p0/Z, [x21, x9, LSL #2]\n"
- "add x14, x14, #0x2\n"
"addvl x21, x21, #2\n"
- "cmp x14, x28\n"
+ "add x13, x13, #0x8\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x25346143 // psel p3.b, p8.b/Z, p10.b[w12, #2]\n"
- ".inst 0x25746142 // psel p2.b, p8.b/Z, p10.b[w12, #6]\n"
- ".inst 0x25266d21 // psel p1.b, p11.b/Z, p9.b[w14]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"whilelt p9.b, x27, %x[width]\n"
- ".inst 0xe0160f02 // ld1b { za0h.b[x12, #2] }, p3/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
"incb x27\n"
- "mov x13, #0x0\n"
- ".inst 0xe0160ae6 // ld1b { za0h.b[x12, #6] }, p2/Z, [x23, x22]\n"
- "ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
+ ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "addvl x21, x21, #2\n"
"incb x22\n"
- ".inst 0xe0bfc6a0 // st1w { za0v.s[x14] }, p1/Z, [x21, XZR, LSL #2]\n"
"whilelt p8.b, x27, %x[width]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- ".inst 0xe0a9c2a1 // st1w { za0v.s[x14, #1] }, p0/Z, [x21, x9, LSL #2]\n"
- "addvl x21, x21, #2\n"
"cbz x28, 7f\n"
"6:" // K loop: Main loop: Second: Loop
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x25656142 // psel p2.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x25246d21 // psel p1.b, p11.b/Z, p9.b[w12]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
- ".inst 0xe0162f00 // ld1b { za0h.b[x13] }, p3/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0162ae4 // ld1b { za0h.b[x13, #4] }, p2/Z, [x23, x22]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n"
"ldr x23, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- "add x13, x13, #0x8\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
"add x12, x12, #0x2\n"
- "addvl x21, x21, #2\n"
"cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x8\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x25656142 // psel p2.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x25246d21 // psel p1.b, p11.b/Z, p9.b[w12]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"whilelt p9.b, x27, %x[width]\n"
- ".inst 0xe0162f00 // ld1b { za0h.b[x13] }, p3/Z, [x24, x22]\n"
- "ldr x24, [x26, #0x0]\n"
"subs x20, x20, #0x1\n"
- "incb x27\n"
- ".inst 0xe0162ae4 // ld1b { za0h.b[x13, #4] }, p2/Z, [x23, x22]\n"
- "ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "incb x22\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
"addvl x21, x21, #2\n"
+ "incb x27\n"
+ "incb x22\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x25, 11f\n"
+ "cbnz x24, 11f\n"
"mov x26, %x[in]\n"
"whilelt p8.b, x27, %x[width]\n"
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
- "add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"ldr x20, [x26, #0x0]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
"cmp x12, x9\n"
- "add x26, x26, #0x8\n"
".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
"add x13, x13, #0x4\n"
"blt 9b\n"
"whilelt p9.b, x27, %x[width]\n"
@@ -192,11 +192,11 @@ void interleave_block<1, 4, VLType::SME, false>(
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- "add x20, x20, #0x4\n"
".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ "add x20, x20, #0x4\n"
"blt 10b\n"
"whilelt p8.b, x27, %x[width]\n"
"b 13f\n"
@@ -206,15 +206,15 @@ void interleave_block<1, 4, VLType::SME, false>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
"blt 12b\n"
"13:" // K loop: End
"mov %x[out], x21\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
index dbcd18678b..66fcd800d4 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
@@ -32,18 +32,18 @@ void interleave_block<1, 4, VLType::SME, true>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
"mov x23, %x[width]\n"
- "mov x21, %x[width]\n"
- "cntb x20\n"
- "incb x23\n"
"mov z18.b, #0x1\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
"mov z17.s, #0x0\n"
- "sub x10, x20, #0x1\n"
+ "sub x10, x21, #0x1\n"
"cntw x9\n"
"sub x23, x23, #0x1\n"
- "ands x10, x21, x10\n"
- "udiv x23, x23, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x10, x10, x20, NE\n"
+ "ands x10, x20, x10\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x10, x10, x21, NE\n"
"lsl x22, %x[height], #0x1\n" // height * 2
"lsl x21, x9, #0x1\n"
"sub x20, x23, #0x1\n"
@@ -51,7 +51,7 @@ void interleave_block<1, 4, VLType::SME, true>(
"whilelt p9.b, XZR, x22\n"
"whilelt p8.b, x21, x22\n"
"mov x28, #0x0\n"
- "ptrue p4.b\n"
+ "ptrue p2.b\n"
"lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
"and x27, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"lsr x10, x10, #0x2\n"
@@ -64,132 +64,132 @@ void interleave_block<1, 4, VLType::SME, true>(
"whilelt p8.b, x28, %x[width]\n"
"cbnz %x[first], 1f\n"
"addvl x24, x24, #-1\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
+ "ld1w { z17.s }, p2/Z, [x24]\n"
"1:" // K loop: Load row sums: End
"mov x23, %x[in]\n"
- "mov x12, #0x0\n"
"ldr x22, [x23, #0x0]\n"
+ "mov x12, #0x0\n"
"ldr x21, [x23, #0x8]\n"
"add x23, x23, #0x10\n"
"cbz x26, 3f\n"
"2:" // K loop: Charge: Loop
- ".inst 0x25246141 // psel p1.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n"
".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
- ".inst 0xe01906c0 // ld1b { za0h.b[x12] }, p1/Z, [x22, x25]\n"
"ldr x22, [x23, #0x0]\n"
".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n"
"add x12, x12, #0x8\n"
+ "cmp x12, x26, LSL #2\n"
"ldr x21, [x23, #0x8]\n"
"add x23, x23, #0x10\n"
- "cmp x12, x26, LSL #2\n"
"blt 2b\n"
"3:" // K loop: Charge: End
- ".inst 0x25246141 // psel p1.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n"
".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
"mov x23, %x[in]\n"
- "incb x28\n"
- ".inst 0xe01906c0 // ld1b { za0h.b[x12] }, p1/Z, [x22, x25]\n"
- "ldr x22, [x23, #0x0]\n"
".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n"
+ "ldr x22, [x23, #0x0]\n"
+ "incb x25\n"
"ldr x21, [x23, #0x8]\n"
"add x23, x23, #0x10\n"
- "incb x25\n"
+ "incb x28\n"
"cbz x20, 9f\n"
"mov x20, x20\n"
"4:" // K loop: Main loop
"whilelt p8.b, x28, %x[width]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- "mov x14, #0x0\n"
"cbz x26, 6f\n"
"5:" // K loop: Main loop: First: Loop
- ".inst 0x25346143 // psel p3.b, p8.b/Z, p10.b[w12, #2]\n"
- ".inst 0x25746142 // psel p2.b, p8.b/Z, p10.b[w12, #6]\n"
- ".inst 0x25266d21 // psel p1.b, p11.b/Z, p9.b[w14]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
- ".inst 0xe0190ec2 // ld1b { za0h.b[x12, #2] }, p3/Z, [x22, x25]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
"ldr x22, [x23, #0x0]\n"
- ".inst 0xe0190aa6 // ld1b { za0h.b[x12, #6] }, p2/Z, [x21, x25]\n"
+ ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ "udot z17.s, z16.b, z18.b\n"
"ldr x21, [x23, #0x8]\n"
- "add x23, x23, #0x10\n"
- "add x12, x12, #0x8\n"
- ".inst 0xc082d010 // mova z16.s, p4/M, za0v.s[x14]\n"
- ".inst 0xe0bfc700 // st1w { za0v.s[x14] }, p1/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n"
+ ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x26\n"
"udot z17.s, z16.b, z18.b\n"
- ".inst 0xe0a9c301 // st1w { za0v.s[x14, #1] }, p0/Z, [x24, x9, LSL #2]\n"
- ".inst 0xc082d030 // mova z16.s, p4/M, za0v.s[x14, #1]\n"
- "add x14, x14, #0x2\n"
- "cmp x14, x26\n"
+ "add x23, x23, #0x10\n"
"addvl x24, x24, #2\n"
- "udot z17.s, z16.b, z18.b\n"
+ "add x13, x13, #0x8\n"
"blt 5b\n"
"6:" // K loop: Main loop: First: Tail
- ".inst 0x25346143 // psel p3.b, p8.b/Z, p10.b[w12, #2]\n"
- ".inst 0x25746142 // psel p2.b, p8.b/Z, p10.b[w12, #6]\n"
- ".inst 0x25266d21 // psel p1.b, p11.b/Z, p9.b[w14]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ "udot z17.s, z16.b, z18.b\n"
"mov x23, %x[in]\n"
- "whilelt p9.b, x28, %x[width]\n"
- ".inst 0xe0190ec2 // ld1b { za0h.b[x12, #2] }, p3/Z, [x22, x25]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x23, #0x0]\n"
- "incb x28\n"
- "mov x13, #0x0\n"
- ".inst 0xe0190aa6 // ld1b { za0h.b[x12, #6] }, p2/Z, [x21, x25]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n"
"ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8700 // st1w { za0v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "incb x28\n"
"add x23, x23, #0x10\n"
+ ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #2\n"
"incb x25\n"
- ".inst 0xc082d010 // mova z16.s, p4/M, za0v.s[x14]\n"
- ".inst 0xe0bfc700 // st1w { za0v.s[x14] }, p1/Z, [x24, XZR, LSL #2]\n"
"whilelt p8.b, x28, %x[width]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- "udot z17.s, z16.b, z18.b\n"
- ".inst 0xc082d030 // mova z16.s, p4/M, za0v.s[x14, #1]\n"
- ".inst 0xe0a9c301 // st1w { za0v.s[x14, #1] }, p0/Z, [x24, x9, LSL #2]\n"
- "addvl x24, x24, #2\n"
- "udot z17.s, z16.b, z18.b\n"
"cbz x26, 8f\n"
"7:" // K loop: Main loop: Second: Loop
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x25656142 // psel p2.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x25246d21 // psel p1.b, p11.b/Z, p9.b[w12]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
- ".inst 0xe0192ec0 // ld1b { za0h.b[x13] }, p3/Z, [x22, x25]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
"ldr x22, [x23, #0x0]\n"
- ".inst 0xe0192aa4 // ld1b { za0h.b[x13, #4] }, p2/Z, [x21, x25]\n"
- "ldr x21, [x23, #0x8]\n"
- "add x23, x23, #0x10\n"
- "add x13, x13, #0x8\n"
- ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
- ".inst 0xe0bf8708 // st1w { za2v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
"udot z17.s, z16.b, z18.b\n"
+ "ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n"
".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
- ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
"add x12, x12, #0x2\n"
"cmp x12, x26\n"
- "addvl x24, x24, #2\n"
"udot z17.s, z16.b, z18.b\n"
+ "add x23, x23, #0x10\n"
+ "addvl x24, x24, #2\n"
+ "add x13, x13, #0x8\n"
"blt 7b\n"
"8:" // K loop: Main loop: Second: Tail
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x25656142 // psel p2.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x25246d21 // psel p1.b, p11.b/Z, p9.b[w12]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ "udot z17.s, z16.b, z18.b\n"
"mov x23, %x[in]\n"
- "whilelt p9.b, x28, %x[width]\n"
- ".inst 0xe0192ec0 // ld1b { za0h.b[x13] }, p3/Z, [x22, x25]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x23, #0x0]\n"
- "subs x20, x20, #0x1\n"
- "incb x28\n"
- ".inst 0xe0192aa4 // ld1b { za0h.b[x13, #4] }, p2/Z, [x21, x25]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n"
"ldr x21, [x23, #0x8]\n"
- "add x23, x23, #0x10\n"
- "incb x25\n"
- ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
".inst 0xe0bf8708 // st1w { za2v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
- "udot z17.s, z16.b, z18.b\n"
- ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ "add x23, x23, #0x10\n"
".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
- "addvl x24, x24, #2\n"
"udot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #2\n"
+ "incb x28\n"
+ "incb x25\n"
"bgt 4b\n"
"9:" // K loop: Tails
"cbnz x27, 12f\n"
@@ -198,17 +198,17 @@ void interleave_block<1, 4, VLType::SME, true>(
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: First
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ "ldr x20, [x23, #0x0]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ "add x12, x12, #0x1\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
"udot z17.s, z16.b, z18.b\n"
- ".inst 0xe0bf8700 // st1w { za0v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
- "add x12, x12, #0x1\n"
- "addvl x24, x24, #1\n"
- "ldr x20, [x23, #0x0]\n"
+ ".inst 0xe0192282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n"
"cmp x12, x9\n"
"add x23, x23, #0x8\n"
- ".inst 0xe0192282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n"
+ "addvl x24, x24, #1\n"
"add x13, x13, #0x4\n"
"blt 10b\n"
"whilelt p9.b, x28, %x[width]\n"
@@ -217,13 +217,13 @@ void interleave_block<1, 4, VLType::SME, true>(
"mov x12, #0x0\n"
"11:" // K loop: Tails: Even: Second
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
- "add x20, x20, #0x4\n"
- "udot z17.s, z16.b, z18.b\n"
".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
"add x12, x12, #0x1\n"
- "addvl x24, x24, #1\n"
"cmp x12, x10\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #1\n"
+ "add x20, x20, #0x4\n"
"blt 11b\n"
"whilelt p8.b, x28, %x[width]\n"
"b 14f\n"
@@ -231,21 +231,21 @@ void interleave_block<1, 4, VLType::SME, true>(
"mov x12, #0x0\n"
"13:" // K loop: Tails: Odd: Loop
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
- "udot z17.s, z16.b, z18.b\n"
".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
"add x12, x12, #0x1\n"
- "addvl x24, x24, #1\n"
"cmp x12, x10\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #1\n"
"blt 13b\n"
"14:" // K loop: End
- "st1w { z17.s }, p4, [x24]\n"
+ "st1w { z17.s }, p2, [x24]\n"
"addvl x24, x24, #1\n"
- ".inst 0xd503467f // SMSTOP\n"
"mov %x[out], x24\n"
+ ".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
index 591c08dcb2..fb0a74b3bd 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
@@ -33,25 +33,25 @@ void interleave_block<1, 1, VLType::SME, false>(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"mov x21, %x[width]\n"
- "mov x20, %x[width]\n"
"inch x21\n"
"cnth x11\n"
"sub x21, x21, #0x1\n"
- "sub x10, x11, #0x1\n"
"udiv x21, x21, x11\n" // n_passes = ceildiv(width, VL<T>)
- "ands x10, x20, x10\n"
+ "mov x20, %x[width]\n"
+ "sub x10, x11, #0x1\n"
"sub x9, x21, #0x1\n"
+ "ands x10, x20, x10\n"
"sub x28, x11, #0x2\n"
"lsl x20, %x[height], #0x1\n" // height * 2
"mov x27, #0x0\n"
"mov x26, %x[in]\n"
"lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x25, [x26, #0x0]\n"
+ "and x24, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"csel x10, x10, x11, NE\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x23, [x26, #0x8]\n"
"ptrue p11.h\n"
"whilelt p10.h, XZR, x20\n"
- "ldr x23, [x26, #0x8]\n"
"mov x22, %x[row_offset]\n"
"mov x21, %x[out]\n"
"whilelt p9.h, x27, %x[width]\n"
@@ -60,119 +60,119 @@ void interleave_block<1, 1, VLType::SME, false>(
"mov x12, #0x0\n"
"cbz x28, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25286141 // psel p1.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n"
- ".inst 0xe0560700 // ld1h { za0h.h[x12] }, p1/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x25, [x26, #0x0]\n"
".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n"
"add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "cmp x12, x28\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25286141 // psel p1.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n"
"mov x26, %x[in]\n"
- "inch x27\n"
- ".inst 0xe0560700 // ld1h { za0h.h[x12] }, p1/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "inch x22\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "inch x22\n"
+ "inch x27\n"
"cbz x9, 8f\n"
"mov x20, x9\n"
"3:" // K loop: Main loop
"whilelt p8.h, x27, %x[width]\n"
- "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
"cbz x28, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x25296143 // psel p3.h, p8.h/Z, p10.h[w13]\n"
- ".inst 0x25396142 // psel p2.h, p8.h/Z, p10.h[w13, #1]\n"
- ".inst 0x25296d21 // psel p1.h, p11.h/Z, p9.h[w13]\n"
- ".inst 0x25396d20 // psel p0.h, p11.h/Z, p9.h[w13, #1]\n"
- ".inst 0xe0562f08 // ld1h { za1h.h[x13] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0562ae9 // ld1h { za1h.h[x13, #1] }, p2/Z, [x23, x22, LSL #1]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
"ldr x23, [x26, #0x8]\n"
- ".inst 0xe07fa6a0 // st1h { za0v.h[x13] }, p1/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
"add x26, x26, #0x10\n"
- ".inst 0xe06ba2a1 // st1h { za0v.h[x13, #1] }, p0/Z, [x21, x11, LSL #1]\n"
- "add x13, x13, #0x2\n"
"addvl x21, x21, #2\n"
- "cmp x13, x28\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x25296143 // psel p3.h, p8.h/Z, p10.h[w13]\n"
- ".inst 0x25396142 // psel p2.h, p8.h/Z, p10.h[w13, #1]\n"
- ".inst 0x25296d21 // psel p1.h, p11.h/Z, p9.h[w13]\n"
- ".inst 0x25396d20 // psel p0.h, p11.h/Z, p9.h[w13, #1]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
"whilelt p9.h, x27, %x[width]\n"
- ".inst 0xe0562f08 // ld1h { za1h.h[x13] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
"inch x27\n"
- "mov x12, #0x0\n"
- ".inst 0xe0562ae9 // ld1h { za1h.h[x13, #1] }, p2/Z, [x23, x22, LSL #1]\n"
- "ldr x23, [x26, #0x8]\n"
- ".inst 0xe07fa6a0 // st1h { za0v.h[x13] }, p1/Z, [x21, XZR, LSL #1]\n"
"add x26, x26, #0x10\n"
- ".inst 0xe06ba2a1 // st1h { za0v.h[x13, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
"addvl x21, x21, #2\n"
"inch x22\n"
"whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
"cbz x28, 7f\n"
"6:" // K loop: Main loop: Second: Loop
- ".inst 0x25286143 // psel p3.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0x25386142 // psel p2.h, p8.h/Z, p10.h[w12, #1]\n"
- ".inst 0x25286d21 // psel p1.h, p11.h/Z, p9.h[w12]\n"
- ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
- ".inst 0xe0560f00 // ld1h { za0h.h[x12] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0560ae1 // ld1h { za0h.h[x12, #1] }, p2/Z, [x23, x22, LSL #1]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
"ldr x23, [x26, #0x8]\n"
- ".inst 0xe07f86a8 // st1h { za1v.h[x12] }, p1/Z, [x21, XZR, LSL #1]\n"
- "add x26, x26, #0x10\n"
+ ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
"add x12, x12, #0x2\n"
- "addvl x21, x21, #2\n"
"cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
- ".inst 0x25286143 // psel p3.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0x25386142 // psel p2.h, p8.h/Z, p10.h[w12, #1]\n"
- ".inst 0x25286d21 // psel p1.h, p11.h/Z, p9.h[w12]\n"
- ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
"whilelt p9.h, x27, %x[width]\n"
- ".inst 0xe0560f00 // ld1h { za0h.h[x12] }, p3/Z, [x24, x22, LSL #1]\n"
- "ldr x24, [x26, #0x0]\n"
"subs x20, x20, #0x1\n"
- "inch x27\n"
- ".inst 0xe0560ae1 // ld1h { za0h.h[x12, #1] }, p2/Z, [x23, x22, LSL #1]\n"
- "ldr x23, [x26, #0x8]\n"
- ".inst 0xe07f86a8 // st1h { za1v.h[x12] }, p1/Z, [x21, XZR, LSL #1]\n"
"add x26, x26, #0x10\n"
".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
"addvl x21, x21, #2\n"
+ "inch x27\n"
"inch x22\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x25, 11f\n"
+ "cbnz x24, 11f\n"
"mov x26, %x[in]\n"
"whilelt p8.h, x27, %x[width]\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25286d21 // psel p1.h, p11.h/Z, p9.h[w12]\n"
- ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0xe07f86a0 // st1h { za0v.h[x12] }, p1/Z, [x21, XZR, LSL #1]\n"
- "addvl x21, x21, #1\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
"ldr x20, [x26, #0x0]\n"
- "add x26, x26, #0x8\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
".inst 0xe0560288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]\n"
"add x12, x12, #0x1\n"
"cmp x12, x11\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
"blt 9b\n"
"whilelt p9.h, x27, %x[width]\n"
"whilelt p8.h, x27, %x[width]\n"
@@ -181,8 +181,8 @@ void interleave_block<1, 1, VLType::SME, false>(
".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
"blt 10b\n"
"whilelt p8.h, x27, %x[width]\n"
"b 13f\n"
@@ -192,15 +192,15 @@ void interleave_block<1, 1, VLType::SME, false>(
".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
"blt 12b\n"
"13:" // K loop: End
"mov %x[out], x21\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
index b76ec57d22..3fe3885068 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
@@ -32,24 +32,24 @@ void interleave_block<1, 1, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x21, %x[width]\n"
- "mov x20, %x[width]\n"
- "incw x21\n"
+ "mov x22, %x[width]\n"
+ "incw x22\n"
"cntw x10\n"
- "sub x21, x21, #0x1\n"
+ "sub x22, x22, #0x1\n"
+ "udiv x22, x22, x10\n" // n_passes = ceildiv(width, VL<T>)
+ "mov x21, %x[width]\n"
"sub x9, x10, #0x1\n"
- "udiv x21, x21, x10\n" // n_passes = ceildiv(width, VL<T>)
- "ands x9, x20, x9\n"
- "sub x20, x21, #0x1\n"
+ "sub x20, x22, #0x1\n"
+ "ands x9, x21, x9\n"
"sub x28, x10, #0x2\n"
"mov x27, #0x0\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
"lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
- "ldr x24, [x26, #0x0]\n"
+ "and x24, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x23, [x26, #0x8]\n"
"csel x9, x9, x10, NE\n"
"ptrue p11.s\n"
- "ldr x23, [x26, #0x8]\n"
"whilelt p10.s, XZR, %x[height]\n"
"mov x22, %x[row_offset]\n"
"mov x21, %x[out]\n"
@@ -59,119 +59,119 @@ void interleave_block<1, 1, VLType::SME, false>(
"mov x12, #0x0\n"
"cbz x28, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25306141 // psel p1.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
".inst 0x25706140 // psel p0.s, p8.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0960700 // ld1w { za0h.s[x12] }, p1/Z, [x24, x22, LSL #2]\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x25, [x26, #0x0]\n"
".inst 0xe09602e1 // ld1w { za0h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n"
"add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "cmp x12, x28\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25306141 // psel p1.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
".inst 0x25706140 // psel p0.s, p8.s/Z, p10.s[w12, #1]\n"
"mov x26, %x[in]\n"
- "incw x27\n"
- ".inst 0xe0960700 // ld1w { za0h.s[x12] }, p1/Z, [x24, x22, LSL #2]\n"
- "ldr x24, [x26, #0x0]\n"
".inst 0xe09602e1 // ld1w { za0h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "incw x22\n"
"ldr x23, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- "incw x22\n"
+ "incw x27\n"
"cbz x20, 8f\n"
"mov x20, x20\n"
"3:" // K loop: Main loop
"whilelt p8.s, x27, %x[width]\n"
- "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
"cbz x28, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x25316143 // psel p3.s, p8.s/Z, p10.s[w13]\n"
- ".inst 0x25716142 // psel p2.s, p8.s/Z, p10.s[w13, #1]\n"
- ".inst 0x25316d21 // psel p1.s, p11.s/Z, p9.s[w13]\n"
- ".inst 0x25716d20 // psel p0.s, p11.s/Z, p9.s[w13, #1]\n"
- ".inst 0xe0962f08 // ld1w { za2h.s[x13] }, p3/Z, [x24, x22, LSL #2]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0962ae9 // ld1w { za2h.s[x13, #1] }, p2/Z, [x23, x22, LSL #2]\n"
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960328 // ld1w { za2h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe09606e9 // ld1w { za2h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
"ldr x23, [x26, #0x8]\n"
- ".inst 0xe0bfa6a0 // st1w { za0v.s[x13] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
"add x26, x26, #0x10\n"
- ".inst 0xe0aaa2a1 // st1w { za0v.s[x13, #1] }, p0/Z, [x21, x10, LSL #2]\n"
- "add x13, x13, #0x2\n"
"addvl x21, x21, #2\n"
- "cmp x13, x28\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x25316143 // psel p3.s, p8.s/Z, p10.s[w13]\n"
- ".inst 0x25716142 // psel p2.s, p8.s/Z, p10.s[w13, #1]\n"
- ".inst 0x25316d21 // psel p1.s, p11.s/Z, p9.s[w13]\n"
- ".inst 0x25716d20 // psel p0.s, p11.s/Z, p9.s[w13, #1]\n"
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960328 // ld1w { za2h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe09606e9 // ld1w { za2h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"whilelt p9.s, x27, %x[width]\n"
- ".inst 0xe0962f08 // ld1w { za2h.s[x13] }, p3/Z, [x24, x22, LSL #2]\n"
- "ldr x24, [x26, #0x0]\n"
"incw x27\n"
- "mov x12, #0x0\n"
- ".inst 0xe0962ae9 // ld1w { za2h.s[x13, #1] }, p2/Z, [x23, x22, LSL #2]\n"
- "ldr x23, [x26, #0x8]\n"
- ".inst 0xe0bfa6a0 // st1w { za0v.s[x13] }, p1/Z, [x21, XZR, LSL #2]\n"
"add x26, x26, #0x10\n"
- ".inst 0xe0aaa2a1 // st1w { za0v.s[x13, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
"addvl x21, x21, #2\n"
"incw x22\n"
"whilelt p8.s, x27, %x[width]\n"
+ "mov x12, #0x0\n"
"cbz x28, 7f\n"
"6:" // K loop: Main loop: Second: Loop
- ".inst 0x25306143 // psel p3.s, p8.s/Z, p10.s[w12]\n"
- ".inst 0x25706142 // psel p2.s, p8.s/Z, p10.s[w12, #1]\n"
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xe0960f00 // ld1w { za0h.s[x12] }, p3/Z, [x24, x22, LSL #2]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe0960ae1 // ld1w { za0h.s[x12, #1] }, p2/Z, [x23, x22, LSL #2]\n"
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe09606e1 // ld1w { za0h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
"ldr x23, [x26, #0x8]\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
- "add x26, x26, #0x10\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
"add x12, x12, #0x2\n"
- "addvl x21, x21, #2\n"
"cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
- ".inst 0x25306143 // psel p3.s, p8.s/Z, p10.s[w12]\n"
- ".inst 0x25706142 // psel p2.s, p8.s/Z, p10.s[w12, #1]\n"
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
"mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe09606e1 // ld1w { za0h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"whilelt p9.s, x27, %x[width]\n"
- ".inst 0xe0960f00 // ld1w { za0h.s[x12] }, p3/Z, [x24, x22, LSL #2]\n"
- "ldr x24, [x26, #0x0]\n"
"subs x20, x20, #0x1\n"
- "incw x27\n"
- ".inst 0xe0960ae1 // ld1w { za0h.s[x12, #1] }, p2/Z, [x23, x22, LSL #2]\n"
- "ldr x23, [x26, #0x8]\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
"add x26, x26, #0x10\n"
".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
"addvl x21, x21, #2\n"
+ "incw x27\n"
"incw x22\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x25, 11f\n"
+ "cbnz x24, 11f\n"
"mov x26, %x[in]\n"
"whilelt p8.s, x27, %x[width]\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
- "addvl x21, x21, #1\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"ldr x20, [x26, #0x0]\n"
- "add x26, x26, #0x8\n"
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
".inst 0xe0960288 // ld1w { za2h.s[x12] }, p0/Z, [x20, x22, LSL #2]\n"
"add x12, x12, #0x1\n"
"cmp x12, x10\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
"blt 9b\n"
"whilelt p9.s, x27, %x[width]\n"
"whilelt p8.s, x27, %x[width]\n"
@@ -180,8 +180,8 @@ void interleave_block<1, 1, VLType::SME, false>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x9\n"
+ "addvl x21, x21, #1\n"
"blt 10b\n"
"whilelt p8.s, x27, %x[width]\n"
"b 13f\n"
@@ -191,15 +191,15 @@ void interleave_block<1, 1, VLType::SME, false>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x21, x21, #1\n"
"cmp x12, x9\n"
+ "addvl x21, x21, #1\n"
"blt 12b\n"
"13:" // K loop: End
"mov %x[out], x21\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
index a07831e7bd..1ed835b21b 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
@@ -32,66 +32,66 @@ void interleave_block<2, 1, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x28, #0x0\n"
- "mov x27, %x[row_offset]\n"
- "cnth x26\n"
- "cnth x25\n"
- "cmp %x[height], x26\n"
+ "cnth x28\n"
+ "cmp %x[height], x28\n"
+ "cnth x27\n"
+ "csel x28, %x[height], x28, LT\n"
+ "mov x26, #0x0\n"
"ptrue p13.s\n"
- "csel x26, %x[height], x26, LT\n"
+ "sub x28, x28, #0x1\n"
"whilelt p12.h, XZR, %x[height]\n"
- "sub x26, x26, #0x1\n"
- "whilelt p11.h, x25, %x[height]\n"
+ "whilelt p11.h, x27, %x[height]\n"
+ "mov x25, %x[row_offset]\n"
"mov x24, %x[out]\n"
- "whilelt p10.h, x28, %x[width]\n"
- "whilelt p9.h, x28, %x[width]\n"
- "whilelt p8.h, x28, %x[width]\n"
+ "whilelt p10.h, x26, %x[width]\n"
+ "whilelt p9.h, x26, %x[width]\n"
+ "whilelt p8.h, x26, %x[width]\n"
"1:" // Width loop
"add x23, %x[in], XZR, LSL #3\n"
- "add x20, %x[in], x25, LSL #3\n"
- "mov x13, #0x0\n"
+ "add x20, %x[in], x27, LSL #3\n"
"ldr x22, [x23], #0x8\n"
+ "mov x12, #0x0\n"
"ldr x21, [x20], #0x8\n"
- "cbz x26, 3f\n"
+ "cbz x28, 3f\n"
"2:" // Loads: Loop
- ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
- ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0xe05b26c0 // ld1h { za0h.h[x13] }, p1/Z, [x22, x27, LSL #1]\n"
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe05906c0 // ld1h { za0h.h[x12] }, p1/Z, [x22, x25, LSL #1]\n"
"ldr x22, [x23], #0x8\n"
- ".inst 0xe05b22a8 // ld1h { za1h.h[x13] }, p0/Z, [x21, x27, LSL #1]\n"
- "add x13, x13, #0x2\n"
+ ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28, LSL #1\n"
"ldr x21, [x20], #0x8\n"
- "cmp x13, x26, LSL #1\n"
"blt 2b\n"
"3:" // Loads: Tail
- ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
- ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- "sub x20, %x[width], x28\n"
+ "sub x20, %x[width], x26\n"
+ ".inst 0x25286580 // psel p0.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0xe05902c0 // ld1h { za0h.h[x12] }, p0/Z, [x22, x25, LSL #1]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ "cmp x20, x27\n"
+ ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n"
"mov x12, #0x0\n"
- "cmp x20, x25\n"
- ".inst 0xe05b26c0 // ld1h { za0h.h[x13] }, p1/Z, [x22, x27, LSL #1]\n"
- "csel x20, x20, x25, LT\n"
- ".inst 0xe05b22a8 // ld1h { za1h.h[x13] }, p0/Z, [x21, x27, LSL #1]\n"
+ "csel x20, x20, x27, LT\n"
"4:" // Stores: Loop
- ".inst 0x25287541 // psel p1.h, p13.h/Z, p10.h[w12]\n"
".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n"
- ".inst 0xe07f8700 // st1h { za0v.h[x12] }, p1/Z, [x24, XZR, LSL #1]\n"
- ".inst 0xe0798308 // st1h { za1v.h[x12] }, p0/Z, [x24, x25, LSL #1]\n"
+ ".inst 0xe07f8300 // st1h { za0v.h[x12] }, p0/Z, [x24, XZR, LSL #1]\n"
+ ".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n"
+ ".inst 0xe07b8308 // st1h { za1v.h[x12] }, p0/Z, [x24, x27, LSL #1]\n"
"add x12, x12, #0x1\n"
- "addvl x24, x24, #4\n"
"cmp x12, x20\n"
+ "addvl x24, x24, #4\n"
"blt 4b\n"
- "inch x28\n"
- "inch x27\n"
- "whilelt p10.h, x28, %x[width]\n"
- "whilelt p9.h, x28, %x[width]\n"
- "whilelt p8.h, x28, %x[width]\n"
+ "inch x26\n"
+ "whilelt p10.h, x26, %x[width]\n"
+ "whilelt p9.h, x26, %x[width]\n"
+ "whilelt p8.h, x26, %x[width]\n"
+ "inch x25\n"
"b.any 1b\n"
"mov %x[out], x24\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
index 01dfecc4ef..715810ddea 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
@@ -32,264 +32,269 @@ void interleave_block<2, 2, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x22, %x[width]\n"
+ "cnth x22\n"
"mov x21, %x[width]\n"
- "cnth x20\n"
- "inch x22\n"
- "sub x7, x20, #0x1\n"
- "sub x22, x22, #0x1\n"
- "ands x7, x21, x7\n"
- "cntw x8\n"
- "udiv x22, x22, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x7, x7, x20, NE\n"
- "sub x13, x22, #0x1\n"
- "add x7, x7, #0x1\n"
- "sub x17, x8, #0x2\n"
- "lsl x21, %x[height], #0x1\n" // height * 2
- "lsl x20, x8, #0x1\n"
- "mov x16, #0x0\n"
+ "inch x21\n"
+ "mov x20, %x[width]\n"
+ "sub x17, x22, #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "ands x17, x20, x17\n"
+ "cntw x16\n"
+ "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x22, NE\n"
+ "sub x13, x21, #0x1\n"
+ "add x17, x17, #0x1\n"
+ "sub x15, x16, #0x2\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x20, x16, #0x1\n"
+ "mov x14, #0x0\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- "cntw x9, ALL, MUL #2\n"
- "cntw x28, ALL, MUL #3\n"
- "ldr x27, [x11, #0x0]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ "cntw x28, ALL, MUL #2\n"
+ "cntw x27, ALL, MUL #3\n"
+ "ldr x26, [x10, #0x0]\n"
"lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x26, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
- "ldr x25, [x10, #0x0]\n"
- "lsr x7, x7, #0x1\n"
- "ptrue p12.s\n"
+ "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"ldr x24, [x11, #0x8]\n"
- "whilelt p11.h, XZR, x21\n"
- "whilelt p10.h, x20, x21\n"
+ "lsr x17, x17, #0x1\n"
+ "ptrue p13.s\n"
"ldr x21, [x10, #0x8]\n"
+ "whilelt p12.h, XZR, x22\n"
+ "whilelt p11.h, x20, x22\n"
"mov x23, %x[row_offset]\n"
"mov x22, %x[out]\n"
- "whilelt p9.h, x16, %x[width]\n"
- "whilelt p8.h, x16, %x[width]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
"mov x12, #0x0\n"
- "cbz x17, 2f\n"
+ "cbz x15, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25286163 // psel p3.h, p8.h/Z, p11.h[w12]\n"
- ".inst 0x25286142 // psel p2.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0x25686161 // psel p1.h, p8.h/Z, p11.h[w12, #2]\n"
- ".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n"
- ".inst 0xe0570f60 // ld1h { za0h.h[x12] }, p3/Z, [x27, x23, LSL #1]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0570b28 // ld1h { za1h.h[x12] }, p2/Z, [x25, x23, LSL #1]\n"
- "ldr x25, [x10, #0x0]\n"
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
+ ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
+ "ldr x26, [x10, #0x0]\n"
".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
"add x12, x12, #0x4\n"
+ "cmp x12, x15, LSL #1\n"
"ldr x21, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
- "cmp x12, x17, LSL #1\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25286163 // psel p3.h, p8.h/Z, p11.h[w12]\n"
- ".inst 0x25286142 // psel p2.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0x25686161 // psel p1.h, p8.h/Z, p11.h[w12, #2]\n"
- ".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n"
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
+ ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0570f60 // ld1h { za0h.h[x12] }, p3/Z, [x27, x23, LSL #1]\n"
- "ldr x27, [x11, #0x0]\n"
- "inch x16\n"
- ".inst 0xe0570b28 // ld1h { za1h.h[x12] }, p2/Z, [x25, x23, LSL #1]\n"
- "ldr x25, [x10, #0x0]\n"
+ "add x10, %x[in], x16, LSL #3\n"
".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ "inch x23\n"
+ "inch x14\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
"ldr x21, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
- "inch x23\n"
"cbz x13, 8f\n"
"mov x20, x13\n"
"3:" // K loop: Main loop
- "whilelt p8.h, x16, %x[width]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
- "cbz x17, 5f\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x253b6160 // psel p0.h, p8.h/Z, p11.h[w15, #1]\n"
- ".inst 0x253b6142 // psel p2.h, p8.h/Z, p10.h[w15, #1]\n"
- ".inst 0x257b6161 // psel p1.h, p8.h/Z, p11.h[w15, #3]\n"
- ".inst 0x257b6143 // psel p3.h, p8.h/Z, p10.h[w15, #3]\n"
- ".inst 0xe0576361 // ld1h { za0h.h[x15, #1] }, p0/Z, [x27, x23, LSL #1]\n"
- ".inst 0x252a7120 // psel p0.h, p12.h/Z, p9.h[w14]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0576b29 // ld1h { za1h.h[x15, #1] }, p2/Z, [x25, x23, LSL #1]\n"
- ".inst 0x252a7122 // psel p2.h, p12.h/Z, p9.h[w14]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0576703 // ld1h { za0h.h[x15, #3] }, p1/Z, [x24, x23, LSL #1]\n"
- ".inst 0x253a7121 // psel p1.h, p12.h/Z, p9.h[w14, #1]\n"
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
+ ".inst 0x25796162 // psel p2.h, p8.h/Z, p11.h[w13, #3]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe0576eab // ld1h { za1h.h[x15, #3] }, p3/Z, [x21, x23, LSL #1]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572aab // ld1h { za1h.h[x13, #3] }, p2/Z, [x21, x23, LSL #1]\n"
"ldr x21, [x10, #0x8]\n"
- ".inst 0xe0bfc2c0 // st1w { za0v.s[x14] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x253a7120 // psel p0.h, p12.h/Z, p9.h[w14, #1]\n"
- ".inst 0xe0a8cac4 // st1w { za1v.s[x14] }, p2/Z, [x22, x8, LSL #2]\n"
+ ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
- "add x15, x15, #0x4\n"
- ".inst 0xe0a9c6c1 // st1w { za0v.s[x14, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bcc2c5 // st1w { za1v.s[x14, #1] }, p0/Z, [x22, x28, LSL #2]\n"
- "add x14, x14, #0x2\n"
+ "add x13, x13, #0x4\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
"addvl x22, x22, #4\n"
- "cmp x14, x17\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x253b6160 // psel p0.h, p8.h/Z, p11.h[w15, #1]\n"
- ".inst 0x253b6142 // psel p2.h, p8.h/Z, p10.h[w15, #1]\n"
- ".inst 0x257b6161 // psel p1.h, p8.h/Z, p11.h[w15, #3]\n"
- ".inst 0x257b6143 // psel p3.h, p8.h/Z, p10.h[w15, #3]\n"
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0576361 // ld1h { za0h.h[x15, #1] }, p0/Z, [x27, x23, LSL #1]\n"
- ".inst 0x252a7120 // psel p0.h, p12.h/Z, p9.h[w14]\n"
- "ldr x27, [x11, #0x0]\n"
- "mov x13, #0x0\n"
- ".inst 0xe0576b29 // ld1h { za1h.h[x15, #1] }, p2/Z, [x25, x23, LSL #1]\n"
- ".inst 0x252a7122 // psel p2.h, p12.h/Z, p9.h[w14]\n"
- "ldr x25, [x10, #0x0]\n"
- "mov x12, #0x0\n"
- ".inst 0xe0576703 // ld1h { za0h.h[x15, #3] }, p1/Z, [x24, x23, LSL #1]\n"
- ".inst 0x253a7121 // psel p1.h, p12.h/Z, p9.h[w14, #1]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
+ ".inst 0x25796161 // psel p1.h, p8.h/Z, p11.h[w13, #3]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe05726ab // ld1h { za1h.h[x13, #3] }, p1/Z, [x21, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe0576eab // ld1h { za1h.h[x15, #3] }, p3/Z, [x21, x23, LSL #1]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
"ldr x21, [x10, #0x8]\n"
- ".inst 0xe0bfc2c0 // st1w { za0v.s[x14] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x253a7120 // psel p0.h, p12.h/Z, p9.h[w14, #1]\n"
- ".inst 0xe0a8cac4 // st1w { za1v.s[x14] }, p2/Z, [x22, x8, LSL #2]\n"
- "whilelt p9.h, x16, %x[width]\n"
- "inch x16\n"
- ".inst 0xe0a9c6c1 // st1w { za0v.s[x14, #1] }, p1/Z, [x22, x9, LSL #2]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "inch x14\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- "inch x23\n"
- ".inst 0xe0bcc2c5 // st1w { za1v.s[x14, #1] }, p0/Z, [x22, x28, LSL #2]\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
"addvl x22, x22, #4\n"
- "whilelt p8.h, x16, %x[width]\n"
- "cbz x17, 7f\n"
+ "inch x23\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 7f\n"
"6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0x25296142 // psel p2.h, p8.h/Z, p10.h[w13]\n"
- ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
- ".inst 0x25696143 // psel p3.h, p8.h/Z, p10.h[w13, #2]\n"
- ".inst 0xe0572360 // ld1h { za0h.h[x13] }, p0/Z, [x27, x23, LSL #1]\n"
- ".inst 0x25287120 // psel p0.h, p12.h/Z, p9.h[w12]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0572b28 // ld1h { za1h.h[x13] }, p2/Z, [x25, x23, LSL #1]\n"
- ".inst 0x25287122 // psel p2.h, p12.h/Z, p9.h[w12]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0572702 // ld1h { za0h.h[x13, #2] }, p1/Z, [x24, x23, LSL #1]\n"
- ".inst 0x25387121 // psel p1.h, p12.h/Z, p9.h[w12, #1]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
+ ".inst 0x25696162 // psel p2.h, p8.h/Z, p11.h[w13, #2]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe0572eaa // ld1h { za1h.h[x13, #2] }, p3/Z, [x21, x23, LSL #1]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572aaa // ld1h { za1h.h[x13, #2] }, p2/Z, [x21, x23, LSL #1]\n"
"ldr x21, [x10, #0x8]\n"
- ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x25387120 // psel p0.h, p12.h/Z, p9.h[w12, #1]\n"
- ".inst 0xe0a88acc // st1w { za3v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
+ ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
"add x13, x13, #0x4\n"
- ".inst 0xe0a986c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bc82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x28, LSL #2]\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
"add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
"addvl x22, x22, #4\n"
- "cmp x12, x17\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0x25296142 // psel p2.h, p8.h/Z, p10.h[w13]\n"
- ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
- ".inst 0x25696143 // psel p3.h, p8.h/Z, p10.h[w13, #2]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0572360 // ld1h { za0h.h[x13] }, p0/Z, [x27, x23, LSL #1]\n"
- ".inst 0x25287120 // psel p0.h, p12.h/Z, p9.h[w12]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0572b28 // ld1h { za1h.h[x13] }, p2/Z, [x25, x23, LSL #1]\n"
- ".inst 0x25287122 // psel p2.h, p12.h/Z, p9.h[w12]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0572702 // ld1h { za0h.h[x13, #2] }, p1/Z, [x24, x23, LSL #1]\n"
- ".inst 0x25387121 // psel p1.h, p12.h/Z, p9.h[w12, #1]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
+ ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe05726aa // ld1h { za1h.h[x13, #2] }, p1/Z, [x21, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe0572eaa // ld1h { za1h.h[x13, #2] }, p3/Z, [x21, x23, LSL #1]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
"ldr x21, [x10, #0x8]\n"
".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x25387120 // psel p0.h, p12.h/Z, p9.h[w12, #1]\n"
- ".inst 0xe0a88acc // st1w { za3v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
- "whilelt p9.h, x16, %x[width]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.h, x14, %x[width]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xe0a986c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x9, LSL #2]\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- "inch x16\n"
- ".inst 0xe0bc82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x28, LSL #2]\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
"addvl x22, x22, #4\n"
+ "inch x14\n"
"inch x23\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x26, 11f\n"
+ "cbnz x25, 11f\n"
"mov x11, %x[in]\n"
- "whilelt p8.h, x16, %x[width]\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25307123 // psel p3.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25307122 // psel p2.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25396161 // psel p1.h, p8.h/Z, p11.h[w13, #1]\n"
- ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
- ".inst 0xe0bf8ec0 // st1w { za0v.s[x12] }, p3/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a88ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
- "add x12, x12, #0x1\n"
- "addvl x22, x22, #2\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"ldr x21, [x11, #0x0]\n"
- "cmp x12, x8\n"
- "ldr x20, [x11, x8, LSL #0x3]\n"
- "add x11, x11, #0x8\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ "cmp x12, x16\n"
".inst 0xe05726a1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x21, x23, LSL #1]\n"
".inst 0xe0572289 // ld1h { za1h.h[x13, #1] }, p0/Z, [x20, x23, LSL #1]\n"
+ "add x11, x11, #0x8\n"
+ "addvl x22, x22, #2\n"
"add x13, x13, #0x2\n"
"blt 9b\n"
- "whilelt p9.h, x16, %x[width]\n"
- "whilelt p8.h, x16, %x[width]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"mov x20, #0x0\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
- ".inst 0x25307121 // psel p1.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25307120 // psel p0.s, p12.s/Z, p9.s[w12]\n"
- "add x20, x20, #0x2\n"
- ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a882cc // st1w { za3v.s[x12] }, p0/Z, [x22, x8, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
"addvl x22, x22, #2\n"
- "cmp x12, x7\n"
+ "add x20, x20, #0x2\n"
"blt 10b\n"
- "whilelt p8.h, x16, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
"12:" // K loop: Tails: Odd: Loop
- ".inst 0x25307121 // psel p1.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25307120 // psel p0.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a882c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x8, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
"addvl x22, x22, #2\n"
- "cmp x12, x7\n"
"blt 12b\n"
"13:" // K loop: End
"mov %x[out], x22\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
index b1e9226773..849f6c3228 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
@@ -32,264 +32,269 @@ void interleave_block<2, 2, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x22, %x[width]\n"
+ "cnth x22\n"
"mov x21, %x[width]\n"
- "cnth x20\n"
- "inch x22\n"
- "sub x7, x20, #0x1\n"
- "sub x22, x22, #0x1\n"
- "ands x7, x21, x7\n"
- "cntw x8\n"
- "udiv x22, x22, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x7, x7, x20, NE\n"
- "sub x13, x22, #0x1\n"
- "add x7, x7, #0x1\n"
- "sub x17, x8, #0x2\n"
- "lsl x21, %x[height], #0x1\n" // height * 2
- "lsl x20, x8, #0x1\n"
- "mov x16, #0x0\n"
+ "inch x21\n"
+ "mov x20, %x[width]\n"
+ "sub x17, x22, #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "ands x17, x20, x17\n"
+ "cntw x16\n"
+ "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x22, NE\n"
+ "sub x13, x21, #0x1\n"
+ "add x17, x17, #0x1\n"
+ "sub x15, x16, #0x2\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x20, x16, #0x1\n"
+ "mov x14, #0x0\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- "cntw x9, ALL, MUL #2\n"
- "cntw x28, ALL, MUL #3\n"
- "ldr x27, [x11, #0x0]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ "cntw x28, ALL, MUL #2\n"
+ "cntw x27, ALL, MUL #3\n"
+ "ldr x26, [x10, #0x0]\n"
"lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x26, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
- "ldr x25, [x10, #0x0]\n"
- "lsr x7, x7, #0x1\n"
- "ptrue p12.s\n"
+ "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"ldr x24, [x11, #0x8]\n"
- "whilelt p11.h, XZR, x21\n"
- "whilelt p10.h, x20, x21\n"
+ "lsr x17, x17, #0x1\n"
+ "ptrue p13.s\n"
"ldr x21, [x10, #0x8]\n"
+ "whilelt p12.h, XZR, x22\n"
+ "whilelt p11.h, x20, x22\n"
"mov x23, %x[row_offset]\n"
"mov x22, %x[out]\n"
- "whilelt p9.h, x16, %x[width]\n"
- "whilelt p8.h, x16, %x[width]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
"mov x12, #0x0\n"
- "cbz x17, 2f\n"
+ "cbz x15, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25286163 // psel p3.h, p8.h/Z, p11.h[w12]\n"
- ".inst 0x25286142 // psel p2.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0x25686161 // psel p1.h, p8.h/Z, p11.h[w12, #2]\n"
- ".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n"
- ".inst 0xe0570f60 // ld1h { za0h.h[x12] }, p3/Z, [x27, x23, LSL #1]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0570b28 // ld1h { za1h.h[x12] }, p2/Z, [x25, x23, LSL #1]\n"
- "ldr x25, [x10, #0x0]\n"
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
+ ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
+ "ldr x26, [x10, #0x0]\n"
".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
"add x12, x12, #0x4\n"
+ "cmp x12, x15, LSL #1\n"
"ldr x21, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
- "cmp x12, x17, LSL #1\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25286163 // psel p3.h, p8.h/Z, p11.h[w12]\n"
- ".inst 0x25286142 // psel p2.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0x25686161 // psel p1.h, p8.h/Z, p11.h[w12, #2]\n"
- ".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n"
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
+ ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0570f60 // ld1h { za0h.h[x12] }, p3/Z, [x27, x23, LSL #1]\n"
- "ldr x27, [x11, #0x0]\n"
- "inch x16\n"
- ".inst 0xe0570b28 // ld1h { za1h.h[x12] }, p2/Z, [x25, x23, LSL #1]\n"
- "ldr x25, [x10, #0x0]\n"
+ "add x10, %x[in], x16, LSL #3\n"
".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ "inch x23\n"
+ "inch x14\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
"ldr x21, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
- "inch x23\n"
"cbz x13, 8f\n"
"mov x20, x13\n"
"3:" // K loop: Main loop
- "whilelt p8.h, x16, %x[width]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
- "cbz x17, 5f\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x253b6160 // psel p0.h, p8.h/Z, p11.h[w15, #1]\n"
- ".inst 0x253b6142 // psel p2.h, p8.h/Z, p10.h[w15, #1]\n"
- ".inst 0x257b6161 // psel p1.h, p8.h/Z, p11.h[w15, #3]\n"
- ".inst 0x257b6143 // psel p3.h, p8.h/Z, p10.h[w15, #3]\n"
- ".inst 0xe0576361 // ld1h { za0h.h[x15, #1] }, p0/Z, [x27, x23, LSL #1]\n"
- ".inst 0x252a7120 // psel p0.h, p12.h/Z, p9.h[w14]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0576b29 // ld1h { za1h.h[x15, #1] }, p2/Z, [x25, x23, LSL #1]\n"
- ".inst 0x252a7122 // psel p2.h, p12.h/Z, p9.h[w14]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0576703 // ld1h { za0h.h[x15, #3] }, p1/Z, [x24, x23, LSL #1]\n"
- ".inst 0x253a7121 // psel p1.h, p12.h/Z, p9.h[w14, #1]\n"
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
+ ".inst 0x25796162 // psel p2.h, p8.h/Z, p11.h[w13, #3]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe0576eab // ld1h { za1h.h[x15, #3] }, p3/Z, [x21, x23, LSL #1]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572aab // ld1h { za1h.h[x13, #3] }, p2/Z, [x21, x23, LSL #1]\n"
"ldr x21, [x10, #0x8]\n"
- ".inst 0xe0bfc2c0 // st1w { za0v.s[x14] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x253a7120 // psel p0.h, p12.h/Z, p9.h[w14, #1]\n"
- ".inst 0xe0a8cac4 // st1w { za1v.s[x14] }, p2/Z, [x22, x8, LSL #2]\n"
+ ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
- "add x15, x15, #0x4\n"
- ".inst 0xe0a9c6c1 // st1w { za0v.s[x14, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bcc2c5 // st1w { za1v.s[x14, #1] }, p0/Z, [x22, x28, LSL #2]\n"
- "add x14, x14, #0x2\n"
+ "add x13, x13, #0x4\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
"addvl x22, x22, #4\n"
- "cmp x14, x17\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x253b6160 // psel p0.h, p8.h/Z, p11.h[w15, #1]\n"
- ".inst 0x253b6142 // psel p2.h, p8.h/Z, p10.h[w15, #1]\n"
- ".inst 0x257b6161 // psel p1.h, p8.h/Z, p11.h[w15, #3]\n"
- ".inst 0x257b6143 // psel p3.h, p8.h/Z, p10.h[w15, #3]\n"
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0576361 // ld1h { za0h.h[x15, #1] }, p0/Z, [x27, x23, LSL #1]\n"
- ".inst 0x252a7120 // psel p0.h, p12.h/Z, p9.h[w14]\n"
- "ldr x27, [x11, #0x0]\n"
- "mov x13, #0x0\n"
- ".inst 0xe0576b29 // ld1h { za1h.h[x15, #1] }, p2/Z, [x25, x23, LSL #1]\n"
- ".inst 0x252a7122 // psel p2.h, p12.h/Z, p9.h[w14]\n"
- "ldr x25, [x10, #0x0]\n"
- "mov x12, #0x0\n"
- ".inst 0xe0576703 // ld1h { za0h.h[x15, #3] }, p1/Z, [x24, x23, LSL #1]\n"
- ".inst 0x253a7121 // psel p1.h, p12.h/Z, p9.h[w14, #1]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
+ ".inst 0x25796161 // psel p1.h, p8.h/Z, p11.h[w13, #3]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe05726ab // ld1h { za1h.h[x13, #3] }, p1/Z, [x21, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe0576eab // ld1h { za1h.h[x15, #3] }, p3/Z, [x21, x23, LSL #1]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
"ldr x21, [x10, #0x8]\n"
- ".inst 0xe0bfc2c0 // st1w { za0v.s[x14] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x253a7120 // psel p0.h, p12.h/Z, p9.h[w14, #1]\n"
- ".inst 0xe0a8cac4 // st1w { za1v.s[x14] }, p2/Z, [x22, x8, LSL #2]\n"
- "whilelt p9.h, x16, %x[width]\n"
- "inch x16\n"
- ".inst 0xe0a9c6c1 // st1w { za0v.s[x14, #1] }, p1/Z, [x22, x9, LSL #2]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "inch x14\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- "inch x23\n"
- ".inst 0xe0bcc2c5 // st1w { za1v.s[x14, #1] }, p0/Z, [x22, x28, LSL #2]\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
"addvl x22, x22, #4\n"
- "whilelt p8.h, x16, %x[width]\n"
- "cbz x17, 7f\n"
+ "inch x23\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 7f\n"
"6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0x25296142 // psel p2.h, p8.h/Z, p10.h[w13]\n"
- ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
- ".inst 0x25696143 // psel p3.h, p8.h/Z, p10.h[w13, #2]\n"
- ".inst 0xe0572360 // ld1h { za0h.h[x13] }, p0/Z, [x27, x23, LSL #1]\n"
- ".inst 0x25287120 // psel p0.h, p12.h/Z, p9.h[w12]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0572b28 // ld1h { za1h.h[x13] }, p2/Z, [x25, x23, LSL #1]\n"
- ".inst 0x25287122 // psel p2.h, p12.h/Z, p9.h[w12]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0572702 // ld1h { za0h.h[x13, #2] }, p1/Z, [x24, x23, LSL #1]\n"
- ".inst 0x25387121 // psel p1.h, p12.h/Z, p9.h[w12, #1]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
+ ".inst 0x25696162 // psel p2.h, p8.h/Z, p11.h[w13, #2]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe0572eaa // ld1h { za1h.h[x13, #2] }, p3/Z, [x21, x23, LSL #1]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572aaa // ld1h { za1h.h[x13, #2] }, p2/Z, [x21, x23, LSL #1]\n"
"ldr x21, [x10, #0x8]\n"
- ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x25387120 // psel p0.h, p12.h/Z, p9.h[w12, #1]\n"
- ".inst 0xe0a88acc // st1w { za3v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
+ ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
"add x13, x13, #0x4\n"
- ".inst 0xe0a986c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bc82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x28, LSL #2]\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
"add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
"addvl x22, x22, #4\n"
- "cmp x12, x17\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0x25296142 // psel p2.h, p8.h/Z, p10.h[w13]\n"
- ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
- ".inst 0x25696143 // psel p3.h, p8.h/Z, p10.h[w13, #2]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0572360 // ld1h { za0h.h[x13] }, p0/Z, [x27, x23, LSL #1]\n"
- ".inst 0x25287120 // psel p0.h, p12.h/Z, p9.h[w12]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0572b28 // ld1h { za1h.h[x13] }, p2/Z, [x25, x23, LSL #1]\n"
- ".inst 0x25287122 // psel p2.h, p12.h/Z, p9.h[w12]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0572702 // ld1h { za0h.h[x13, #2] }, p1/Z, [x24, x23, LSL #1]\n"
- ".inst 0x25387121 // psel p1.h, p12.h/Z, p9.h[w12, #1]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
+ ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe05726aa // ld1h { za1h.h[x13, #2] }, p1/Z, [x21, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe0572eaa // ld1h { za1h.h[x13, #2] }, p3/Z, [x21, x23, LSL #1]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
"ldr x21, [x10, #0x8]\n"
".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x25387120 // psel p0.h, p12.h/Z, p9.h[w12, #1]\n"
- ".inst 0xe0a88acc // st1w { za3v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
- "whilelt p9.h, x16, %x[width]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.h, x14, %x[width]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xe0a986c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x9, LSL #2]\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- "inch x16\n"
- ".inst 0xe0bc82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x28, LSL #2]\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
"addvl x22, x22, #4\n"
+ "inch x14\n"
"inch x23\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x26, 11f\n"
+ "cbnz x25, 11f\n"
"mov x11, %x[in]\n"
- "whilelt p8.h, x16, %x[width]\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25307123 // psel p3.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25307122 // psel p2.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25396161 // psel p1.h, p8.h/Z, p11.h[w13, #1]\n"
- ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
- ".inst 0xe0bf8ec0 // st1w { za0v.s[x12] }, p3/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a88ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
- "add x12, x12, #0x1\n"
- "addvl x22, x22, #2\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"ldr x21, [x11, #0x0]\n"
- "cmp x12, x8\n"
- "ldr x20, [x11, x8, LSL #0x3]\n"
- "add x11, x11, #0x8\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ "cmp x12, x16\n"
".inst 0xe05726a1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x21, x23, LSL #1]\n"
".inst 0xe0572289 // ld1h { za1h.h[x13, #1] }, p0/Z, [x20, x23, LSL #1]\n"
+ "add x11, x11, #0x8\n"
+ "addvl x22, x22, #2\n"
"add x13, x13, #0x2\n"
"blt 9b\n"
- "whilelt p9.h, x16, %x[width]\n"
- "whilelt p8.h, x16, %x[width]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"mov x20, #0x0\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
- ".inst 0x25307121 // psel p1.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25307120 // psel p0.s, p12.s/Z, p9.s[w12]\n"
- "add x20, x20, #0x2\n"
- ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a882cc // st1w { za3v.s[x12] }, p0/Z, [x22, x8, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
"addvl x22, x22, #2\n"
- "cmp x12, x7\n"
+ "add x20, x20, #0x2\n"
"blt 10b\n"
- "whilelt p8.h, x16, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
"12:" // K loop: Tails: Odd: Loop
- ".inst 0x25307121 // psel p1.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25307120 // psel p0.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a882c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x8, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
"addvl x22, x22, #2\n"
- "cmp x12, x7\n"
"blt 12b\n"
"13:" // K loop: End
"mov %x[out], x22\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
index 7b42b6fb93..2d6e1ce6c7 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
@@ -32,265 +32,265 @@ void interleave_block<2, 4, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
"mov x23, %x[width]\n"
- "mov x21, %x[width]\n"
- "cntb x20\n"
"incb x23\n"
- "sub x7, x20, #0x1\n"
- "cntw x8\n"
+ "mov x20, %x[width]\n"
+ "sub x17, x21, #0x1\n"
+ "cntw x16\n"
"sub x23, x23, #0x1\n"
- "ands x7, x21, x7\n"
- "udiv x23, x23, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x7, x7, x20, NE\n"
+ "ands x17, x20, x17\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x21, NE\n"
"lsl x22, %x[height], #0x1\n" // height * 2
- "lsl x21, x8, #0x1\n"
+ "lsl x21, x16, #0x1\n"
"sub x20, x23, #0x1\n"
- "add x7, x7, #0x3\n"
- "sub x17, x8, #0x2\n"
+ "add x17, x17, #0x3\n"
+ "sub x15, x16, #0x2\n"
"whilelt p9.b, XZR, x22\n"
"whilelt p8.b, x21, x22\n"
- "mov x16, #0x0\n"
+ "mov x14, #0x0\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- "cntw x9, ALL, MUL #2\n"
- "cntw x28, ALL, MUL #3\n"
- "ldr x27, [x11, #0x0]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ "cntw x28, ALL, MUL #2\n"
+ "cntw x27, ALL, MUL #3\n"
+ "ldr x26, [x10, #0x0]\n"
"lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x26, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
- "ldr x25, [x10, #0x0]\n"
- "lsr x7, x7, #0x2\n"
- "ptrue p11.s\n"
+ "and x25, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"ldr x24, [x11, #0x8]\n"
+ "lsr x17, x17, #0x2\n"
+ "ptrue p11.s\n"
+ "ldr x23, [x10, #0x8]\n"
"zip1 p10.b, p9.b, p8.b\n"
- "mov x23, %x[row_offset]\n"
- "ldr x21, [x10, #0x8]\n"
- "mov x22, %x[out]\n"
- "whilelt p9.b, x16, %x[width]\n"
- "whilelt p8.b, x16, %x[width]\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
"mov x12, #0x0\n"
- "cbz x17, 2f\n"
+ "cbz x15, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25246143 // psel p3.b, p8.b/Z, p10.b[w12]\n"
- ".inst 0x252c6142 // psel p2.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n"
".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
- ".inst 0xe0170f60 // ld1b { za0h.b[x12] }, p3/Z, [x27, x23]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0170b21 // ld1b { za0h.b[x12, #1] }, p2/Z, [x25, x23]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0170704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x23]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe01702a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x23]\n"
+ ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n"
"add x12, x12, #0x8\n"
- "ldr x21, [x10, #0x8]\n"
+ "cmp x12, x15, LSL #2\n"
+ "ldr x23, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
- "cmp x12, x17, LSL #2\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25246143 // psel p3.b, p8.b/Z, p10.b[w12]\n"
- ".inst 0x252c6142 // psel p2.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n"
".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0170f60 // ld1b { za0h.b[x12] }, p3/Z, [x27, x23]\n"
- "ldr x27, [x11, #0x0]\n"
- "incb x16\n"
- ".inst 0xe0170b21 // ld1b { za0h.b[x12, #1] }, p2/Z, [x25, x23]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0170704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x23]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n"
+ "ldr x26, [x10, #0x0]\n"
+ "incb x22\n"
+ "incb x14\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe01702a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x23]\n"
- "ldr x21, [x10, #0x8]\n"
+ "ldr x23, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
- "incb x23\n"
"cbz x20, 8f\n"
"mov x20, x20\n"
"3:" // K loop: Main loop
- "whilelt p8.b, x16, %x[width]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
- "cbz x17, 5f\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x25376143 // psel p3.b, p8.b/Z, p10.b[w15, #2]\n"
- ".inst 0x253f6142 // psel p2.b, p8.b/Z, p10.b[w15, #3]\n"
- ".inst 0x25776141 // psel p1.b, p8.b/Z, p10.b[w15, #6]\n"
- ".inst 0x257f6140 // psel p0.b, p8.b/Z, p10.b[w15, #7]\n"
- ".inst 0xe0176f62 // ld1b { za0h.b[x15, #2] }, p3/Z, [x27, x23]\n"
- ".inst 0x25266d23 // psel p3.b, p11.b/Z, p9.b[w14]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0176b23 // ld1b { za0h.b[x15, #3] }, p2/Z, [x25, x23]\n"
- ".inst 0x25266d22 // psel p2.b, p11.b/Z, p9.b[w14]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0176706 // ld1b { za0h.b[x15, #6] }, p1/Z, [x24, x23]\n"
- ".inst 0x252e6d21 // psel p1.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0162ae7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x23, x22]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe01762a7 // ld1b { za0h.b[x15, #7] }, p0/Z, [x21, x23]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0bfcec0 // st1w { za0v.s[x14] }, p3/Z, [x22, XZR, LSL #2]\n"
- "add x15, x15, #0x8\n"
- ".inst 0xe0a8cac4 // st1w { za1v.s[x14] }, p2/Z, [x22, x8, LSL #2]\n"
- ".inst 0xe0a9c6c1 // st1w { za0v.s[x14, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bcc2c5 // st1w { za1v.s[x14, #1] }, p0/Z, [x22, x28, LSL #2]\n"
- "add x14, x14, #0x2\n"
- "addvl x22, x22, #4\n"
- "cmp x14, x17\n"
+ "add x13, x13, #0x8\n"
+ ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
+ "addvl x21, x21, #4\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x25376143 // psel p3.b, p8.b/Z, p10.b[w15, #2]\n"
- ".inst 0x253f6142 // psel p2.b, p8.b/Z, p10.b[w15, #3]\n"
- ".inst 0x25776141 // psel p1.b, p8.b/Z, p10.b[w15, #6]\n"
- ".inst 0x257f6140 // psel p0.b, p8.b/Z, p10.b[w15, #7]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0176f62 // ld1b { za0h.b[x15, #2] }, p3/Z, [x27, x23]\n"
- ".inst 0x25266d23 // psel p3.b, p11.b/Z, p9.b[w14]\n"
- "ldr x27, [x11, #0x0]\n"
- "mov x13, #0x0\n"
- ".inst 0xe0176b23 // ld1b { za0h.b[x15, #3] }, p2/Z, [x25, x23]\n"
- ".inst 0x25266d22 // psel p2.b, p11.b/Z, p9.b[w14]\n"
- "ldr x25, [x10, #0x0]\n"
- "mov x12, #0x0\n"
- ".inst 0xe0176706 // ld1b { za0h.b[x15, #6] }, p1/Z, [x24, x23]\n"
- ".inst 0x252e6d21 // psel p1.b, p11.b/Z, p9.b[w14, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ ".inst 0x257d6141 // psel p1.b, p8.b/Z, p10.b[w13, #7]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e7 // ld1b { za0h.b[x13, #7] }, p1/Z, [x23, x22]\n"
"ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "incb x14\n"
+ ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe01762a7 // ld1b { za0h.b[x15, #7] }, p0/Z, [x21, x23]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
- "whilelt p9.b, x16, %x[width]\n"
- ".inst 0xe0bfcec0 // st1w { za0v.s[x14] }, p3/Z, [x22, XZR, LSL #2]\n"
- "incb x16\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0a8cac4 // st1w { za1v.s[x14] }, p2/Z, [x22, x8, LSL #2]\n"
- "incb x23\n"
- "whilelt p8.b, x16, %x[width]\n"
- ".inst 0xe0a9c6c1 // st1w { za0v.s[x14, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bcc2c5 // st1w { za1v.s[x14, #1] }, p0/Z, [x22, x28, LSL #2]\n"
- "addvl x22, x22, #4\n"
- "cbz x17, 7f\n"
+ ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "addvl x21, x21, #4\n"
+ "incb x22\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 7f\n"
"6:" // K loop: Main loop: Second: Loop
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n"
- ".inst 0xe0172f60 // ld1b { za0h.b[x13] }, p3/Z, [x27, x23]\n"
- ".inst 0x25246d23 // psel p3.b, p11.b/Z, p9.b[w12]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0172b21 // ld1b { za0h.b[x13, #1] }, p2/Z, [x25, x23]\n"
- ".inst 0x25246d22 // psel p2.b, p11.b/Z, p9.b[w12]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0172704 // ld1b { za0h.b[x13, #4] }, p1/Z, [x24, x23]\n"
- ".inst 0x252c6d21 // psel p1.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0162ae5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x23, x22]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe01722a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x23]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0bf8ec8 // st1w { za2v.s[x12] }, p3/Z, [x22, XZR, LSL #2]\n"
"add x13, x13, #0x8\n"
- ".inst 0xe0a88acc // st1w { za3v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
- ".inst 0xe0a986c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bc82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x28, LSL #2]\n"
+ ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
"add x12, x12, #0x2\n"
- "addvl x22, x22, #4\n"
- "cmp x12, x17\n"
+ "cmp x12, x15\n"
+ "addvl x21, x21, #4\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0172f60 // ld1b { za0h.b[x13] }, p3/Z, [x27, x23]\n"
- ".inst 0x25246d23 // psel p3.b, p11.b/Z, p9.b[w12]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0172b21 // ld1b { za0h.b[x13, #1] }, p2/Z, [x25, x23]\n"
- ".inst 0x25246d22 // psel p2.b, p11.b/Z, p9.b[w12]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0172704 // ld1b { za0h.b[x13, #4] }, p1/Z, [x24, x23]\n"
- ".inst 0x252c6d21 // psel p1.b, p11.b/Z, p9.b[w12, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ ".inst 0x256d6141 // psel p1.b, p8.b/Z, p10.b[w13, #5]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e5 // ld1b { za0h.b[x13, #5] }, p1/Z, [x23, x22]\n"
"ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe01722a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x23]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
- "whilelt p9.b, x16, %x[width]\n"
- ".inst 0xe0bf8ec8 // st1w { za2v.s[x12] }, p3/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ "whilelt p9.b, x14, %x[width]\n"
"subs x20, x20, #0x1\n"
+ ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0a88acc // st1w { za3v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
- "incb x16\n"
- "incb x23\n"
- ".inst 0xe0a986c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bc82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x28, LSL #2]\n"
- "addvl x22, x22, #4\n"
+ ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "addvl x21, x21, #4\n"
+ "incb x14\n"
+ "incb x22\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x26, 11f\n"
+ "cbnz x25, 11f\n"
"mov x11, %x[in]\n"
- "whilelt p8.b, x16, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- ".inst 0xe0bf8ec0 // st1w { za0v.s[x12] }, p3/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a88ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ "ldr x20, [x11, #0x0]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
"add x12, x12, #0x1\n"
- "addvl x22, x22, #2\n"
- "ldr x21, [x11, #0x0]\n"
- "cmp x12, x8\n"
- "ldr x20, [x11, x8, LSL #0x3]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe0162283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x22]\n"
+ "cmp x12, x16\n"
"add x11, x11, #0x8\n"
- ".inst 0xe01726a2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x21, x23]\n"
- ".inst 0xe0172283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x23]\n"
+ "addvl x21, x21, #2\n"
"add x13, x13, #0x4\n"
"blt 9b\n"
- "whilelt p9.b, x16, %x[width]\n"
- "whilelt p8.b, x16, %x[width]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
"mov x20, #0x0\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- "add x20, x20, #0x4\n"
- ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a882cc // st1w { za3v.s[x12] }, p0/Z, [x22, x8, LSL #2]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x22, x22, #2\n"
- "cmp x12, x7\n"
+ "cmp x12, x17\n"
+ "addvl x21, x21, #2\n"
+ "add x20, x20, #0x4\n"
"blt 10b\n"
- "whilelt p8.b, x16, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
"12:" // K loop: Tails: Odd: Loop
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a882c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x8, LSL #2]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x22, x22, #2\n"
- "cmp x12, x7\n"
+ "cmp x12, x17\n"
+ "addvl x21, x21, #2\n"
"blt 12b\n"
"13:" // K loop: End
- "mov %x[out], x22\n"
+ "mov %x[out], x21\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
index 6930bf4056..27b9bc3806 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
@@ -32,321 +32,321 @@ void interleave_block<2, 4, VLType::SME, true>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
"mov x23, %x[width]\n"
- "mov x21, %x[width]\n"
- "cntb x20\n"
- "incb x23\n"
"mov z20.b, #0x1\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
"mov z19.s, #0x0\n"
- "sub x7, x20, #0x1\n"
- "cntw x8\n"
"mov z18.s, #0x0\n"
+ "sub x17, x21, #0x1\n"
+ "cntw x16\n"
"sub x23, x23, #0x1\n"
- "ands x7, x21, x7\n"
- "udiv x23, x23, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x7, x7, x20, NE\n"
+ "ands x17, x20, x17\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x21, NE\n"
"lsl x22, %x[height], #0x1\n" // height * 2
- "lsl x21, x8, #0x1\n"
+ "lsl x21, x16, #0x1\n"
"sub x20, x23, #0x1\n"
- "add x7, x7, #0x3\n"
+ "add x17, x17, #0x3\n"
"whilelt p9.b, XZR, x22\n"
"whilelt p8.b, x21, x22\n"
- "mov x17, #0x0\n"
- "cntw x16, ALL, MUL #2\n"
+ "mov x15, #0x0\n"
+ "cntw x14, ALL, MUL #2\n"
"cntw x11, ALL, MUL #3\n"
"ptrue p4.b\n"
"lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
"and x10, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
- "lsr x7, x7, #0x2\n"
- "sub x9, x8, #0x2\n"
+ "lsr x17, x17, #0x2\n"
+ "sub x9, x16, #0x2\n"
"ptrue p11.s\n"
"zip1 p10.b, p9.b, p8.b\n"
"mov x28, %x[row_offset]\n"
"mov x27, %x[out]\n"
- "whilelt p9.b, x17, %x[width]\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"cbnz %x[first], 1f\n"
"addvl x27, x27, #-2\n"
"ld1w { z19.s }, p4/Z, [x27]\n"
"ld1w { z18.s }, p4/Z, [x27, #1, MUL VL]\n"
"1:" // K loop: Load row sums: End
"mov x26, %x[in]\n"
- "add x25, %x[in], x8, LSL #3\n"
+ "add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
- "mov x12, #0x0\n"
"ldr x23, [x25, #0x0]\n"
+ "mov x12, #0x0\n"
"ldr x22, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
"ldr x21, [x25, #0x8]\n"
"add x25, x25, #0x10\n"
"cbz x9, 3f\n"
"2:" // K loop: Charge: Loop
- ".inst 0x25246143 // psel p3.b, p8.b/Z, p10.b[w12]\n"
- ".inst 0x252c6142 // psel p2.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n"
".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
- ".inst 0xe01c0f00 // ld1b { za0h.b[x12] }, p3/Z, [x24, x28]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe01c0ae1 // ld1b { za0h.b[x12, #1] }, p2/Z, [x23, x28]\n"
"ldr x23, [x25, #0x0]\n"
".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n"
"ldr x22, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n"
"add x12, x12, #0x8\n"
+ "cmp x12, x9, LSL #2\n"
"ldr x21, [x25, #0x8]\n"
"add x25, x25, #0x10\n"
- "cmp x12, x9, LSL #2\n"
"blt 2b\n"
"3:" // K loop: Charge: End
- ".inst 0x25246143 // psel p3.b, p8.b/Z, p10.b[w12]\n"
- ".inst 0x252c6142 // psel p2.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n"
".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n"
"mov x26, %x[in]\n"
- "add x25, %x[in], x8, LSL #3\n"
- ".inst 0xe01c0f00 // ld1b { za0h.b[x12] }, p3/Z, [x24, x28]\n"
+ "add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
- "incb x17\n"
- ".inst 0xe01c0ae1 // ld1b { za0h.b[x12, #1] }, p2/Z, [x23, x28]\n"
+ ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n"
"ldr x23, [x25, #0x0]\n"
- ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n"
+ "incb x28\n"
+ "incb x15\n"
"ldr x22, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
"add x25, x25, #0x10\n"
- "incb x28\n"
"cbz x20, 9f\n"
"mov x20, x20\n"
"4:" // K loop: Main loop
- "whilelt p8.b, x17, %x[width]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
"cbz x9, 6f\n"
"5:" // K loop: Main loop: First: Loop
- ".inst 0x25376143 // psel p3.b, p8.b/Z, p10.b[w15, #2]\n"
- ".inst 0x253f6142 // psel p2.b, p8.b/Z, p10.b[w15, #3]\n"
- ".inst 0x25776141 // psel p1.b, p8.b/Z, p10.b[w15, #6]\n"
- ".inst 0x257f6140 // psel p0.b, p8.b/Z, p10.b[w15, #7]\n"
- ".inst 0xe01c6f02 // ld1b { za0h.b[x15, #2] }, p3/Z, [x24, x28]\n"
- ".inst 0x25266d23 // psel p3.b, p11.b/Z, p9.b[w14]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
"ldr x24, [x26, #0x0]\n"
- ".inst 0xe01c6ae3 // ld1b { za0h.b[x15, #3] }, p2/Z, [x23, x28]\n"
- ".inst 0x25266d22 // psel p2.b, p11.b/Z, p9.b[w14]\n"
+ ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n"
"ldr x23, [x25, #0x0]\n"
- ".inst 0xe01c66c6 // ld1b { za0h.b[x15, #6] }, p1/Z, [x22, x28]\n"
- ".inst 0x252e6d21 // psel p1.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0xe01c22c6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x22, x28]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- ".inst 0xe01c62a7 // ld1b { za0h.b[x15, #7] }, p0/Z, [x21, x28]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
- "add x25, x25, #0x10\n"
- ".inst 0xe0bfcf60 // st1w { za0v.s[x14] }, p3/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc082d011 // mova z17.s, p4/M, za0v.s[x14]\n"
- "add x15, x15, #0x8\n"
- ".inst 0xc082d090 // mova z16.s, p4/M, za1v.s[x14]\n"
+ ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
"sdot z19.s, z17.b, z20.b\n"
- ".inst 0xe0a8cb64 // st1w { za1v.s[x14] }, p2/Z, [x27, x8, LSL #2]\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"sdot z18.s, z16.b, z20.b\n"
- ".inst 0xe0b0c761 // st1w { za0v.s[x14, #1] }, p1/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc082d031 // mova z17.s, p4/M, za0v.s[x14, #1]\n"
- ".inst 0xe0abc365 // st1w { za1v.s[x14, #1] }, p0/Z, [x27, x11, LSL #2]\n"
- ".inst 0xc082d0b0 // mova z16.s, p4/M, za1v.s[x14, #1]\n"
- "add x14, x14, #0x2\n"
- "addvl x27, x27, #4\n"
- "cmp x14, x9\n"
+ ".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x9\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z19.s, z17.b, z20.b\n"
"sdot z18.s, z16.b, z20.b\n"
+ "addvl x27, x27, #4\n"
+ "add x13, x13, #0x8\n"
"blt 5b\n"
"6:" // K loop: Main loop: First: Tail
- ".inst 0x25376143 // psel p3.b, p8.b/Z, p10.b[w15, #2]\n"
- ".inst 0x253f6142 // psel p2.b, p8.b/Z, p10.b[w15, #3]\n"
- ".inst 0x25776141 // psel p1.b, p8.b/Z, p10.b[w15, #6]\n"
- ".inst 0x257f6140 // psel p0.b, p8.b/Z, p10.b[w15, #7]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6140 // psel p0.b, p8.b/Z, p10.b[w13, #7]\n"
+ ".inst 0xe01c26c6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x22, x28]\n"
"mov x26, %x[in]\n"
- "add x25, %x[in], x8, LSL #3\n"
- ".inst 0xe01c6f02 // ld1b { za0h.b[x15, #2] }, p3/Z, [x24, x28]\n"
- ".inst 0x25266d23 // psel p3.b, p11.b/Z, p9.b[w14]\n"
+ "add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
- "mov x13, #0x0\n"
- ".inst 0xe01c6ae3 // ld1b { za0h.b[x15, #3] }, p2/Z, [x23, x28]\n"
- ".inst 0x25266d22 // psel p2.b, p11.b/Z, p9.b[w14]\n"
+ ".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n"
+ ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ "sdot z18.s, z16.b, z20.b\n"
"ldr x23, [x25, #0x0]\n"
- "mov x12, #0x0\n"
- ".inst 0xe01c66c6 // ld1b { za0h.b[x15, #6] }, p1/Z, [x22, x28]\n"
- ".inst 0x252e6d21 // psel p1.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- ".inst 0xe01c62a7 // ld1b { za0h.b[x15, #7] }, p0/Z, [x21, x28]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"ldr x21, [x25, #0x8]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
- "whilelt p9.b, x17, %x[width]\n"
- ".inst 0xc082d011 // mova z17.s, p4/M, za0v.s[x14]\n"
- ".inst 0xe0bfcf60 // st1w { za0v.s[x14] }, p3/Z, [x27, XZR, LSL #2]\n"
- "incb x17\n"
- "add x25, x25, #0x10\n"
- ".inst 0xc082d090 // mova z16.s, p4/M, za1v.s[x14]\n"
+ ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ ".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
+ "incb x15\n"
+ "add x26, x26, #0x10\n"
"sdot z19.s, z17.b, z20.b\n"
- ".inst 0xe0a8cb64 // st1w { za1v.s[x14] }, p2/Z, [x27, x8, LSL #2]\n"
- "incb x28\n"
+ ".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
+ "add x25, x25, #0x10\n"
"sdot z18.s, z16.b, z20.b\n"
- ".inst 0xe0b0c761 // st1w { za0v.s[x14, #1] }, p1/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc082d031 // mova z17.s, p4/M, za0v.s[x14, #1]\n"
- "whilelt p8.b, x17, %x[width]\n"
- ".inst 0xc082d0b0 // mova z16.s, p4/M, za1v.s[x14, #1]\n"
- ".inst 0xe0abc365 // st1w { za1v.s[x14, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ "incb x28\n"
+ ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
"addvl x27, x27, #4\n"
- "sdot z19.s, z17.b, z20.b\n"
- "sdot z18.s, z16.b, z20.b\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
"cbz x9, 8f\n"
"7:" // K loop: Main loop: Second: Loop
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n"
- ".inst 0xe01c2f00 // ld1b { za0h.b[x13] }, p3/Z, [x24, x28]\n"
- ".inst 0x25246d23 // psel p3.b, p11.b/Z, p9.b[w12]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
"ldr x24, [x26, #0x0]\n"
- ".inst 0xe01c2ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x28]\n"
- ".inst 0x25246d22 // psel p2.b, p11.b/Z, p9.b[w12]\n"
+ ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n"
"ldr x23, [x25, #0x0]\n"
- ".inst 0xe01c26c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x28]\n"
- ".inst 0x252c6d21 // psel p1.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0xe01c22c4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x22, x28]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
- "add x25, x25, #0x10\n"
- ".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
- "add x13, x13, #0x8\n"
".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
"sdot z19.s, z17.b, z20.b\n"
- ".inst 0xe0a88b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x8, LSL #2]\n"
+ ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"sdot z18.s, z16.b, z20.b\n"
- ".inst 0xe0b08769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x16, LSL #2]\n"
+ ".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
"add x12, x12, #0x2\n"
- "addvl x27, x27, #4\n"
"cmp x12, x9\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z19.s, z17.b, z20.b\n"
"sdot z18.s, z16.b, z20.b\n"
+ "addvl x27, x27, #4\n"
+ "add x13, x13, #0x8\n"
"blt 7b\n"
"8:" // K loop: Main loop: Second: Tail
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n"
".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n"
+ ".inst 0xe01c26c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x28]\n"
"mov x26, %x[in]\n"
- "add x25, %x[in], x8, LSL #3\n"
- ".inst 0xe01c2f00 // ld1b { za0h.b[x13] }, p3/Z, [x24, x28]\n"
- ".inst 0x25246d23 // psel p3.b, p11.b/Z, p9.b[w12]\n"
+ "add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
- ".inst 0xe01c2ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x28]\n"
- ".inst 0x25246d22 // psel p2.b, p11.b/Z, p9.b[w12]\n"
+ ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
+ ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
+ ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
+ "sdot z18.s, z16.b, z20.b\n"
"ldr x23, [x25, #0x0]\n"
- ".inst 0xe01c26c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x28]\n"
- ".inst 0x252c6d21 // psel p1.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"ldr x21, [x25, #0x8]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
- "whilelt p9.b, x17, %x[width]\n"
- ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ ".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
"subs x20, x20, #0x1\n"
- "add x25, x25, #0x10\n"
- ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
+ "add x26, x26, #0x10\n"
"sdot z19.s, z17.b, z20.b\n"
- ".inst 0xe0a88b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x8, LSL #2]\n"
- "incb x17\n"
+ ".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
+ "add x25, x25, #0x10\n"
"sdot z18.s, z16.b, z20.b\n"
- ".inst 0xe0b08769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
- "incb x28\n"
- ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
+ "incb x15\n"
".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
"addvl x27, x27, #4\n"
- "sdot z19.s, z17.b, z20.b\n"
- "sdot z18.s, z16.b, z20.b\n"
+ "incb x28\n"
"bgt 4b\n"
"9:" // K loop: Tails
"cbnz x10, 12f\n"
"mov x26, %x[in]\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: First
- ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ "ldr x21, [x26, #0x0]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ "ldr x20, [x26, x16, LSL #0x3]\n"
+ ".inst 0xe01c22a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ "cmp x12, x16\n"
"sdot z19.s, z17.b, z20.b\n"
"sdot z18.s, z16.b, z20.b\n"
- ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xe0a88b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x8, LSL #2]\n"
- "add x12, x12, #0x1\n"
- "addvl x27, x27, #2\n"
- "ldr x21, [x26, #0x0]\n"
- "cmp x12, x8\n"
- "ldr x20, [x26, x8, LSL #0x3]\n"
- "add x26, x26, #0x8\n"
- ".inst 0xe01c26a2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x21, x28]\n"
".inst 0xe01c2283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n"
+ "add x26, x26, #0x8\n"
+ "addvl x27, x27, #2\n"
"add x13, x13, #0x4\n"
"blt 10b\n"
- "whilelt p9.b, x17, %x[width]\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"mov x20, #0x0\n"
"mov x12, #0x0\n"
"11:" // K loop: Tails: Even: Second
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
"sdot z19.s, z17.b, z20.b\n"
- "add x20, x20, #0x4\n"
"sdot z18.s, z16.b, z20.b\n"
- ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xe0a8836c // st1w { za3v.s[x12] }, p0/Z, [x27, x8, LSL #2]\n"
- "add x12, x12, #0x1\n"
"addvl x27, x27, #2\n"
- "cmp x12, x7\n"
+ "add x20, x20, #0x4\n"
"blt 11b\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"b 14f\n"
"12:" // K loop: Tails: Odd
"mov x12, #0x0\n"
"13:" // K loop: Tails: Odd: Loop
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
"sdot z19.s, z17.b, z20.b\n"
"sdot z18.s, z16.b, z20.b\n"
- ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xe0a88364 // st1w { za1v.s[x12] }, p0/Z, [x27, x8, LSL #2]\n"
- "add x12, x12, #0x1\n"
"addvl x27, x27, #2\n"
- "cmp x12, x7\n"
"blt 13b\n"
"14:" // K loop: End
"st1w { z19.s }, p4, [x27]\n"
"st1w { z18.s }, p4, [x27, #1, MUL VL]\n"
"addvl x27, x27, #2\n"
- ".inst 0xd503467f // SMSTOP\n"
"mov %x[out], x27\n"
+ ".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
index 9ce93ed95c..3f3863720a 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
@@ -32,265 +32,265 @@ void interleave_block<2, 4, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
"mov x23, %x[width]\n"
- "mov x21, %x[width]\n"
- "cntb x20\n"
"incb x23\n"
- "sub x7, x20, #0x1\n"
- "cntw x8\n"
+ "mov x20, %x[width]\n"
+ "sub x17, x21, #0x1\n"
+ "cntw x16\n"
"sub x23, x23, #0x1\n"
- "ands x7, x21, x7\n"
- "udiv x23, x23, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x7, x7, x20, NE\n"
+ "ands x17, x20, x17\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x21, NE\n"
"lsl x22, %x[height], #0x1\n" // height * 2
- "lsl x21, x8, #0x1\n"
+ "lsl x21, x16, #0x1\n"
"sub x20, x23, #0x1\n"
- "add x7, x7, #0x3\n"
- "sub x17, x8, #0x2\n"
+ "add x17, x17, #0x3\n"
+ "sub x15, x16, #0x2\n"
"whilelt p9.b, XZR, x22\n"
"whilelt p8.b, x21, x22\n"
- "mov x16, #0x0\n"
+ "mov x14, #0x0\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- "cntw x9, ALL, MUL #2\n"
- "cntw x28, ALL, MUL #3\n"
- "ldr x27, [x11, #0x0]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ "cntw x28, ALL, MUL #2\n"
+ "cntw x27, ALL, MUL #3\n"
+ "ldr x26, [x10, #0x0]\n"
"lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x26, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
- "ldr x25, [x10, #0x0]\n"
- "lsr x7, x7, #0x2\n"
- "ptrue p11.s\n"
+ "and x25, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"ldr x24, [x11, #0x8]\n"
+ "lsr x17, x17, #0x2\n"
+ "ptrue p11.s\n"
+ "ldr x23, [x10, #0x8]\n"
"zip1 p10.b, p9.b, p8.b\n"
- "mov x23, %x[row_offset]\n"
- "ldr x21, [x10, #0x8]\n"
- "mov x22, %x[out]\n"
- "whilelt p9.b, x16, %x[width]\n"
- "whilelt p8.b, x16, %x[width]\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
"mov x12, #0x0\n"
- "cbz x17, 2f\n"
+ "cbz x15, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25246143 // psel p3.b, p8.b/Z, p10.b[w12]\n"
- ".inst 0x252c6142 // psel p2.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n"
".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
- ".inst 0xe0170f60 // ld1b { za0h.b[x12] }, p3/Z, [x27, x23]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0170b21 // ld1b { za0h.b[x12, #1] }, p2/Z, [x25, x23]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0170704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x23]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe01702a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x23]\n"
+ ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n"
"add x12, x12, #0x8\n"
- "ldr x21, [x10, #0x8]\n"
+ "cmp x12, x15, LSL #2\n"
+ "ldr x23, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
- "cmp x12, x17, LSL #2\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25246143 // psel p3.b, p8.b/Z, p10.b[w12]\n"
- ".inst 0x252c6142 // psel p2.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n"
".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0170f60 // ld1b { za0h.b[x12] }, p3/Z, [x27, x23]\n"
- "ldr x27, [x11, #0x0]\n"
- "incb x16\n"
- ".inst 0xe0170b21 // ld1b { za0h.b[x12, #1] }, p2/Z, [x25, x23]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0170704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x23]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n"
+ "ldr x26, [x10, #0x0]\n"
+ "incb x22\n"
+ "incb x14\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe01702a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x23]\n"
- "ldr x21, [x10, #0x8]\n"
+ "ldr x23, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
- "incb x23\n"
"cbz x20, 8f\n"
"mov x20, x20\n"
"3:" // K loop: Main loop
- "whilelt p8.b, x16, %x[width]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
- "cbz x17, 5f\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x25376143 // psel p3.b, p8.b/Z, p10.b[w15, #2]\n"
- ".inst 0x253f6142 // psel p2.b, p8.b/Z, p10.b[w15, #3]\n"
- ".inst 0x25776141 // psel p1.b, p8.b/Z, p10.b[w15, #6]\n"
- ".inst 0x257f6140 // psel p0.b, p8.b/Z, p10.b[w15, #7]\n"
- ".inst 0xe0176f62 // ld1b { za0h.b[x15, #2] }, p3/Z, [x27, x23]\n"
- ".inst 0x25266d23 // psel p3.b, p11.b/Z, p9.b[w14]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0176b23 // ld1b { za0h.b[x15, #3] }, p2/Z, [x25, x23]\n"
- ".inst 0x25266d22 // psel p2.b, p11.b/Z, p9.b[w14]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0176706 // ld1b { za0h.b[x15, #6] }, p1/Z, [x24, x23]\n"
- ".inst 0x252e6d21 // psel p1.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0162ae7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x23, x22]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe01762a7 // ld1b { za0h.b[x15, #7] }, p0/Z, [x21, x23]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0bfcec0 // st1w { za0v.s[x14] }, p3/Z, [x22, XZR, LSL #2]\n"
- "add x15, x15, #0x8\n"
- ".inst 0xe0a8cac4 // st1w { za1v.s[x14] }, p2/Z, [x22, x8, LSL #2]\n"
- ".inst 0xe0a9c6c1 // st1w { za0v.s[x14, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bcc2c5 // st1w { za1v.s[x14, #1] }, p0/Z, [x22, x28, LSL #2]\n"
- "add x14, x14, #0x2\n"
- "addvl x22, x22, #4\n"
- "cmp x14, x17\n"
+ "add x13, x13, #0x8\n"
+ ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
+ "addvl x21, x21, #4\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x25376143 // psel p3.b, p8.b/Z, p10.b[w15, #2]\n"
- ".inst 0x253f6142 // psel p2.b, p8.b/Z, p10.b[w15, #3]\n"
- ".inst 0x25776141 // psel p1.b, p8.b/Z, p10.b[w15, #6]\n"
- ".inst 0x257f6140 // psel p0.b, p8.b/Z, p10.b[w15, #7]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0176f62 // ld1b { za0h.b[x15, #2] }, p3/Z, [x27, x23]\n"
- ".inst 0x25266d23 // psel p3.b, p11.b/Z, p9.b[w14]\n"
- "ldr x27, [x11, #0x0]\n"
- "mov x13, #0x0\n"
- ".inst 0xe0176b23 // ld1b { za0h.b[x15, #3] }, p2/Z, [x25, x23]\n"
- ".inst 0x25266d22 // psel p2.b, p11.b/Z, p9.b[w14]\n"
- "ldr x25, [x10, #0x0]\n"
- "mov x12, #0x0\n"
- ".inst 0xe0176706 // ld1b { za0h.b[x15, #6] }, p1/Z, [x24, x23]\n"
- ".inst 0x252e6d21 // psel p1.b, p11.b/Z, p9.b[w14, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ ".inst 0x257d6141 // psel p1.b, p8.b/Z, p10.b[w13, #7]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e7 // ld1b { za0h.b[x13, #7] }, p1/Z, [x23, x22]\n"
"ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "incb x14\n"
+ ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe01762a7 // ld1b { za0h.b[x15, #7] }, p0/Z, [x21, x23]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
- "whilelt p9.b, x16, %x[width]\n"
- ".inst 0xe0bfcec0 // st1w { za0v.s[x14] }, p3/Z, [x22, XZR, LSL #2]\n"
- "incb x16\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0a8cac4 // st1w { za1v.s[x14] }, p2/Z, [x22, x8, LSL #2]\n"
- "incb x23\n"
- "whilelt p8.b, x16, %x[width]\n"
- ".inst 0xe0a9c6c1 // st1w { za0v.s[x14, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bcc2c5 // st1w { za1v.s[x14, #1] }, p0/Z, [x22, x28, LSL #2]\n"
- "addvl x22, x22, #4\n"
- "cbz x17, 7f\n"
+ ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "addvl x21, x21, #4\n"
+ "incb x22\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 7f\n"
"6:" // K loop: Main loop: Second: Loop
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n"
- ".inst 0xe0172f60 // ld1b { za0h.b[x13] }, p3/Z, [x27, x23]\n"
- ".inst 0x25246d23 // psel p3.b, p11.b/Z, p9.b[w12]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0172b21 // ld1b { za0h.b[x13, #1] }, p2/Z, [x25, x23]\n"
- ".inst 0x25246d22 // psel p2.b, p11.b/Z, p9.b[w12]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0172704 // ld1b { za0h.b[x13, #4] }, p1/Z, [x24, x23]\n"
- ".inst 0x252c6d21 // psel p1.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0162ae5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x23, x22]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe01722a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x23]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0bf8ec8 // st1w { za2v.s[x12] }, p3/Z, [x22, XZR, LSL #2]\n"
"add x13, x13, #0x8\n"
- ".inst 0xe0a88acc // st1w { za3v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
- ".inst 0xe0a986c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bc82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x28, LSL #2]\n"
+ ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
"add x12, x12, #0x2\n"
- "addvl x22, x22, #4\n"
- "cmp x12, x17\n"
+ "cmp x12, x15\n"
+ "addvl x21, x21, #4\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x8, LSL #3\n"
- ".inst 0xe0172f60 // ld1b { za0h.b[x13] }, p3/Z, [x27, x23]\n"
- ".inst 0x25246d23 // psel p3.b, p11.b/Z, p9.b[w12]\n"
- "ldr x27, [x11, #0x0]\n"
- ".inst 0xe0172b21 // ld1b { za0h.b[x13, #1] }, p2/Z, [x25, x23]\n"
- ".inst 0x25246d22 // psel p2.b, p11.b/Z, p9.b[w12]\n"
- "ldr x25, [x10, #0x0]\n"
- ".inst 0xe0172704 // ld1b { za0h.b[x13, #4] }, p1/Z, [x24, x23]\n"
- ".inst 0x252c6d21 // psel p1.b, p11.b/Z, p9.b[w12, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ ".inst 0x256d6141 // psel p1.b, p8.b/Z, p10.b[w13, #5]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e5 // ld1b { za0h.b[x13, #5] }, p1/Z, [x23, x22]\n"
"ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe01722a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x23]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
- "whilelt p9.b, x16, %x[width]\n"
- ".inst 0xe0bf8ec8 // st1w { za2v.s[x12] }, p3/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ "whilelt p9.b, x14, %x[width]\n"
"subs x20, x20, #0x1\n"
+ ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0a88acc // st1w { za3v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
- "incb x16\n"
- "incb x23\n"
- ".inst 0xe0a986c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bc82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x28, LSL #2]\n"
- "addvl x22, x22, #4\n"
+ ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "addvl x21, x21, #4\n"
+ "incb x14\n"
+ "incb x22\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x26, 11f\n"
+ "cbnz x25, 11f\n"
"mov x11, %x[in]\n"
- "whilelt p8.b, x16, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- ".inst 0xe0bf8ec0 // st1w { za0v.s[x12] }, p3/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a88ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x8, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ "ldr x20, [x11, #0x0]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
"add x12, x12, #0x1\n"
- "addvl x22, x22, #2\n"
- "ldr x21, [x11, #0x0]\n"
- "cmp x12, x8\n"
- "ldr x20, [x11, x8, LSL #0x3]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe0162283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x22]\n"
+ "cmp x12, x16\n"
"add x11, x11, #0x8\n"
- ".inst 0xe01726a2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x21, x23]\n"
- ".inst 0xe0172283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x23]\n"
+ "addvl x21, x21, #2\n"
"add x13, x13, #0x4\n"
"blt 9b\n"
- "whilelt p9.b, x16, %x[width]\n"
- "whilelt p8.b, x16, %x[width]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
"mov x20, #0x0\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- "add x20, x20, #0x4\n"
- ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a882cc // st1w { za3v.s[x12] }, p0/Z, [x22, x8, LSL #2]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x22, x22, #2\n"
- "cmp x12, x7\n"
+ "cmp x12, x17\n"
+ "addvl x21, x21, #2\n"
+ "add x20, x20, #0x4\n"
"blt 10b\n"
- "whilelt p8.b, x16, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
"12:" // K loop: Tails: Odd: Loop
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0a882c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x8, LSL #2]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x22, x22, #2\n"
- "cmp x12, x7\n"
+ "cmp x12, x17\n"
+ "addvl x21, x21, #2\n"
"blt 12b\n"
"13:" // K loop: End
- "mov %x[out], x22\n"
+ "mov %x[out], x21\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
index 915381334e..c8657fad1c 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
@@ -32,321 +32,321 @@ void interleave_block<2, 4, VLType::SME, true>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
"mov x23, %x[width]\n"
- "mov x21, %x[width]\n"
- "cntb x20\n"
- "incb x23\n"
"mov z20.b, #0x1\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
"mov z19.s, #0x0\n"
- "sub x7, x20, #0x1\n"
- "cntw x8\n"
"mov z18.s, #0x0\n"
+ "sub x17, x21, #0x1\n"
+ "cntw x16\n"
"sub x23, x23, #0x1\n"
- "ands x7, x21, x7\n"
- "udiv x23, x23, x20\n" // n_passes = ceildiv(width, VL<T>)
- "csel x7, x7, x20, NE\n"
+ "ands x17, x20, x17\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x21, NE\n"
"lsl x22, %x[height], #0x1\n" // height * 2
- "lsl x21, x8, #0x1\n"
+ "lsl x21, x16, #0x1\n"
"sub x20, x23, #0x1\n"
- "add x7, x7, #0x3\n"
+ "add x17, x17, #0x3\n"
"whilelt p9.b, XZR, x22\n"
"whilelt p8.b, x21, x22\n"
- "mov x17, #0x0\n"
- "cntw x16, ALL, MUL #2\n"
+ "mov x15, #0x0\n"
+ "cntw x14, ALL, MUL #2\n"
"cntw x11, ALL, MUL #3\n"
"ptrue p4.b\n"
"lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
"and x10, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
- "lsr x7, x7, #0x2\n"
- "sub x9, x8, #0x2\n"
+ "lsr x17, x17, #0x2\n"
+ "sub x9, x16, #0x2\n"
"ptrue p11.s\n"
"zip1 p10.b, p9.b, p8.b\n"
"mov x28, %x[row_offset]\n"
"mov x27, %x[out]\n"
- "whilelt p9.b, x17, %x[width]\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"cbnz %x[first], 1f\n"
"addvl x27, x27, #-2\n"
"ld1w { z19.s }, p4/Z, [x27]\n"
"ld1w { z18.s }, p4/Z, [x27, #1, MUL VL]\n"
"1:" // K loop: Load row sums: End
"mov x26, %x[in]\n"
- "add x25, %x[in], x8, LSL #3\n"
+ "add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
- "mov x12, #0x0\n"
"ldr x23, [x25, #0x0]\n"
+ "mov x12, #0x0\n"
"ldr x22, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
"ldr x21, [x25, #0x8]\n"
"add x25, x25, #0x10\n"
"cbz x9, 3f\n"
"2:" // K loop: Charge: Loop
- ".inst 0x25246143 // psel p3.b, p8.b/Z, p10.b[w12]\n"
- ".inst 0x252c6142 // psel p2.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n"
".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
- ".inst 0xe01c0f00 // ld1b { za0h.b[x12] }, p3/Z, [x24, x28]\n"
- "ldr x24, [x26, #0x0]\n"
- ".inst 0xe01c0ae1 // ld1b { za0h.b[x12, #1] }, p2/Z, [x23, x28]\n"
"ldr x23, [x25, #0x0]\n"
".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n"
"ldr x22, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n"
"add x12, x12, #0x8\n"
+ "cmp x12, x9, LSL #2\n"
"ldr x21, [x25, #0x8]\n"
"add x25, x25, #0x10\n"
- "cmp x12, x9, LSL #2\n"
"blt 2b\n"
"3:" // K loop: Charge: End
- ".inst 0x25246143 // psel p3.b, p8.b/Z, p10.b[w12]\n"
- ".inst 0x252c6142 // psel p2.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n"
".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n"
"mov x26, %x[in]\n"
- "add x25, %x[in], x8, LSL #3\n"
- ".inst 0xe01c0f00 // ld1b { za0h.b[x12] }, p3/Z, [x24, x28]\n"
+ "add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
- "incb x17\n"
- ".inst 0xe01c0ae1 // ld1b { za0h.b[x12, #1] }, p2/Z, [x23, x28]\n"
+ ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n"
"ldr x23, [x25, #0x0]\n"
- ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n"
+ "incb x28\n"
+ "incb x15\n"
"ldr x22, [x26, #0x8]\n"
"add x26, x26, #0x10\n"
- ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
"add x25, x25, #0x10\n"
- "incb x28\n"
"cbz x20, 9f\n"
"mov x20, x20\n"
"4:" // K loop: Main loop
- "whilelt p8.b, x17, %x[width]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
"cbz x9, 6f\n"
"5:" // K loop: Main loop: First: Loop
- ".inst 0x25376143 // psel p3.b, p8.b/Z, p10.b[w15, #2]\n"
- ".inst 0x253f6142 // psel p2.b, p8.b/Z, p10.b[w15, #3]\n"
- ".inst 0x25776141 // psel p1.b, p8.b/Z, p10.b[w15, #6]\n"
- ".inst 0x257f6140 // psel p0.b, p8.b/Z, p10.b[w15, #7]\n"
- ".inst 0xe01c6f02 // ld1b { za0h.b[x15, #2] }, p3/Z, [x24, x28]\n"
- ".inst 0x25266d23 // psel p3.b, p11.b/Z, p9.b[w14]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
"ldr x24, [x26, #0x0]\n"
- ".inst 0xe01c6ae3 // ld1b { za0h.b[x15, #3] }, p2/Z, [x23, x28]\n"
- ".inst 0x25266d22 // psel p2.b, p11.b/Z, p9.b[w14]\n"
+ ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n"
"ldr x23, [x25, #0x0]\n"
- ".inst 0xe01c66c6 // ld1b { za0h.b[x15, #6] }, p1/Z, [x22, x28]\n"
- ".inst 0x252e6d21 // psel p1.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0xe01c22c6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x22, x28]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- ".inst 0xe01c62a7 // ld1b { za0h.b[x15, #7] }, p0/Z, [x21, x28]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
- "add x25, x25, #0x10\n"
- ".inst 0xe0bfcf60 // st1w { za0v.s[x14] }, p3/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc082d010 // mova z16.s, p4/M, za0v.s[x14]\n"
- "add x15, x15, #0x8\n"
- ".inst 0xc082d091 // mova z17.s, p4/M, za1v.s[x14]\n"
+ ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
"udot z19.s, z16.b, z20.b\n"
- ".inst 0xe0a8cb64 // st1w { za1v.s[x14] }, p2/Z, [x27, x8, LSL #2]\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"udot z18.s, z17.b, z20.b\n"
- ".inst 0xe0b0c761 // st1w { za0v.s[x14, #1] }, p1/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc082d030 // mova z16.s, p4/M, za0v.s[x14, #1]\n"
- ".inst 0xe0abc365 // st1w { za1v.s[x14, #1] }, p0/Z, [x27, x11, LSL #2]\n"
- ".inst 0xc082d0b1 // mova z17.s, p4/M, za1v.s[x14, #1]\n"
- "add x14, x14, #0x2\n"
- "addvl x27, x27, #4\n"
- "cmp x14, x9\n"
+ ".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x9\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z19.s, z16.b, z20.b\n"
"udot z18.s, z17.b, z20.b\n"
+ "addvl x27, x27, #4\n"
+ "add x13, x13, #0x8\n"
"blt 5b\n"
"6:" // K loop: Main loop: First: Tail
- ".inst 0x25376143 // psel p3.b, p8.b/Z, p10.b[w15, #2]\n"
- ".inst 0x253f6142 // psel p2.b, p8.b/Z, p10.b[w15, #3]\n"
- ".inst 0x25776141 // psel p1.b, p8.b/Z, p10.b[w15, #6]\n"
- ".inst 0x257f6140 // psel p0.b, p8.b/Z, p10.b[w15, #7]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6140 // psel p0.b, p8.b/Z, p10.b[w13, #7]\n"
+ ".inst 0xe01c26c6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x22, x28]\n"
"mov x26, %x[in]\n"
- "add x25, %x[in], x8, LSL #3\n"
- ".inst 0xe01c6f02 // ld1b { za0h.b[x15, #2] }, p3/Z, [x24, x28]\n"
- ".inst 0x25266d23 // psel p3.b, p11.b/Z, p9.b[w14]\n"
+ "add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
- "mov x13, #0x0\n"
- ".inst 0xe01c6ae3 // ld1b { za0h.b[x15, #3] }, p2/Z, [x23, x28]\n"
- ".inst 0x25266d22 // psel p2.b, p11.b/Z, p9.b[w14]\n"
+ ".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "udot z18.s, z17.b, z20.b\n"
"ldr x23, [x25, #0x0]\n"
- "mov x12, #0x0\n"
- ".inst 0xe01c66c6 // ld1b { za0h.b[x15, #6] }, p1/Z, [x22, x28]\n"
- ".inst 0x252e6d21 // psel p1.b, p11.b/Z, p9.b[w14, #1]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- ".inst 0xe01c62a7 // ld1b { za0h.b[x15, #7] }, p0/Z, [x21, x28]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"ldr x21, [x25, #0x8]\n"
- ".inst 0x252e6d20 // psel p0.b, p11.b/Z, p9.b[w14, #1]\n"
- "whilelt p9.b, x17, %x[width]\n"
- ".inst 0xc082d010 // mova z16.s, p4/M, za0v.s[x14]\n"
- ".inst 0xe0bfcf60 // st1w { za0v.s[x14] }, p3/Z, [x27, XZR, LSL #2]\n"
- "incb x17\n"
- "add x25, x25, #0x10\n"
- ".inst 0xc082d091 // mova z17.s, p4/M, za1v.s[x14]\n"
+ ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ ".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
+ "incb x15\n"
+ "add x26, x26, #0x10\n"
"udot z19.s, z16.b, z20.b\n"
- ".inst 0xe0a8cb64 // st1w { za1v.s[x14] }, p2/Z, [x27, x8, LSL #2]\n"
- "incb x28\n"
+ ".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
+ "add x25, x25, #0x10\n"
"udot z18.s, z17.b, z20.b\n"
- ".inst 0xe0b0c761 // st1w { za0v.s[x14, #1] }, p1/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc082d030 // mova z16.s, p4/M, za0v.s[x14, #1]\n"
- "whilelt p8.b, x17, %x[width]\n"
- ".inst 0xc082d0b1 // mova z17.s, p4/M, za1v.s[x14, #1]\n"
- ".inst 0xe0abc365 // st1w { za1v.s[x14, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ "incb x28\n"
+ ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
"addvl x27, x27, #4\n"
- "udot z19.s, z16.b, z20.b\n"
- "udot z18.s, z17.b, z20.b\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
"cbz x9, 8f\n"
"7:" // K loop: Main loop: Second: Loop
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
- ".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n"
- ".inst 0xe01c2f00 // ld1b { za0h.b[x13] }, p3/Z, [x24, x28]\n"
- ".inst 0x25246d23 // psel p3.b, p11.b/Z, p9.b[w12]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
"ldr x24, [x26, #0x0]\n"
- ".inst 0xe01c2ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x28]\n"
- ".inst 0x25246d22 // psel p2.b, p11.b/Z, p9.b[w12]\n"
+ ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n"
"ldr x23, [x25, #0x0]\n"
- ".inst 0xe01c26c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x28]\n"
- ".inst 0x252c6d21 // psel p1.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0xe01c22c4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x22, x28]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
- "add x25, x25, #0x10\n"
- ".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
- "add x13, x13, #0x8\n"
".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
"udot z19.s, z16.b, z20.b\n"
- ".inst 0xe0a88b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x8, LSL #2]\n"
+ ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"udot z18.s, z17.b, z20.b\n"
- ".inst 0xe0b08769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x16, LSL #2]\n"
+ ".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
"add x12, x12, #0x2\n"
- "addvl x27, x27, #4\n"
"cmp x12, x9\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z19.s, z16.b, z20.b\n"
"udot z18.s, z17.b, z20.b\n"
+ "addvl x27, x27, #4\n"
+ "add x13, x13, #0x8\n"
"blt 7b\n"
"8:" // K loop: Main loop: Second: Tail
- ".inst 0x25256143 // psel p3.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n"
".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n"
+ ".inst 0xe01c26c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x28]\n"
"mov x26, %x[in]\n"
- "add x25, %x[in], x8, LSL #3\n"
- ".inst 0xe01c2f00 // ld1b { za0h.b[x13] }, p3/Z, [x24, x28]\n"
- ".inst 0x25246d23 // psel p3.b, p11.b/Z, p9.b[w12]\n"
+ "add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
- ".inst 0xe01c2ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x28]\n"
- ".inst 0x25246d22 // psel p2.b, p11.b/Z, p9.b[w12]\n"
+ ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
+ ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
+ ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ "udot z18.s, z17.b, z20.b\n"
"ldr x23, [x25, #0x0]\n"
- ".inst 0xe01c26c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x28]\n"
- ".inst 0x252c6d21 // psel p1.b, p11.b/Z, p9.b[w12, #1]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
- "add x26, x26, #0x10\n"
- ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"ldr x21, [x25, #0x8]\n"
- ".inst 0x252c6d20 // psel p0.b, p11.b/Z, p9.b[w12, #1]\n"
- "whilelt p9.b, x17, %x[width]\n"
- ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ ".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
"subs x20, x20, #0x1\n"
- "add x25, x25, #0x10\n"
- ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ "add x26, x26, #0x10\n"
"udot z19.s, z16.b, z20.b\n"
- ".inst 0xe0a88b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x8, LSL #2]\n"
- "incb x17\n"
+ ".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
+ "add x25, x25, #0x10\n"
"udot z18.s, z17.b, z20.b\n"
- ".inst 0xe0b08769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
- "incb x28\n"
- ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
+ "incb x15\n"
".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
"addvl x27, x27, #4\n"
- "udot z19.s, z16.b, z20.b\n"
- "udot z18.s, z17.b, z20.b\n"
+ "incb x28\n"
"bgt 4b\n"
"9:" // K loop: Tails
"cbnz x10, 12f\n"
"mov x26, %x[in]\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"mov x13, #0x0\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: First
- ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ "ldr x21, [x26, #0x0]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "ldr x20, [x26, x16, LSL #0x3]\n"
+ ".inst 0xe01c22a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ "cmp x12, x16\n"
"udot z19.s, z16.b, z20.b\n"
"udot z18.s, z17.b, z20.b\n"
- ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xe0a88b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x8, LSL #2]\n"
- "add x12, x12, #0x1\n"
- "addvl x27, x27, #2\n"
- "ldr x21, [x26, #0x0]\n"
- "cmp x12, x8\n"
- "ldr x20, [x26, x8, LSL #0x3]\n"
- "add x26, x26, #0x8\n"
- ".inst 0xe01c26a2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x21, x28]\n"
".inst 0xe01c2283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n"
+ "add x26, x26, #0x8\n"
+ "addvl x27, x27, #2\n"
"add x13, x13, #0x4\n"
"blt 10b\n"
- "whilelt p9.b, x17, %x[width]\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"mov x20, #0x0\n"
"mov x12, #0x0\n"
"11:" // K loop: Tails: Even: Second
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
"udot z19.s, z16.b, z20.b\n"
- "add x20, x20, #0x4\n"
"udot z18.s, z17.b, z20.b\n"
- ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xe0a8836c // st1w { za3v.s[x12] }, p0/Z, [x27, x8, LSL #2]\n"
- "add x12, x12, #0x1\n"
"addvl x27, x27, #2\n"
- "cmp x12, x7\n"
+ "add x20, x20, #0x4\n"
"blt 11b\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"b 14f\n"
"12:" // K loop: Tails: Odd
"mov x12, #0x0\n"
"13:" // K loop: Tails: Odd: Loop
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
"udot z19.s, z16.b, z20.b\n"
"udot z18.s, z17.b, z20.b\n"
- ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xe0a88364 // st1w { za1v.s[x12] }, p0/Z, [x27, x8, LSL #2]\n"
- "add x12, x12, #0x1\n"
"addvl x27, x27, #2\n"
- "cmp x12, x7\n"
"blt 13b\n"
"14:" // K loop: End
"st1w { z19.s }, p4, [x27]\n"
"st1w { z18.s }, p4, [x27, #1, MUL VL]\n"
"addvl x27, x27, #2\n"
- ".inst 0xd503467f // SMSTOP\n"
"mov %x[out], x27\n"
+ ".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
index 19d87039fe..9403efc7c6 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
@@ -32,66 +32,66 @@ void interleave_block<2, 1, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x28, #0x0\n"
- "mov x27, %x[row_offset]\n"
- "cnth x26\n"
- "cnth x25\n"
- "cmp %x[height], x26\n"
+ "cnth x28\n"
+ "cmp %x[height], x28\n"
+ "cnth x27\n"
+ "csel x28, %x[height], x28, LT\n"
+ "mov x26, #0x0\n"
"ptrue p13.s\n"
- "csel x26, %x[height], x26, LT\n"
+ "sub x28, x28, #0x1\n"
"whilelt p12.h, XZR, %x[height]\n"
- "sub x26, x26, #0x1\n"
- "whilelt p11.h, x25, %x[height]\n"
+ "whilelt p11.h, x27, %x[height]\n"
+ "mov x25, %x[row_offset]\n"
"mov x24, %x[out]\n"
- "whilelt p10.h, x28, %x[width]\n"
- "whilelt p9.h, x28, %x[width]\n"
- "whilelt p8.h, x28, %x[width]\n"
+ "whilelt p10.h, x26, %x[width]\n"
+ "whilelt p9.h, x26, %x[width]\n"
+ "whilelt p8.h, x26, %x[width]\n"
"1:" // Width loop
"add x23, %x[in], XZR, LSL #3\n"
- "add x20, %x[in], x25, LSL #3\n"
- "mov x13, #0x0\n"
+ "add x20, %x[in], x27, LSL #3\n"
"ldr x22, [x23], #0x8\n"
+ "mov x12, #0x0\n"
"ldr x21, [x20], #0x8\n"
- "cbz x26, 3f\n"
+ "cbz x28, 3f\n"
"2:" // Loads: Loop
- ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
- ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0xe05b26c0 // ld1h { za0h.h[x13] }, p1/Z, [x22, x27, LSL #1]\n"
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe05906c0 // ld1h { za0h.h[x12] }, p1/Z, [x22, x25, LSL #1]\n"
"ldr x22, [x23], #0x8\n"
- ".inst 0xe05b22a8 // ld1h { za1h.h[x13] }, p0/Z, [x21, x27, LSL #1]\n"
- "add x13, x13, #0x2\n"
+ ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28, LSL #1\n"
"ldr x21, [x20], #0x8\n"
- "cmp x13, x26, LSL #1\n"
"blt 2b\n"
"3:" // Loads: Tail
- ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
- ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- "sub x20, %x[width], x28\n"
+ "sub x20, %x[width], x26\n"
+ ".inst 0x25286580 // psel p0.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0xe05902c0 // ld1h { za0h.h[x12] }, p0/Z, [x22, x25, LSL #1]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ "cmp x20, x27\n"
+ ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n"
"mov x12, #0x0\n"
- "cmp x20, x25\n"
- ".inst 0xe05b26c0 // ld1h { za0h.h[x13] }, p1/Z, [x22, x27, LSL #1]\n"
- "csel x20, x20, x25, LT\n"
- ".inst 0xe05b22a8 // ld1h { za1h.h[x13] }, p0/Z, [x21, x27, LSL #1]\n"
+ "csel x20, x20, x27, LT\n"
"4:" // Stores: Loop
- ".inst 0x25287541 // psel p1.h, p13.h/Z, p10.h[w12]\n"
".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n"
- ".inst 0xe07f8700 // st1h { za0v.h[x12] }, p1/Z, [x24, XZR, LSL #1]\n"
- ".inst 0xe0798308 // st1h { za1v.h[x12] }, p0/Z, [x24, x25, LSL #1]\n"
+ ".inst 0xe07f8300 // st1h { za0v.h[x12] }, p0/Z, [x24, XZR, LSL #1]\n"
+ ".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n"
+ ".inst 0xe07b8308 // st1h { za1v.h[x12] }, p0/Z, [x24, x27, LSL #1]\n"
"add x12, x12, #0x1\n"
- "addvl x24, x24, #4\n"
"cmp x12, x20\n"
+ "addvl x24, x24, #4\n"
"blt 4b\n"
- "inch x28\n"
- "inch x27\n"
- "whilelt p10.h, x28, %x[width]\n"
- "whilelt p9.h, x28, %x[width]\n"
- "whilelt p8.h, x28, %x[width]\n"
+ "inch x26\n"
+ "whilelt p10.h, x26, %x[width]\n"
+ "whilelt p9.h, x26, %x[width]\n"
+ "whilelt p8.h, x26, %x[width]\n"
+ "inch x25\n"
"b.any 1b\n"
"mov %x[out], x24\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
index 68fabe3523..b310651525 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
@@ -32,251 +32,256 @@ void interleave_block<2, 1, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
+ "mov x22, %x[width]\n"
+ "incw x22\n"
+ "cntw x16\n"
+ "sub x22, x22, #0x1\n"
+ "udiv x22, x22, x16\n" // n_passes = ceildiv(width, VL<T>)
"mov x21, %x[width]\n"
- "mov x20, %x[width]\n"
- "incw x21\n"
- "cntw x17\n"
- "sub x21, x21, #0x1\n"
- "sub x16, x17, #0x1\n"
- "udiv x21, x21, x17\n" // n_passes = ceildiv(width, VL<T>)
- "ands x16, x20, x16\n"
- "sub x20, x21, #0x1\n"
- "sub x15, x17, #0x2\n"
- "mov x14, #0x0\n"
+ "sub x15, x16, #0x1\n"
+ "sub x20, x22, #0x1\n"
+ "ands x15, x21, x15\n"
+ "sub x14, x16, #0x2\n"
+ "mov x13, #0x0\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x17, LSL #3\n"
- "cntw x9, ALL, MUL #2\n"
- "ldr x28, [x11, #0x0]\n"
- "cntw x27, ALL, MUL #3\n"
+ "ldr x10, [x11, #0x0]\n"
+ "add x9, %x[in], x16, LSL #3\n"
+ "cntw x28, ALL, MUL #2\n"
+ "ldr x27, [x9, #0x0]\n"
+ "cntw x26, ALL, MUL #3\n"
"lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
- "ldr x26, [x10, #0x0]\n"
- "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
- "csel x16, x16, x17, NE\n"
- "ldr x24, [x11, #0x8]\n"
- "ptrue p12.s\n"
- "whilelt p11.s, XZR, %x[height]\n"
- "ldr x21, [x10, #0x8]\n"
- "whilelt p10.s, x17, %x[height]\n"
+ "ldr x25, [x11, #0x8]\n"
+ "and x24, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "csel x15, x15, x16, NE\n"
+ "ldr x21, [x9, #0x8]\n"
+ "ptrue p13.s\n"
+ "whilelt p12.s, XZR, %x[height]\n"
+ "whilelt p11.s, x16, %x[height]\n"
"mov x23, %x[row_offset]\n"
"mov x22, %x[out]\n"
- "whilelt p9.s, x14, %x[width]\n"
- "whilelt p8.s, x14, %x[width]\n"
+ "whilelt p10.s, x13, %x[width]\n"
+ "whilelt p9.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
"add x11, x11, #0x10\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
"mov x12, #0x0\n"
- "cbz x15, 2f\n"
+ "cbz x14, 2f\n"
"1:" // K loop: Charge: Loop
- ".inst 0x25306163 // psel p3.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0x25306142 // psel p2.s, p8.s/Z, p10.s[w12]\n"
- ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
- ".inst 0x25706140 // psel p0.s, p8.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0970f80 // ld1w { za0h.s[x12] }, p3/Z, [x28, x23, LSL #2]\n"
- "ldr x28, [x11, #0x0]\n"
- ".inst 0xe0970b44 // ld1w { za1h.s[x12] }, p2/Z, [x26, x23, LSL #2]\n"
- "ldr x26, [x10, #0x0]\n"
- ".inst 0xe0970701 // ld1w { za0h.s[x12, #1] }, p1/Z, [x24, x23, LSL #2]\n"
- "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
+ ".inst 0x25706581 // psel p1.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706160 // psel p0.s, p8.s/Z, p11.s[w12, #1]\n"
+ "ldr x27, [x9, #0x0]\n"
+ ".inst 0xe0970721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x23, LSL #2]\n"
+ "ldr x25, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
".inst 0xe09702a5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x21, x23, LSL #2]\n"
"add x12, x12, #0x2\n"
- "ldr x21, [x10, #0x8]\n"
- "add x10, x10, #0x10\n"
- "cmp x12, x15\n"
+ "cmp x12, x14\n"
+ "ldr x21, [x9, #0x8]\n"
+ "add x9, x9, #0x10\n"
"blt 1b\n"
"2:" // K loop: Charge: End
- ".inst 0x25306163 // psel p3.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0x25306142 // psel p2.s, p8.s/Z, p10.s[w12]\n"
- ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
- ".inst 0x25706140 // psel p0.s, p8.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
+ ".inst 0x25706581 // psel p1.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706160 // psel p0.s, p8.s/Z, p11.s[w12, #1]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x17, LSL #3\n"
- ".inst 0xe0970f80 // ld1w { za0h.s[x12] }, p3/Z, [x28, x23, LSL #2]\n"
- "ldr x28, [x11, #0x0]\n"
- "incw x14\n"
- ".inst 0xe0970b44 // ld1w { za1h.s[x12] }, p2/Z, [x26, x23, LSL #2]\n"
- "ldr x26, [x10, #0x0]\n"
- ".inst 0xe0970701 // ld1w { za0h.s[x12, #1] }, p1/Z, [x24, x23, LSL #2]\n"
- "ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
+ "add x9, %x[in], x16, LSL #3\n"
+ ".inst 0xe0970721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x23, LSL #2]\n"
+ "ldr x10, [x11, #0x0]\n"
".inst 0xe09702a5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x21, x23, LSL #2]\n"
- "ldr x21, [x10, #0x8]\n"
- "add x10, x10, #0x10\n"
+ "ldr x27, [x9, #0x0]\n"
"incw x23\n"
+ "incw x13\n"
+ "ldr x25, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ "ldr x21, [x9, #0x8]\n"
+ "add x9, x9, #0x10\n"
"cbz x20, 8f\n"
"mov x20, x20\n"
"3:" // K loop: Main loop
- "whilelt p8.s, x14, %x[width]\n"
- "mov x13, #0x0\n"
- "cbz x15, 5f\n"
+ "whilelt p9.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
+ "mov x12, #0x0\n"
+ "cbz x14, 5f\n"
"4:" // K loop: Main loop: First: Loop
- ".inst 0x25316160 // psel p0.s, p8.s/Z, p11.s[w13]\n"
- ".inst 0x25316142 // psel p2.s, p8.s/Z, p10.s[w13]\n"
- ".inst 0x25716161 // psel p1.s, p8.s/Z, p11.s[w13, #1]\n"
- ".inst 0x25716143 // psel p3.s, p8.s/Z, p10.s[w13, #1]\n"
- ".inst 0xe0972388 // ld1w { za2h.s[x13] }, p0/Z, [x28, x23, LSL #2]\n"
- ".inst 0x25317120 // psel p0.s, p12.s/Z, p9.s[w13]\n"
- "ldr x28, [x11, #0x0]\n"
- ".inst 0xe0972b4c // ld1w { za3h.s[x13] }, p2/Z, [x26, x23, LSL #2]\n"
- ".inst 0x25317122 // psel p2.s, p12.s/Z, p9.s[w13]\n"
- "ldr x26, [x10, #0x0]\n"
- ".inst 0xe0972709 // ld1w { za2h.s[x13, #1] }, p1/Z, [x24, x23, LSL #2]\n"
- ".inst 0x25717121 // psel p1.s, p12.s/Z, p9.s[w13, #1]\n"
- "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ ".inst 0xe0970548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0xe097036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
+ ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706162 // psel p2.s, p8.s/Z, p11.s[w12, #1]\n"
+ "ldr x27, [x9, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0970329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
+ "ldr x25, [x11, #0x8]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0970aad // ld1w { za3h.s[x12, #1] }, p2/Z, [x21, x23, LSL #2]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe0972ead // ld1w { za3h.s[x13, #1] }, p3/Z, [x21, x23, LSL #2]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0xe0bfa2c0 // st1w { za0v.s[x13] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x25717120 // psel p0.s, p12.s/Z, p9.s[w13, #1]\n"
- ".inst 0xe0b1aac4 // st1w { za1v.s[x13] }, p2/Z, [x22, x17, LSL #2]\n"
- "add x10, x10, #0x10\n"
- ".inst 0xe0a9a6c1 // st1w { za0v.s[x13, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bba2c5 // st1w { za1v.s[x13, #1] }, p0/Z, [x22, x27, LSL #2]\n"
- "add x13, x13, #0x2\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0xe0ba82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x14\n"
"addvl x22, x22, #4\n"
- "cmp x13, x15\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
- ".inst 0x25316160 // psel p0.s, p8.s/Z, p11.s[w13]\n"
- ".inst 0x25316142 // psel p2.s, p8.s/Z, p10.s[w13]\n"
- ".inst 0x25716161 // psel p1.s, p8.s/Z, p11.s[w13, #1]\n"
- ".inst 0x25716143 // psel p3.s, p8.s/Z, p10.s[w13, #1]\n"
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ ".inst 0xe0970548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ ".inst 0xe097036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x17, LSL #3\n"
- ".inst 0xe0972388 // ld1w { za2h.s[x13] }, p0/Z, [x28, x23, LSL #2]\n"
- ".inst 0x25317120 // psel p0.s, p12.s/Z, p9.s[w13]\n"
- "ldr x28, [x11, #0x0]\n"
- "mov x12, #0x0\n"
- ".inst 0xe0972b4c // ld1w { za3h.s[x13] }, p2/Z, [x26, x23, LSL #2]\n"
- ".inst 0x25317122 // psel p2.s, p12.s/Z, p9.s[w13]\n"
- "ldr x26, [x10, #0x0]\n"
- ".inst 0xe0972709 // ld1w { za2h.s[x13, #1] }, p1/Z, [x24, x23, LSL #2]\n"
- ".inst 0x25717121 // psel p1.s, p12.s/Z, p9.s[w13, #1]\n"
- "ldr x24, [x11, #0x8]\n"
+ "add x9, %x[in], x16, LSL #3\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
+ ".inst 0xe0970329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
+ "ldr x27, [x9, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe09706ad // ld1w { za3h.s[x12, #1] }, p1/Z, [x21, x23, LSL #2]\n"
+ "ldr x25, [x11, #0x8]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.s, x13, %x[width]\n"
+ "incw x13\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe0972ead // ld1w { za3h.s[x13, #1] }, p3/Z, [x21, x23, LSL #2]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0xe0bfa2c0 // st1w { za0v.s[x13] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x25717120 // psel p0.s, p12.s/Z, p9.s[w13, #1]\n"
- ".inst 0xe0b1aac4 // st1w { za1v.s[x13] }, p2/Z, [x22, x17, LSL #2]\n"
- "whilelt p9.s, x14, %x[width]\n"
- "incw x14\n"
- ".inst 0xe0a9a6c1 // st1w { za0v.s[x13, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- "add x10, x10, #0x10\n"
- "incw x23\n"
- ".inst 0xe0bba2c5 // st1w { za1v.s[x13, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0xe0ba82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
"addvl x22, x22, #4\n"
- "whilelt p8.s, x14, %x[width]\n"
- "cbz x15, 7f\n"
+ "incw x23\n"
+ "whilelt p9.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
+ "mov x12, #0x0\n"
+ "cbz x14, 7f\n"
"6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0x25306142 // psel p2.s, p8.s/Z, p10.s[w12]\n"
- ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
- ".inst 0x25706143 // psel p3.s, p8.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0970380 // ld1w { za0h.s[x12] }, p0/Z, [x28, x23, LSL #2]\n"
- ".inst 0x25307120 // psel p0.s, p12.s/Z, p9.s[w12]\n"
- "ldr x28, [x11, #0x0]\n"
- ".inst 0xe0970b44 // ld1w { za1h.s[x12] }, p2/Z, [x26, x23, LSL #2]\n"
- ".inst 0x25307122 // psel p2.s, p12.s/Z, p9.s[w12]\n"
- "ldr x26, [x10, #0x0]\n"
- ".inst 0xe0970701 // ld1w { za0h.s[x12, #1] }, p1/Z, [x24, x23, LSL #2]\n"
- ".inst 0x25707121 // psel p1.s, p12.s/Z, p9.s[w12, #1]\n"
- "ldr x24, [x11, #0x8]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
+ ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706162 // psel p2.s, p8.s/Z, p11.s[w12, #1]\n"
+ "ldr x27, [x9, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0970321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
+ "ldr x25, [x11, #0x8]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0970aa5 // ld1w { za1h.s[x12, #1] }, p2/Z, [x21, x23, LSL #2]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe0970ea5 // ld1w { za1h.s[x12, #1] }, p3/Z, [x21, x23, LSL #2]\n"
- "ldr x21, [x10, #0x8]\n"
- ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x25707120 // psel p0.s, p12.s/Z, p9.s[w12, #1]\n"
- ".inst 0xe0b18acc // st1w { za3v.s[x12] }, p2/Z, [x22, x17, LSL #2]\n"
- "add x10, x10, #0x10\n"
- ".inst 0xe0a986c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0xe0ba82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
"add x12, x12, #0x2\n"
+ "cmp x12, x14\n"
"addvl x22, x22, #4\n"
- "cmp x12, x15\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0x25306142 // psel p2.s, p8.s/Z, p10.s[w12]\n"
- ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
- ".inst 0x25706143 // psel p3.s, p8.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
"mov x11, %x[in]\n"
- "add x10, %x[in], x17, LSL #3\n"
- ".inst 0xe0970380 // ld1w { za0h.s[x12] }, p0/Z, [x28, x23, LSL #2]\n"
- ".inst 0x25307120 // psel p0.s, p12.s/Z, p9.s[w12]\n"
- "ldr x28, [x11, #0x0]\n"
- ".inst 0xe0970b44 // ld1w { za1h.s[x12] }, p2/Z, [x26, x23, LSL #2]\n"
- ".inst 0x25307122 // psel p2.s, p12.s/Z, p9.s[w12]\n"
- "ldr x26, [x10, #0x0]\n"
- ".inst 0xe0970701 // ld1w { za0h.s[x12, #1] }, p1/Z, [x24, x23, LSL #2]\n"
- ".inst 0x25707121 // psel p1.s, p12.s/Z, p9.s[w12, #1]\n"
- "ldr x24, [x11, #0x8]\n"
- "add x11, x11, #0x10\n"
- ".inst 0xe0970ea5 // ld1w { za1h.s[x12, #1] }, p3/Z, [x21, x23, LSL #2]\n"
- "ldr x21, [x10, #0x8]\n"
+ "add x9, %x[in], x16, LSL #3\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
+ ".inst 0xe0970321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
+ "ldr x27, [x9, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe09706a5 // ld1w { za1h.s[x12, #1] }, p1/Z, [x21, x23, LSL #2]\n"
+ "ldr x25, [x11, #0x8]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
+ "ldr x21, [x9, #0x8]\n"
".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
- ".inst 0x25707120 // psel p0.s, p12.s/Z, p9.s[w12, #1]\n"
- ".inst 0xe0b18acc // st1w { za3v.s[x12] }, p2/Z, [x22, x17, LSL #2]\n"
- "whilelt p9.s, x14, %x[width]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.s, x13, %x[width]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xe0a986c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x9, LSL #2]\n"
- "add x10, x10, #0x10\n"
- "incw x14\n"
- ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0xe0ba82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
"addvl x22, x22, #4\n"
+ "incw x13\n"
"incw x23\n"
"bgt 3b\n"
"8:" // K loop: Tails
- "cbnz x25, 11f\n"
+ "cbnz x24, 11f\n"
"mov x11, %x[in]\n"
- "whilelt p8.s, x14, %x[width]\n"
+ "whilelt p9.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
- ".inst 0x25307123 // psel p3.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25307122 // psel p2.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25306161 // psel p1.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf8ec0 // st1w { za0v.s[x12] }, p3/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0b18ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x17, LSL #2]\n"
- "addvl x22, x22, #2\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"ldr x21, [x11, #0x0]\n"
- "ldr x20, [x11, x17, LSL #0x3]\n"
- "add x11, x11, #0x8\n"
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
".inst 0xe09706a8 // ld1w { za2h.s[x12] }, p1/Z, [x21, x23, LSL #2]\n"
+ "add x11, x11, #0x8\n"
+ "addvl x22, x22, #2\n"
".inst 0xe097028c // ld1w { za3h.s[x12] }, p0/Z, [x20, x23, LSL #2]\n"
"add x12, x12, #0x1\n"
- "cmp x12, x17\n"
+ "cmp x12, x16\n"
"blt 9b\n"
- "whilelt p9.s, x14, %x[width]\n"
- "whilelt p8.s, x14, %x[width]\n"
+ "whilelt p10.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
- ".inst 0x25307121 // psel p1.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25307120 // psel p0.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0b182cc // st1w { za3v.s[x12] }, p0/Z, [x22, x17, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
+ "cmp x12, x15\n"
"addvl x22, x22, #2\n"
- "cmp x12, x16\n"
"blt 10b\n"
- "whilelt p8.s, x14, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
"12:" // K loop: Tails: Odd: Loop
- ".inst 0x25307121 // psel p1.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0x25307120 // psel p0.s, p12.s/Z, p9.s[w12]\n"
- ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
- ".inst 0xe0b182c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x17, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
+ "cmp x12, x15\n"
"addvl x22, x22, #2\n"
- "cmp x12, x16\n"
"blt 12b\n"
"13:" // K loop: End
"mov %x[out], x22\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
index f5c756eba6..6903945536 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
@@ -32,93 +32,93 @@ void interleave_block<4, 2, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x17, #0x0\n"
- "mov x16, %x[row_offset]\n"
+ "cntw x16\n"
"cntw x15\n"
- "cntw x14\n"
- "cntw x11, ALL, MUL #2\n"
- "cntw x10, ALL, MUL #3\n"
- "cmp %x[height], x15\n"
- "cnth x9\n"
- "csel x15, %x[height], x15, LT\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x13, ALL, MUL #3\n"
+ "cmp %x[height], x16\n"
+ "csel x16, %x[height], x16, LT\n"
"whilelt p11.h, XZR, %x[height]\n"
- "whilelt p10.h, x14, %x[height]\n"
- "whilelt p9.h, x11, %x[height]\n"
- "whilelt p8.h, x10, %x[height]\n"
+ "whilelt p10.h, x15, %x[height]\n"
+ "whilelt p9.h, x14, %x[height]\n"
+ "whilelt p8.h, x13, %x[height]\n"
+ "mov x11, #0x0\n"
+ "cnth x10\n"
"ptrue p13.s\n"
- "sub x15, x15, #0x1\n"
+ "sub x16, x16, #0x1\n"
"zip1 p12.h, p11.h, p9.h\n"
"zip1 p11.h, p10.h, p8.h\n"
+ "mov x9, %x[row_offset]\n"
"mov x28, %x[out]\n"
- "whilelt p10.h, x17, %x[width]\n"
- "whilelt p9.h, x17, %x[width]\n"
- "whilelt p8.h, x17, %x[width]\n"
+ "whilelt p10.h, x11, %x[width]\n"
+ "whilelt p9.h, x11, %x[width]\n"
+ "whilelt p8.h, x11, %x[width]\n"
"1:" // Width loop
"add x27, %x[in], XZR, LSL #3\n"
- "add x26, %x[in], x14, LSL #3\n"
- "add x25, %x[in], x11, LSL #3\n"
- "add x20, %x[in], x10, LSL #3\n"
- "ldr x24, [x27], #0x8\n"
- "mov x13, #0x0\n"
+ "add x26, %x[in], x15, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x14, LSL #3\n"
+ "add x20, %x[in], x13, LSL #3\n"
"ldr x23, [x26], #0x8\n"
- "ldr x22, [x25], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
"ldr x21, [x20], #0x8\n"
- "cbz x15, 3f\n"
+ "cbz x16, 3f\n"
"2:" // Loads: Loop
- ".inst 0x25296580 // psel p0.h, p9.h/Z, p12.h[w13]\n"
- ".inst 0x25296162 // psel p2.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
- ".inst 0xe0502300 // ld1h { za0h.h[x13] }, p0/Z, [x24, x16, LSL #1]\n"
- ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
- "ldr x24, [x27], #0x8\n"
- ".inst 0xe0502ae8 // ld1h { za1h.h[x13] }, p2/Z, [x23, x16, LSL #1]\n"
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0490720 // ld1h { za0h.h[x12] }, p1/Z, [x25, x9, LSL #1]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0xe04902e8 // ld1h { za1h.h[x12] }, p0/Z, [x23, x9, LSL #1]\n"
+ ".inst 0x25386581 // psel p1.h, p9.h/Z, p12.h[w12, #1]\n"
+ ".inst 0x25386160 // psel p0.h, p8.h/Z, p11.h[w12, #1]\n"
"ldr x23, [x26], #0x8\n"
- ".inst 0xe05026c1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x22, x16, LSL #1]\n"
- "ldr x22, [x25], #0x8\n"
- ".inst 0xe05022a9 // ld1h { za1h.h[x13, #1] }, p0/Z, [x21, x16, LSL #1]\n"
- "add x13, x13, #0x2\n"
+ ".inst 0xe04906c1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x22, x9, LSL #1]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe04902a9 // ld1h { za1h.h[x12, #1] }, p0/Z, [x21, x9, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x16, LSL #1\n"
"ldr x21, [x20], #0x8\n"
- "cmp x13, x15, LSL #1\n"
"blt 2b\n"
"3:" // Loads: Tail
- ".inst 0x25296580 // psel p0.h, p9.h/Z, p12.h[w13]\n"
- ".inst 0x25296162 // psel p2.h, p8.h/Z, p11.h[w13]\n"
- "sub x20, %x[width], x17\n"
- ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
- "cmp x20, x9\n"
- "mov x12, #0x0\n"
- ".inst 0xe0502300 // ld1h { za0h.h[x13] }, p0/Z, [x24, x16, LSL #1]\n"
- ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
- "csel x20, x20, x9, LT\n"
- ".inst 0xe0502ae8 // ld1h { za1h.h[x13] }, p2/Z, [x23, x16, LSL #1]\n"
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0490720 // ld1h { za0h.h[x12] }, p1/Z, [x25, x9, LSL #1]\n"
+ "sub x20, %x[width], x11\n"
+ ".inst 0xe04902e8 // ld1h { za1h.h[x12] }, p0/Z, [x23, x9, LSL #1]\n"
+ "cmp x20, x10\n"
+ "csel x20, x20, x10, LT\n"
+ ".inst 0x25386580 // psel p0.h, p9.h/Z, p12.h[w12, #1]\n"
+ ".inst 0xe04902c1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x22, x9, LSL #1]\n"
+ ".inst 0x25386160 // psel p0.h, p8.h/Z, p11.h[w12, #1]\n"
"add x20, x20, #0x1\n"
- ".inst 0xe05026c1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x22, x16, LSL #1]\n"
+ ".inst 0xe04902a9 // ld1h { za1h.h[x12, #1] }, p0/Z, [x21, x9, LSL #1]\n"
+ "mov x12, #0x0\n"
"lsr x20, x20, #0x1\n"
- ".inst 0xe05022a9 // ld1h { za1h.h[x13, #1] }, p0/Z, [x21, x16, LSL #1]\n"
"4:" // Stores: Loop
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0ae8b84 // st1w { za1v.s[x12] }, p2/Z, [x28, x14, LSL #2]\n"
- ".inst 0xe0ab8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x11, LSL #2]\n"
- ".inst 0xe0aa838c // st1w { za3v.s[x12] }, p0/Z, [x28, x10, LSL #2]\n"
+ ".inst 0xe0af8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x15, LSL #2]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0ae8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x14, LSL #2]\n"
+ ".inst 0xe0ad838c // st1w { za3v.s[x12] }, p0/Z, [x28, x13, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x28, x28, #4\n"
"cmp x12, x20\n"
+ "addvl x28, x28, #4\n"
"blt 4b\n"
- "inch x17\n"
- "inch x16\n"
- "whilelt p10.h, x17, %x[width]\n"
- "whilelt p9.h, x17, %x[width]\n"
- "whilelt p8.h, x17, %x[width]\n"
+ "inch x11\n"
+ "whilelt p10.h, x11, %x[width]\n"
+ "whilelt p9.h, x11, %x[width]\n"
+ "whilelt p8.h, x11, %x[width]\n"
+ "inch x9\n"
"b.any 1b\n"
"mov %x[out], x28\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_fp16_fp16.hpp
index 9e0ab463be..b1ba9a4fe7 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_fp16_fp16.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(__ARM_FEATURE_SVE)
template <>
void interleave_block<4, 2, VLType::SME, false>(
@@ -65,36 +65,36 @@ void interleave_block<4, 2, VLType::SME, false>(
"ldr x21, [x20], #0x8\n"
"cbz x15, 3f\n"
"2:" // Loads: Loop
- ".inst 0x25296580 // psel p0.h, p9.h/Z, p12.h[w13]\n"
- ".inst 0x25296162 // psel p2.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
- ".inst 0xe0502300 // ld1h { za0h.h[x13] }, p0/Z, [x24, x16, LSL #1]\n"
- ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ ".inst 0x25296582 // psel p2.h, p9.h/Z, p12.h[w13]\n"
+ ".inst 0x25296161 // psel p1.h, p8.h/Z, p11.h[w13]\n"
+ ".inst 0x25396580 // psel p0.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0xe0502b00 // ld1h { za0h.h[x13] }, p2/Z, [x24, x16, LSL #1]\n"
+ ".inst 0x25396162 // psel p2.h, p8.h/Z, p11.h[w13, #1]\n"
"ldr x24, [x27], #0x8\n"
- ".inst 0xe0502ae8 // ld1h { za1h.h[x13] }, p2/Z, [x23, x16, LSL #1]\n"
+ ".inst 0xe05026e8 // ld1h { za1h.h[x13] }, p1/Z, [x23, x16, LSL #1]\n"
"ldr x23, [x26], #0x8\n"
- ".inst 0xe05026c1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x22, x16, LSL #1]\n"
+ ".inst 0xe05022c1 // ld1h { za0h.h[x13, #1] }, p0/Z, [x22, x16, LSL #1]\n"
"ldr x22, [x25], #0x8\n"
- ".inst 0xe05022a9 // ld1h { za1h.h[x13, #1] }, p0/Z, [x21, x16, LSL #1]\n"
+ ".inst 0xe0502aa9 // ld1h { za1h.h[x13, #1] }, p2/Z, [x21, x16, LSL #1]\n"
"add x13, x13, #0x2\n"
"ldr x21, [x20], #0x8\n"
"cmp x13, x15, LSL #1\n"
"blt 2b\n"
"3:" // Loads: Tail
- ".inst 0x25296580 // psel p0.h, p9.h/Z, p12.h[w13]\n"
- ".inst 0x25296162 // psel p2.h, p8.h/Z, p11.h[w13]\n"
+ ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
+ ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
"sub x20, %x[width], x17\n"
- ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0x25396582 // psel p2.h, p9.h/Z, p12.h[w13, #1]\n"
"cmp x20, x9\n"
"mov x12, #0x0\n"
- ".inst 0xe0502300 // ld1h { za0h.h[x13] }, p0/Z, [x24, x16, LSL #1]\n"
- ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ ".inst 0xe0502700 // ld1h { za0h.h[x13] }, p1/Z, [x24, x16, LSL #1]\n"
+ ".inst 0xe05022e8 // ld1h { za1h.h[x13] }, p0/Z, [x23, x16, LSL #1]\n"
+ ".inst 0x25396161 // psel p1.h, p8.h/Z, p11.h[w13, #1]\n"
"csel x20, x20, x9, LT\n"
- ".inst 0xe0502ae8 // ld1h { za1h.h[x13] }, p2/Z, [x23, x16, LSL #1]\n"
"add x20, x20, #0x1\n"
- ".inst 0xe05026c1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x22, x16, LSL #1]\n"
+ ".inst 0xe0502ac1 // ld1h { za0h.h[x13, #1] }, p2/Z, [x22, x16, LSL #1]\n"
"lsr x20, x20, #0x1\n"
- ".inst 0xe05022a9 // ld1h { za1h.h[x13, #1] }, p0/Z, [x21, x16, LSL #1]\n"
+ ".inst 0xe05026a9 // ld1h { za1h.h[x13, #1] }, p1/Z, [x21, x16, LSL #1]\n"
"4:" // Stores: Loop
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
@@ -122,4 +122,4 @@ void interleave_block<4, 2, VLType::SME, false>(
);
}
-#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
index b0b3aa85c1..9a5521aa8a 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
@@ -32,92 +32,92 @@ void interleave_block<4, 4, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x17, #0x0\n"
- "mov x16, %x[row_offset]\n"
+ "cntw x16\n"
"cntw x15\n"
- "cntw x14\n"
- "cntw x11, ALL, MUL #2\n"
- "cntw x10, ALL, MUL #3\n"
- "cmp %x[height], x15\n"
- "cntb x9\n"
- "csel x15, %x[height], x15, LT\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x13, ALL, MUL #3\n"
+ "cmp %x[height], x16\n"
+ "csel x16, %x[height], x16, LT\n"
"whilelt p12.b, XZR, %x[height]\n"
- "whilelt p10.b, x14, %x[height]\n"
- "whilelt p9.b, x11, %x[height]\n"
- "whilelt p8.b, x10, %x[height]\n"
+ "whilelt p10.b, x15, %x[height]\n"
+ "whilelt p9.b, x14, %x[height]\n"
+ "whilelt p8.b, x13, %x[height]\n"
"zip1 p12.b, p12.b, p9.b\n"
"zip1 p10.b, p10.b, p8.b\n"
+ "mov x11, #0x0\n"
+ "cntb x10\n"
"ptrue p11.s\n"
- "sub x15, x15, #0x1\n"
+ "sub x16, x16, #0x1\n"
"zip1 p10.b, p12.b, p10.b\n"
+ "mov x9, %x[row_offset]\n"
"mov x28, %x[out]\n"
- "whilelt p9.b, x17, %x[width]\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "whilelt p9.b, x11, %x[width]\n"
+ "whilelt p8.b, x11, %x[width]\n"
"1:" // Width loop
"add x27, %x[in], XZR, LSL #3\n"
- "add x26, %x[in], x14, LSL #3\n"
- "add x25, %x[in], x11, LSL #3\n"
- "add x20, %x[in], x10, LSL #3\n"
- "ldr x24, [x27], #0x8\n"
- "mov x13, #0x0\n"
- "ldr x23, [x26], #0x8\n"
- "ldr x22, [x25], #0x8\n"
- "ldr x21, [x20], #0x8\n"
- "cbz x15, 3f\n"
+ "add x26, %x[in], x15, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x14, LSL #3\n"
+ "add x23, %x[in], x13, LSL #3\n"
+ "ldr x20, [x26], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
+ "ldr x21, [x23], #0x8\n"
+ "cbz x16, 3f\n"
"2:" // Loads: Loop
- ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xe0102300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x16]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- "ldr x24, [x27], #0x8\n"
- ".inst 0xe0102ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x16]\n"
- "ldr x23, [x26], #0x8\n"
- ".inst 0xe01026c2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x22, x16]\n"
- "ldr x22, [x25], #0x8\n"
- ".inst 0xe01022a3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x21, x16]\n"
- "add x13, x13, #0x4\n"
- "ldr x21, [x20], #0x8\n"
- "cmp x13, x15, LSL #2\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n"
+ ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0xe00906c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x9]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x16, LSL #2\n"
+ "ldr x21, [x23], #0x8\n"
"blt 2b\n"
"3:" // Loads: Tail
- ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
- "sub x20, %x[width], x17\n"
- "cmp x20, x9\n"
- "mov x12, #0x0\n"
- ".inst 0xe0102300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x16]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- "csel x20, x20, x9, LT\n"
- ".inst 0xe0102ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x16]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n"
+ ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n"
+ "sub x20, %x[width], x11\n"
+ ".inst 0xe00902c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x9]\n"
+ "cmp x20, x10\n"
+ "csel x20, x20, x10, LT\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
"add x20, x20, #0x3\n"
- ".inst 0xe01026c2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x22, x16]\n"
+ ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n"
+ "mov x12, #0x0\n"
"lsr x20, x20, #0x2\n"
- ".inst 0xe01022a3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x21, x16]\n"
"4:" // Stores: Loop
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xe0ae8b84 // st1w { za1v.s[x12] }, p2/Z, [x28, x14, LSL #2]\n"
- ".inst 0xe0ab8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x11, LSL #2]\n"
- ".inst 0xe0aa838c // st1w { za3v.s[x12] }, p0/Z, [x28, x10, LSL #2]\n"
+ ".inst 0xe0af8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x15, LSL #2]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0ae8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x14, LSL #2]\n"
+ ".inst 0xe0ad838c // st1w { za3v.s[x12] }, p0/Z, [x28, x13, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x28, x28, #4\n"
"cmp x12, x20\n"
+ "addvl x28, x28, #4\n"
"blt 4b\n"
- "incb x17\n"
- "incb x16\n"
- "whilelt p9.b, x17, %x[width]\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "incb x11\n"
+ "whilelt p9.b, x11, %x[width]\n"
+ "whilelt p8.b, x11, %x[width]\n"
+ "incb x9\n"
"b.any 1b\n"
"mov %x[out], x28\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
index a4696816e9..22ec1011cc 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
@@ -32,118 +32,118 @@ void interleave_block<4, 4, VLType::SME, true>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x17, %x[row_offset]\n"
- "mov x16, %x[out]\n"
+ "cntw x16\n"
"cntw x15\n"
- "cntw x14\n"
"mov z24.b, #0x1\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x13, ALL, MUL #3\n"
"mov z23.s, #0x0\n"
- "cntw x11, ALL, MUL #2\n"
- "cntw x10, ALL, MUL #3\n"
"mov z22.s, #0x0\n"
+ "cmp %x[height], x16\n"
+ "csel x16, %x[height], x16, LT\n"
"mov z21.s, #0x0\n"
- "cmp %x[height], x15\n"
- "ptrue p3.b\n"
"mov z20.s, #0x0\n"
- "csel x15, %x[height], x15, LT\n"
"whilelt p12.b, XZR, %x[height]\n"
- "whilelt p10.b, x14, %x[height]\n"
- "whilelt p9.b, x11, %x[height]\n"
- "whilelt p8.b, x10, %x[height]\n"
+ "whilelt p10.b, x15, %x[height]\n"
+ "whilelt p9.b, x14, %x[height]\n"
+ "whilelt p8.b, x13, %x[height]\n"
"zip1 p12.b, p12.b, p9.b\n"
"zip1 p10.b, p10.b, p8.b\n"
- "cntb x9\n"
+ "ptrue p2.b\n"
+ "cntb x11\n"
"ptrue p11.s\n"
- "sub x15, x15, #0x1\n"
+ "sub x16, x16, #0x1\n"
"zip1 p10.b, p12.b, p10.b\n"
+ "mov x10, %x[row_offset]\n"
+ "mov x9, %x[out]\n"
"cbnz %x[first], 1f\n"
- "addvl x16, x16, #-4\n"
- "ld1w { z23.s }, p3/Z, [x16]\n"
- "ld1w { z22.s }, p3/Z, [x16, #1, MUL VL]\n"
- "ld1w { z21.s }, p3/Z, [x16, #2, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "addvl x9, x9, #-4\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "ld1w { z22.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #3, MUL VL]\n"
"1:" // Initialise row sums: End
"mov x28, #0x0\n"
"whilelt p9.b, x28, %x[width]\n"
"whilelt p8.b, x28, %x[width]\n"
"2:" // Width loop
"add x27, %x[in], XZR, LSL #3\n"
- "add x26, %x[in], x14, LSL #3\n"
- "add x25, %x[in], x11, LSL #3\n"
- "add x20, %x[in], x10, LSL #3\n"
- "ldr x24, [x27], #0x8\n"
- "mov x13, #0x0\n"
- "ldr x23, [x26], #0x8\n"
- "ldr x22, [x25], #0x8\n"
- "ldr x21, [x20], #0x8\n"
- "cbz x15, 4f\n"
+ "add x26, %x[in], x15, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x14, LSL #3\n"
+ "add x23, %x[in], x13, LSL #3\n"
+ "ldr x20, [x26], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
+ "ldr x21, [x23], #0x8\n"
+ "cbz x16, 4f\n"
"3:" // Loads: Loop
- ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xe0112300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x17]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- "ldr x24, [x27], #0x8\n"
- ".inst 0xe0112ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x17]\n"
- "ldr x23, [x26], #0x8\n"
- ".inst 0xe01126c2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x22, x17]\n"
- "ldr x22, [x25], #0x8\n"
- ".inst 0xe01122a3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x21, x17]\n"
- "add x13, x13, #0x4\n"
- "ldr x21, [x20], #0x8\n"
- "cmp x13, x15, LSL #2\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n"
+ ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0xe00a06c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x10]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x16, LSL #2\n"
+ "ldr x21, [x23], #0x8\n"
"blt 3b\n"
"4:" // Loads: Tail
- ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n"
+ ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n"
"sub x20, %x[width], x28\n"
- "cmp x20, x9\n"
- "mov x12, #0x0\n"
- ".inst 0xe0112300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x17]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- "csel x20, x20, x9, LT\n"
- ".inst 0xe0112ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x17]\n"
+ ".inst 0xe00a02c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x10]\n"
+ "cmp x20, x11\n"
+ "csel x20, x20, x11, LT\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
"add x20, x20, #0x3\n"
- ".inst 0xe01126c2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x22, x17]\n"
+ ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n"
+ "mov x12, #0x0\n"
"lsr x20, x20, #0x2\n"
- ".inst 0xe01122a3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x21, x17]\n"
"5:" // Stores: Loop
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0828c11 // mova z17.s, p3/M, za0v.s[x12]\n"
+ ".inst 0xe0bf8120 // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n"
+ ".inst 0xe0af8124 // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n"
".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0828c90 // mova z16.s, p3/M, za1v.s[x12]\n"
- "sdot z23.s, z17.b, z24.b\n"
- ".inst 0xc0828d13 // mova z19.s, p3/M, za2v.s[x12]\n"
- "sdot z22.s, z16.b, z24.b\n"
- ".inst 0xe0bf8200 // st1w { za0v.s[x12] }, p0/Z, [x16, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0828d92 // mova z18.s, p3/M, za3v.s[x12]\n"
- "sdot z21.s, z19.b, z24.b\n"
- ".inst 0xe0ae8a04 // st1w { za1v.s[x12] }, p2/Z, [x16, x14, LSL #2]\n"
- "sdot z20.s, z18.b, z24.b\n"
- ".inst 0xe0ab8608 // st1w { za2v.s[x12] }, p1/Z, [x16, x11, LSL #2]\n"
- ".inst 0xe0aa820c // st1w { za3v.s[x12] }, p0/Z, [x16, x10, LSL #2]\n"
+ ".inst 0xc0828893 // mova z19.s, p2/M, za1v.s[x12]\n"
+ ".inst 0xe0ae8528 // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ "sdot z23.s, z17.b, z24.b\n"
+ ".inst 0xe0ad812c // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n"
+ ".inst 0xc0828992 // mova z18.s, p2/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
- "addvl x16, x16, #4\n"
"cmp x12, x20\n"
+ "sdot z22.s, z19.b, z24.b\n"
+ "sdot z21.s, z16.b, z24.b\n"
+ "addvl x9, x9, #4\n"
+ "sdot z20.s, z18.b, z24.b\n"
"blt 5b\n"
"incb x28\n"
- "incb x17\n"
"whilelt p9.b, x28, %x[width]\n"
"whilelt p8.b, x28, %x[width]\n"
+ "incb x10\n"
"b.any 2b\n"
- "st1w { z23.s }, p3, [x16]\n"
- "st1w { z22.s }, p3, [x16, #1, MUL VL]\n"
- "st1w { z21.s }, p3, [x16, #2, MUL VL]\n"
- "st1w { z20.s }, p3, [x16, #3, MUL VL]\n"
- "addvl x16, x16, #4\n"
+ "st1w { z23.s }, p2, [x9]\n"
+ "st1w { z22.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z20.s }, p2, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "mov %x[out], x9\n"
".inst 0xd503467f // SMSTOP\n"
- "mov %x[out], x16\n"
: [out] "+&r" (out)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
index df77398acc..a99c2ea91b 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
@@ -32,92 +32,92 @@ void interleave_block<4, 4, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x17, #0x0\n"
- "mov x16, %x[row_offset]\n"
+ "cntw x16\n"
"cntw x15\n"
- "cntw x14\n"
- "cntw x11, ALL, MUL #2\n"
- "cntw x10, ALL, MUL #3\n"
- "cmp %x[height], x15\n"
- "cntb x9\n"
- "csel x15, %x[height], x15, LT\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x13, ALL, MUL #3\n"
+ "cmp %x[height], x16\n"
+ "csel x16, %x[height], x16, LT\n"
"whilelt p12.b, XZR, %x[height]\n"
- "whilelt p10.b, x14, %x[height]\n"
- "whilelt p9.b, x11, %x[height]\n"
- "whilelt p8.b, x10, %x[height]\n"
+ "whilelt p10.b, x15, %x[height]\n"
+ "whilelt p9.b, x14, %x[height]\n"
+ "whilelt p8.b, x13, %x[height]\n"
"zip1 p12.b, p12.b, p9.b\n"
"zip1 p10.b, p10.b, p8.b\n"
+ "mov x11, #0x0\n"
+ "cntb x10\n"
"ptrue p11.s\n"
- "sub x15, x15, #0x1\n"
+ "sub x16, x16, #0x1\n"
"zip1 p10.b, p12.b, p10.b\n"
+ "mov x9, %x[row_offset]\n"
"mov x28, %x[out]\n"
- "whilelt p9.b, x17, %x[width]\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "whilelt p9.b, x11, %x[width]\n"
+ "whilelt p8.b, x11, %x[width]\n"
"1:" // Width loop
"add x27, %x[in], XZR, LSL #3\n"
- "add x26, %x[in], x14, LSL #3\n"
- "add x25, %x[in], x11, LSL #3\n"
- "add x20, %x[in], x10, LSL #3\n"
- "ldr x24, [x27], #0x8\n"
- "mov x13, #0x0\n"
- "ldr x23, [x26], #0x8\n"
- "ldr x22, [x25], #0x8\n"
- "ldr x21, [x20], #0x8\n"
- "cbz x15, 3f\n"
+ "add x26, %x[in], x15, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x14, LSL #3\n"
+ "add x23, %x[in], x13, LSL #3\n"
+ "ldr x20, [x26], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
+ "ldr x21, [x23], #0x8\n"
+ "cbz x16, 3f\n"
"2:" // Loads: Loop
- ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xe0102300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x16]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- "ldr x24, [x27], #0x8\n"
- ".inst 0xe0102ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x16]\n"
- "ldr x23, [x26], #0x8\n"
- ".inst 0xe01026c2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x22, x16]\n"
- "ldr x22, [x25], #0x8\n"
- ".inst 0xe01022a3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x21, x16]\n"
- "add x13, x13, #0x4\n"
- "ldr x21, [x20], #0x8\n"
- "cmp x13, x15, LSL #2\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n"
+ ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0xe00906c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x9]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x16, LSL #2\n"
+ "ldr x21, [x23], #0x8\n"
"blt 2b\n"
"3:" // Loads: Tail
- ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
- "sub x20, %x[width], x17\n"
- "cmp x20, x9\n"
- "mov x12, #0x0\n"
- ".inst 0xe0102300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x16]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- "csel x20, x20, x9, LT\n"
- ".inst 0xe0102ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x16]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n"
+ ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n"
+ "sub x20, %x[width], x11\n"
+ ".inst 0xe00902c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x9]\n"
+ "cmp x20, x10\n"
+ "csel x20, x20, x10, LT\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
"add x20, x20, #0x3\n"
- ".inst 0xe01026c2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x22, x16]\n"
+ ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n"
+ "mov x12, #0x0\n"
"lsr x20, x20, #0x2\n"
- ".inst 0xe01022a3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x21, x16]\n"
"4:" // Stores: Loop
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xe0ae8b84 // st1w { za1v.s[x12] }, p2/Z, [x28, x14, LSL #2]\n"
- ".inst 0xe0ab8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x11, LSL #2]\n"
- ".inst 0xe0aa838c // st1w { za3v.s[x12] }, p0/Z, [x28, x10, LSL #2]\n"
+ ".inst 0xe0af8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x15, LSL #2]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0ae8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x14, LSL #2]\n"
+ ".inst 0xe0ad838c // st1w { za3v.s[x12] }, p0/Z, [x28, x13, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x28, x28, #4\n"
"cmp x12, x20\n"
+ "addvl x28, x28, #4\n"
"blt 4b\n"
- "incb x17\n"
- "incb x16\n"
- "whilelt p9.b, x17, %x[width]\n"
- "whilelt p8.b, x17, %x[width]\n"
+ "incb x11\n"
+ "whilelt p9.b, x11, %x[width]\n"
+ "whilelt p8.b, x11, %x[width]\n"
+ "incb x9\n"
"b.any 1b\n"
"mov %x[out], x28\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
index 14ab3f476b..0a826d38f2 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
@@ -32,118 +32,118 @@ void interleave_block<4, 4, VLType::SME, true>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x17, %x[row_offset]\n"
- "mov x16, %x[out]\n"
+ "cntw x16\n"
"cntw x15\n"
- "cntw x14\n"
"mov z24.b, #0x1\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x13, ALL, MUL #3\n"
"mov z23.s, #0x0\n"
- "cntw x11, ALL, MUL #2\n"
- "cntw x10, ALL, MUL #3\n"
"mov z22.s, #0x0\n"
+ "cmp %x[height], x16\n"
+ "csel x16, %x[height], x16, LT\n"
"mov z21.s, #0x0\n"
- "cmp %x[height], x15\n"
- "ptrue p3.b\n"
"mov z20.s, #0x0\n"
- "csel x15, %x[height], x15, LT\n"
"whilelt p12.b, XZR, %x[height]\n"
- "whilelt p10.b, x14, %x[height]\n"
- "whilelt p9.b, x11, %x[height]\n"
- "whilelt p8.b, x10, %x[height]\n"
+ "whilelt p10.b, x15, %x[height]\n"
+ "whilelt p9.b, x14, %x[height]\n"
+ "whilelt p8.b, x13, %x[height]\n"
"zip1 p12.b, p12.b, p9.b\n"
"zip1 p10.b, p10.b, p8.b\n"
- "cntb x9\n"
+ "ptrue p2.b\n"
+ "cntb x11\n"
"ptrue p11.s\n"
- "sub x15, x15, #0x1\n"
+ "sub x16, x16, #0x1\n"
"zip1 p10.b, p12.b, p10.b\n"
+ "mov x10, %x[row_offset]\n"
+ "mov x9, %x[out]\n"
"cbnz %x[first], 1f\n"
- "addvl x16, x16, #-4\n"
- "ld1w { z23.s }, p3/Z, [x16]\n"
- "ld1w { z22.s }, p3/Z, [x16, #1, MUL VL]\n"
- "ld1w { z21.s }, p3/Z, [x16, #2, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "addvl x9, x9, #-4\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "ld1w { z22.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #3, MUL VL]\n"
"1:" // Initialise row sums: End
"mov x28, #0x0\n"
"whilelt p9.b, x28, %x[width]\n"
"whilelt p8.b, x28, %x[width]\n"
"2:" // Width loop
"add x27, %x[in], XZR, LSL #3\n"
- "add x26, %x[in], x14, LSL #3\n"
- "add x25, %x[in], x11, LSL #3\n"
- "add x20, %x[in], x10, LSL #3\n"
- "ldr x24, [x27], #0x8\n"
- "mov x13, #0x0\n"
- "ldr x23, [x26], #0x8\n"
- "ldr x22, [x25], #0x8\n"
- "ldr x21, [x20], #0x8\n"
- "cbz x15, 4f\n"
+ "add x26, %x[in], x15, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x14, LSL #3\n"
+ "add x23, %x[in], x13, LSL #3\n"
+ "ldr x20, [x26], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
+ "ldr x21, [x23], #0x8\n"
+ "cbz x16, 4f\n"
"3:" // Loads: Loop
- ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xe0112300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x17]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- "ldr x24, [x27], #0x8\n"
- ".inst 0xe0112ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x17]\n"
- "ldr x23, [x26], #0x8\n"
- ".inst 0xe01126c2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x22, x17]\n"
- "ldr x22, [x25], #0x8\n"
- ".inst 0xe01122a3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x21, x17]\n"
- "add x13, x13, #0x4\n"
- "ldr x21, [x20], #0x8\n"
- "cmp x13, x15, LSL #2\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n"
+ ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0xe00a06c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x10]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x16, LSL #2\n"
+ "ldr x21, [x23], #0x8\n"
"blt 3b\n"
"4:" // Loads: Tail
- ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
- ".inst 0x252d6142 // psel p2.b, p8.b/Z, p10.b[w13, #1]\n"
- ".inst 0x25356141 // psel p1.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n"
+ ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n"
"sub x20, %x[width], x28\n"
- "cmp x20, x9\n"
- "mov x12, #0x0\n"
- ".inst 0xe0112300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x17]\n"
- ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- "csel x20, x20, x9, LT\n"
- ".inst 0xe0112ae1 // ld1b { za0h.b[x13, #1] }, p2/Z, [x23, x17]\n"
+ ".inst 0xe00a02c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x10]\n"
+ "cmp x20, x11\n"
+ "csel x20, x20, x11, LT\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
"add x20, x20, #0x3\n"
- ".inst 0xe01126c2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x22, x17]\n"
+ ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n"
+ "mov x12, #0x0\n"
"lsr x20, x20, #0x2\n"
- ".inst 0xe01122a3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x21, x17]\n"
"5:" // Stores: Loop
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0828c13 // mova z19.s, p3/M, za0v.s[x12]\n"
+ ".inst 0xe0bf8120 // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ ".inst 0xe0af8124 // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n"
".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0828c92 // mova z18.s, p3/M, za1v.s[x12]\n"
- "udot z23.s, z19.b, z24.b\n"
- ".inst 0xc0828d11 // mova z17.s, p3/M, za2v.s[x12]\n"
- "udot z22.s, z18.b, z24.b\n"
- ".inst 0xe0bf8200 // st1w { za0v.s[x12] }, p0/Z, [x16, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0828d90 // mova z16.s, p3/M, za3v.s[x12]\n"
- "udot z21.s, z17.b, z24.b\n"
- ".inst 0xe0ae8a04 // st1w { za1v.s[x12] }, p2/Z, [x16, x14, LSL #2]\n"
- "udot z20.s, z16.b, z24.b\n"
- ".inst 0xe0ab8608 // st1w { za2v.s[x12] }, p1/Z, [x16, x11, LSL #2]\n"
- ".inst 0xe0aa820c // st1w { za3v.s[x12] }, p0/Z, [x16, x10, LSL #2]\n"
+ ".inst 0xc0828891 // mova z17.s, p2/M, za1v.s[x12]\n"
+ ".inst 0xe0ae8528 // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n"
+ ".inst 0xc0828913 // mova z19.s, p2/M, za2v.s[x12]\n"
+ "udot z23.s, z16.b, z24.b\n"
+ ".inst 0xe0ad812c // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n"
+ ".inst 0xc0828992 // mova z18.s, p2/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
- "addvl x16, x16, #4\n"
"cmp x12, x20\n"
+ "udot z22.s, z17.b, z24.b\n"
+ "udot z21.s, z19.b, z24.b\n"
+ "addvl x9, x9, #4\n"
+ "udot z20.s, z18.b, z24.b\n"
"blt 5b\n"
"incb x28\n"
- "incb x17\n"
"whilelt p9.b, x28, %x[width]\n"
"whilelt p8.b, x28, %x[width]\n"
+ "incb x10\n"
"b.any 2b\n"
- "st1w { z23.s }, p3, [x16]\n"
- "st1w { z22.s }, p3, [x16, #1, MUL VL]\n"
- "st1w { z21.s }, p3, [x16, #2, MUL VL]\n"
- "st1w { z20.s }, p3, [x16, #3, MUL VL]\n"
- "addvl x16, x16, #4\n"
+ "st1w { z23.s }, p2, [x9]\n"
+ "st1w { z22.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z20.s }, p2, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "mov %x[out], x9\n"
".inst 0xd503467f // SMSTOP\n"
- "mov %x[out], x16\n"
: [out] "+&r" (out)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
index 09c961f1cd..f6326100b7 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
@@ -32,92 +32,92 @@ void interleave_block<4, 1, VLType::SME, false>(
{
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x16, #0x0\n"
- "mov x15, %x[row_offset]\n"
+ "cntw x15\n"
+ "cmp %x[height], x15\n"
"cntw x14\n"
- "cntw x11\n"
- "cmp %x[height], x14\n"
- "cntw x10, ALL, MUL #2\n"
- "cntw x9, ALL, MUL #3\n"
- "csel x14, %x[height], x14, LT\n"
+ "cntw x13, ALL, MUL #2\n"
+ "cntw x11, ALL, MUL #3\n"
+ "csel x15, %x[height], x15, LT\n"
+ "mov x10, #0x0\n"
"ptrue p4.s\n"
- "sub x14, x14, #0x1\n"
+ "sub x15, x15, #0x1\n"
"whilelt p3.s, XZR, %x[height]\n"
- "whilelt p15.s, x11, %x[height]\n"
- "whilelt p14.s, x10, %x[height]\n"
- "whilelt p13.s, x9, %x[height]\n"
+ "whilelt p15.s, x14, %x[height]\n"
+ "whilelt p14.s, x13, %x[height]\n"
+ "whilelt p13.s, x11, %x[height]\n"
+ "mov x9, %x[row_offset]\n"
"mov x28, %x[out]\n"
- "whilelt p12.s, x16, %x[width]\n"
- "whilelt p11.s, x16, %x[width]\n"
- "whilelt p10.s, x16, %x[width]\n"
- "whilelt p9.s, x16, %x[width]\n"
- "whilelt p8.s, x16, %x[width]\n"
+ "whilelt p12.s, x10, %x[width]\n"
+ "whilelt p11.s, x10, %x[width]\n"
+ "whilelt p10.s, x10, %x[width]\n"
+ "whilelt p9.s, x10, %x[width]\n"
+ "whilelt p8.s, x10, %x[width]\n"
"1:" // Width loop
"add x27, %x[in], XZR, LSL #3\n"
- "add x26, %x[in], x11, LSL #3\n"
- "add x25, %x[in], x10, LSL #3\n"
- "add x20, %x[in], x9, LSL #3\n"
- "ldr x24, [x27], #0x8\n"
- "mov x13, #0x0\n"
+ "add x26, %x[in], x14, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x13, LSL #3\n"
+ "add x20, %x[in], x11, LSL #3\n"
"ldr x23, [x26], #0x8\n"
- "ldr x22, [x25], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
"ldr x21, [x20], #0x8\n"
- "cbz x14, 3f\n"
+ "cbz x15, 3f\n"
"2:" // Loads: Loop
- ".inst 0x25316c60 // psel p0.s, p11.s/Z, p3.s[w13]\n"
- ".inst 0x253169e2 // psel p2.s, p10.s/Z, p15.s[w13]\n"
- ".inst 0x253165c1 // psel p1.s, p9.s/Z, p14.s[w13]\n"
- ".inst 0xe08f2300 // ld1w { za0h.s[x13] }, p0/Z, [x24, x15, LSL #2]\n"
- ".inst 0x253161a0 // psel p0.s, p8.s/Z, p13.s[w13]\n"
- "ldr x24, [x27], #0x8\n"
- ".inst 0xe08f2ae4 // ld1w { za1h.s[x13] }, p2/Z, [x23, x15, LSL #2]\n"
+ ".inst 0x25306c60 // psel p0.s, p11.s/Z, p3.s[w12]\n"
+ ".inst 0x253069e2 // psel p2.s, p10.s/Z, p15.s[w12]\n"
+ ".inst 0xe0890320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x9, LSL #2]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0x253065c1 // psel p1.s, p9.s/Z, p14.s[w12]\n"
+ ".inst 0x253061a0 // psel p0.s, p8.s/Z, p13.s[w12]\n"
+ ".inst 0xe0890ae4 // ld1w { za1h.s[x12] }, p2/Z, [x23, x9, LSL #2]\n"
"ldr x23, [x26], #0x8\n"
- ".inst 0xe08f26c8 // ld1w { za2h.s[x13] }, p1/Z, [x22, x15, LSL #2]\n"
- "ldr x22, [x25], #0x8\n"
- ".inst 0xe08f22ac // ld1w { za3h.s[x13] }, p0/Z, [x21, x15, LSL #2]\n"
- "add x13, x13, #0x1\n"
+ ".inst 0xe08906c8 // ld1w { za2h.s[x12] }, p1/Z, [x22, x9, LSL #2]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe08902ac // ld1w { za3h.s[x12] }, p0/Z, [x21, x9, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x15\n"
"ldr x21, [x20], #0x8\n"
- "cmp x13, x14\n"
"blt 2b\n"
"3:" // Loads: Tail
- ".inst 0x25316c60 // psel p0.s, p11.s/Z, p3.s[w13]\n"
- ".inst 0x253169e2 // psel p2.s, p10.s/Z, p15.s[w13]\n"
- ".inst 0x253165c1 // psel p1.s, p9.s/Z, p14.s[w13]\n"
- "sub x20, %x[width], x16\n"
- "cmp x20, x11\n"
+ "sub x20, %x[width], x10\n"
+ ".inst 0x25306c60 // psel p0.s, p11.s/Z, p3.s[w12]\n"
+ ".inst 0xe0890320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x9, LSL #2]\n"
+ ".inst 0x253069e0 // psel p0.s, p10.s/Z, p15.s[w12]\n"
+ ".inst 0x253065c1 // psel p1.s, p9.s/Z, p14.s[w12]\n"
+ ".inst 0xe08902e4 // ld1w { za1h.s[x12] }, p0/Z, [x23, x9, LSL #2]\n"
+ ".inst 0x253061a0 // psel p0.s, p8.s/Z, p13.s[w12]\n"
+ "cmp x20, x14\n"
+ ".inst 0xe08906c8 // ld1w { za2h.s[x12] }, p1/Z, [x22, x9, LSL #2]\n"
+ ".inst 0xe08902ac // ld1w { za3h.s[x12] }, p0/Z, [x21, x9, LSL #2]\n"
"mov x12, #0x0\n"
- ".inst 0xe08f2300 // ld1w { za0h.s[x13] }, p0/Z, [x24, x15, LSL #2]\n"
- ".inst 0x253161a0 // psel p0.s, p8.s/Z, p13.s[w13]\n"
- "csel x20, x20, x11, LT\n"
- ".inst 0xe08f2ae4 // ld1w { za1h.s[x13] }, p2/Z, [x23, x15, LSL #2]\n"
- ".inst 0xe08f26c8 // ld1w { za2h.s[x13] }, p1/Z, [x22, x15, LSL #2]\n"
- ".inst 0xe08f22ac // ld1w { za3h.s[x13] }, p0/Z, [x21, x15, LSL #2]\n"
+ "csel x20, x20, x14, LT\n"
"4:" // Stores: Loop
".inst 0x25305180 // psel p0.s, p4.s/Z, p12.s[w12]\n"
- ".inst 0x25305182 // psel p2.s, p4.s/Z, p12.s[w12]\n"
- ".inst 0x25305181 // psel p1.s, p4.s/Z, p12.s[w12]\n"
".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n"
".inst 0x25305180 // psel p0.s, p4.s/Z, p12.s[w12]\n"
- ".inst 0xe0ab8b84 // st1w { za1v.s[x12] }, p2/Z, [x28, x11, LSL #2]\n"
- ".inst 0xe0aa8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x10, LSL #2]\n"
- ".inst 0xe0a9838c // st1w { za3v.s[x12] }, p0/Z, [x28, x9, LSL #2]\n"
+ ".inst 0xe0ae8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x14, LSL #2]\n"
+ ".inst 0x25305181 // psel p1.s, p4.s/Z, p12.s[w12]\n"
+ ".inst 0x25305180 // psel p0.s, p4.s/Z, p12.s[w12]\n"
+ ".inst 0xe0ad8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x13, LSL #2]\n"
+ ".inst 0xe0ab838c // st1w { za3v.s[x12] }, p0/Z, [x28, x11, LSL #2]\n"
"add x12, x12, #0x1\n"
- "addvl x28, x28, #4\n"
"cmp x12, x20\n"
+ "addvl x28, x28, #4\n"
"blt 4b\n"
- "incw x16\n"
- "incw x15\n"
- "whilelt p12.s, x16, %x[width]\n"
- "whilelt p11.s, x16, %x[width]\n"
- "whilelt p10.s, x16, %x[width]\n"
- "whilelt p9.s, x16, %x[width]\n"
- "whilelt p8.s, x16, %x[width]\n"
+ "incw x10\n"
+ "whilelt p12.s, x10, %x[width]\n"
+ "whilelt p11.s, x10, %x[width]\n"
+ "whilelt p10.s, x10, %x[width]\n"
+ "whilelt p9.s, x10, %x[width]\n"
+ "whilelt p8.s, x10, %x[width]\n"
+ "incw x9\n"
"b.any 1b\n"
"mov %x[out], x28\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
index da27f31428..7c09608e3e 100644
--- a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
@@ -47,7 +47,245 @@
namespace arm_gemm {
-#include "interleave_indirect_impl.hpp"
+/*
+ * Core function that does heavy lifting - interleave 'int_by' rows of width 'width' together.
+ * Output goes through 'out', which is advanced past everything written; 'row_offset' is added to every column read.
+ * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining
+ * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad
+ * with a particular value; this version writes zeros).
+ * With 'integrate_sums', int_by int32_t row sums follow the data; if 'first' is false the previous sums are reloaded.
+ * Note that it is not expected for this templated version to ever be used - all cases that matter should be
+ * explicitly specialized with an optimized implementation.
+ */
+template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
+void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
+ const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1); // rows per output block
+
+ std::vector<int32_t> the_sums; // stays empty unless integrate_sums is set
+
+ if (integrate_sums) {
+ the_sums = std::vector<int32_t>(int_by, 0);
+
+ if (!first) {
+ // In 'integrate sums' mode, we dump the sums at the end on each pass.
+
+ // On the last pass this is correct, but on other passes it is not -
+ // so on the subsequent pass we need to take the output written by
+ // the previous pass as starting point for the sums, and then
+ // overwrite them with new interleaved data.
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ // Rewind pointer to where we wrote out the sums last time.
+ out_int32 -= int_by;
+
+ // Restore the running sums.
+ memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));
+
+ // Update the "real" pointer so that the next output will clobber the old sums.
+ out = reinterpret_cast<TOut *>(out_int32);
+ }
+ }
+
+ for (unsigned int pos=0; pos<width; pos+=block) { // one group of 'block' columns per iteration
+ for (unsigned int row=0; row<int_by; row++) {
+ // Row out of range - pad 'block' entries.
+ if (row >= height) {
+ for (unsigned int col=0; col<block; col++) {
+ *out++ = 0;
+ }
+ continue;
+ }
+
+ for (unsigned int col=0; col<block; col++) {
+ // Column out of range - pad a single entry
+ if (pos + col >= width) {
+ *out++ = 0;
+ continue;
+ }
+
+ if (integrate_sums) {
+ the_sums[row] += in[row][row_offset + pos + col]; // padding entries never reach this point
+ }
+
+ *out++ = in[row][row_offset + pos + col];
+ }
+ }
+ }
+
+ if (integrate_sums) {
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t)); // append the row sums after the interleaved data
+
+ out = reinterpret_cast<TOut *>(out_int32 + int_by); // leave 'out' just past the sums
+ }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
+inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) { // finalise the int32 row-sum block at 'out'
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1); // number of row sums per block
+
+ // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
+ if (row_sum_multiplier) {
+ // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
+ // next block (post sums).
+ // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ out_int32 -= height; // step back to the first of the 'height' sums
+ for (unsigned int i=0; i<height; i++) {
+ out_int32[i] *= row_sum_multiplier; // scale each sum in place
+ }
+ } else {
+ // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
+ // sum block. We need to insert the (zero) sums, and advance 'out'.
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ for (unsigned int i=0; i<height; i++) {
+ out_int32[i] = 0;
+ }
+
+ out_int32 += height; // skip over the zero sums just written
+
+ out = reinterpret_cast<TOut *>(out_int32);
+ }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
+ unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
+ const unsigned int k0, const unsigned int kmax, bool integrate_sums,
+ const int32_t row_sum_multiplier) {
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1); // rows handled per interleave_block call
+
+ // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
+ // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
+ // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
+ // pointers and conditionally overriding the out of range ones.
+
+ // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
+ // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
+ // expensive in highly threaded scenarios.
+ const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+ // Figure out the starting position based on k0 (with rounded length)
+ unsigned int start_string = k0 / rounded_stringlen;
+ unsigned int start_stringpos = k0 % rounded_stringlen;
+
+ // Process blocks of 'height' height...
+ for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
+ // Height to process
+ unsigned int active_height = std::min(ymax - ybase, height);
+
+ // Track our progress through the various strings
+ unsigned int k_left = (kmax - k0);
+ unsigned int string = start_string;
+ unsigned int stringpos = start_stringpos;
+
+ bool first = true; // cleared after the first interleave_block call of this row block
+
+ // Prepare to call 'interleave_block' above for each string encompassed by K range
+ while (k_left > 0) {
+ // Width to process - and the width we will generate (with padding)
+ unsigned int in_width = std::min(k_left, stringlen - stringpos);
+ unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);
+
+ const TIn * const *row_base = ptr[string] + ybase;
+
+ // If not all rows are valid, copy the ones that are into local array (see above comment).
+ if (active_height < height) {
+ for (unsigned int i=0; i<active_height; i++) { // entries [active_height, height) of row_ptrs are left unset
+ row_ptrs[i] = ptr[string][ybase + i];
+ }
+
+ row_base = row_ptrs;
+ }
+
+ // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
+ // much code. However, integrated sums make no sense for non-integral types and won't ever be
+ // requested. So put a type trait check here to avoid generating pointless code.
+ if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+ interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
+ } else {
+ interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
+ }
+
+ k_left -= out_width; // consume the rounded (padded) width, not just in_width
+ string++;
+ stringpos=0; // every string after the first is read from its start
+ first=false;
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums) {
+ FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier); // also runs when the multiplier is zero
+ }
+ }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
+ const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1); // rows handled per interleave_block call
+
+ auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen); // created once and reused for every row block
+
+ // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
+ const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+ for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
+ // How many of the rows are active - the rest will get padded in interleave_block.
+ unsigned int active_height = std::min(ymax - ybase, height);
+ bool first = true; // cleared after the first interleave_block call of this row block
+
+ auto conv_rows = conv_cols.process_rows(ybase, active_height);
+
+ while (!conv_rows.finished()) {
+ unsigned int width, offset;
+
+ // Get next set of parameters (presumably next_block also fills row_ptrs - confirm in convolver.hpp)
+ std::tie(width, offset) = conv_rows.next_block(row_ptrs);
+
+ // Perform the interleave
+ if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+ interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
+ } else {
+ interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
+ }
+
+ first=false;
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums) {
+ FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier); // also runs when the multiplier is zero
+ }
+ }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1); // rows handled per interleave_block call
+
+ // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
+ const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+ const unsigned int width=kmax-k0; // same K range for every row block
+
+ for (unsigned int y=y0; y<ymax; y+=height) {
+ for (unsigned int r=0; r<height; r++) { // pointers are formed for all 'height' rows, including any at or beyond ymax
+ row_ptrs[r] = in + ((y + r) * in_stride);
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+ interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true); // single call per block, so 'first' is always true
+ } else {
+ interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true); // only min(height, ymax-y) rows are passed as active
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums) {
+ FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier); // also runs when the multiplier is zero
+ }
+ }
+}
#include "indirect-interleaves/list.hpp"
@@ -78,7 +316,6 @@ template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t,
/* AArch64 */
#ifdef __aarch64__
-
/* FP32 */
/* Arm® Neon™/SVE implementation (height 8) */
template void IndirectInterleave<8, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
index 011459c157..935958b224 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
@@ -44,8 +44,7 @@ void a32_sgemm_8x6_a55r1(const float *, const float *, float *, int, int, int);
// structure.
class sgemm_8x6 {
public:
- typedef float lhs_operand_type;
- typedef float rhs_operand_type;
+ typedef float operand_type;
typedef float result_type;
typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
@@ -64,7 +63,7 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 8> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 6, 8> transforms = {};
kern_type kernel = a32_sgemm_8x6;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
index 95c2682bf6..32c9515582 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
@@ -56,347 +56,347 @@ void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel,
int k = ((K+3)/4) - 1;
__asm __volatile (
- "vmov.i32 q4, #0\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]\n"
- "vmov.i32 q5, #0\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]\n"
- "vmov.i32 q6, #0\n"
- "ldr r0, [%[a_ptr], #0x10]\n"
- "vmov.i32 q7, #0\n"
- "ldr r1, [%[a_ptr], #0x14]\n"
- "vmov.i32 q8, #0\n"
+ "vmov.i32 q4, #0\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]\n"
+ "vmov.i32 q5, #0\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]\n"
+ "vmov.i32 q6, #0\n"
+ "ldr r0, [%[a_ptr], #0x10]\n"
+ "vmov.i32 q7, #0\n"
+ "ldr r1, [%[a_ptr], #0x14]\n"
+ "vmov.i32 q8, #0\n"
ASM_PREFETCH("[%[a_ptr], #0x40]")
- "vmov.i32 q9, #0\n"
+ "vmov.i32 q9, #0\n"
ASM_PREFETCH("[%[b_ptr], #0x40]")
- "vmov.i32 q10, #0\n"
+ "vmov.i32 q10, #0\n"
ASM_PREFETCH("[%[a_ptr], #0x80]")
- "vmov.i32 q11, #0\n"
+ "vmov.i32 q11, #0\n"
ASM_PREFETCH("[%[b_ptr], #0x80]")
- "vmov.i32 q12, #0\n"
- "vmov.i32 q13, #0\n"
+ "vmov.i32 q12, #0\n"
+ "vmov.i32 q13, #0\n"
ASM_PREFETCH("[%[a_ptr], #0xC0]")
- "vmov.i32 q14, #0\n"
+ "vmov.i32 q14, #0\n"
ASM_PREFETCH("[%[b_ptr], #0XC0]")
- "vmov.i32 q15, #0\n"
- "cmp %[k], #0\n"
- "beq 6f\n"
+ "vmov.i32 q15, #0\n"
+ "cmp %[k], #0\n"
+ "beq 6f\n"
"1:\n"
// Unroll 0
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmov d2, r0, r1\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "ldr r0, [%[b_ptr], #0x18]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "ldr r1, [%[b_ptr], #0x1C]\n"
- "vmla.f32 q6, q2, d1[0]\n"
-
- "vldr d3, [%[a_ptr], #0x18]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d1[1]\n"
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmov d2, r0, r1\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "ldr r0, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "ldr r1, [%[b_ptr], #0x1C]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+
+ "vldr d3, [%[a_ptr], #0x18]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d1[1]\n"
ASM_PREFETCH("[%[a_ptr], #0x100]")
- "vmla.f32 q8, q2, d2[0]\n"
- "vmla.f32 q9, q2, d2[1]\n"
-
- "vldr d4, [%[b_ptr], #0x20]\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "ldr r0, [%[b_ptr], #0x28]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "ldr r1, [%[b_ptr], #0x2C]\n"
- "vmla.f32 q12, q3, d1[0]\n"
-
- "vldr d0, [%[a_ptr], #0x20]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "ldr r0, [%[a_ptr], #0x28]\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "ldr r1, [%[a_ptr], #0x2C]\n"
- "vmla.f32 q15, q3, d2[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x20]\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "ldr r0, [%[b_ptr], #0x28]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "ldr r1, [%[b_ptr], #0x2C]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+
+ "vldr d0, [%[a_ptr], #0x20]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "ldr r0, [%[a_ptr], #0x28]\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "ldr r1, [%[a_ptr], #0x2C]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
// Unroll 1
- "vldr d6, [%[b_ptr], #0x30]\n"
- "vmov d1, r0, r1\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "ldr r0, [%[b_ptr], #0x38]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "ldr r1, [%[b_ptr], #0x3C]\n"
- "vmla.f32 q6, q2, d0[0]\n"
-
- "vldr d2, [%[a_ptr], #0x30]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d0[1]\n"
+ "vldr d6, [%[b_ptr], #0x30]\n"
+ "vmov d1, r0, r1\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "ldr r0, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "ldr r1, [%[b_ptr], #0x3C]\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+
+ "vldr d2, [%[a_ptr], #0x30]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d0[1]\n"
ASM_PREFETCH("[%[b_ptr], #0x100]")
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
-
- "vldr d4, [%[b_ptr], #0x40]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "ldr r0, [%[b_ptr], #0x48]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "ldr r1, [%[b_ptr], #0x4C]\n"
- "vmla.f32 q12, q3, d0[0]\n"
-
- "vldr d3, [%[a_ptr], #0x38]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "ldr r0, [%[a_ptr], #0x40]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "ldr r1, [%[a_ptr], #0x44]\n"
- "vmla.f32 q15, q3, d1[1]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x40]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "ldr r0, [%[b_ptr], #0x48]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "ldr r1, [%[b_ptr], #0x4C]\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+
+ "vldr d3, [%[a_ptr], #0x38]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "ldr r0, [%[a_ptr], #0x40]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "ldr r1, [%[a_ptr], #0x44]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
// Unroll 2
- "vldr d6, [%[b_ptr], #0x50]\n"
- "vmov d0, r0, r1\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "ldr r0, [%[b_ptr], #0x58]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "ldr r1, [%[b_ptr], #0x5C]\n"
- "vmla.f32 q6, q2, d3[0]\n"
-
- "vldr d1, [%[a_ptr], #0x48]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d3[1]\n"
+ "vldr d6, [%[b_ptr], #0x50]\n"
+ "vmov d0, r0, r1\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "ldr r0, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "ldr r1, [%[b_ptr], #0x5C]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+
+ "vldr d1, [%[a_ptr], #0x48]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d3[1]\n"
ASM_PREFETCH("[%[a_ptr], #0x140]")
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
-
- "vldr d4, [%[b_ptr], #0x60]\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "ldr r0, [%[b_ptr], #0x68]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "ldr r1, [%[b_ptr], #0x6C]\n"
- "vmla.f32 q12, q3, d3[0]\n"
-
- "vldr d2, [%[a_ptr], #0x50]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "ldr r0, [%[a_ptr], #0x58]\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "ldr r1, [%[a_ptr], #0x5C]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "add %[a_ptr], %[a_ptr], #0x60\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x60]\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "ldr r0, [%[b_ptr], #0x68]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "ldr r1, [%[b_ptr], #0x6C]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+
+ "vldr d2, [%[a_ptr], #0x50]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "ldr r0, [%[a_ptr], #0x58]\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "ldr r1, [%[a_ptr], #0x5C]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x60\n"
// Unroll 3
- "vldr d6, [%[b_ptr], #0x70]\n"
- "vmov d3, r0, r1\n"
- "vmla.f32 q4, q2, d1[0]\n"
- "ldr r0, [%[b_ptr], #0x78]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "ldr r1, [%[b_ptr], #0x7C]\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
-
- "vldr d0, [%[a_ptr], #0x00]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d2[1]\n"
+ "vldr d6, [%[b_ptr], #0x70]\n"
+ "vmov d3, r0, r1\n"
+ "vmla.f32 q4, q2, d1[0]\n"
+ "ldr r0, [%[b_ptr], #0x78]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "ldr r1, [%[b_ptr], #0x7C]\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
+
+ "vldr d0, [%[a_ptr], #0x00]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d2[1]\n"
ASM_PREFETCH("[%[b_ptr], #0xC0]")
- "vmla.f32 q8, q2, d3[0]\n"
- "vmla.f32 q9, q2, d3[1]\n"
-
- "vldr d4, [%[b_ptr], #0x00]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "ldr r0, [%[b_ptr], #0x08]\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "ldr r1, [%[b_ptr], #0x0C]\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "subs %[k], %[k], #1\n"
-
- "vldr d1, [%[a_ptr], #0x08]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "ldr r0, [%[a_ptr], #0x10]\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "ldr r1, [%[a_ptr], #0x14]\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "bne 1b\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x00]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "ldr r0, [%[b_ptr], #0x08]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "ldr r1, [%[b_ptr], #0x0C]\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "subs %[k], %[k], #1\n"
+
+ "vldr d1, [%[a_ptr], #0x08]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "ldr r0, [%[a_ptr], #0x10]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "ldr r1, [%[a_ptr], #0x14]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
// "Tails" shows how many multiply blocks are needed at the
// end, must be 1-4 inclusive. Bail out to alternative tail
// immediately if it's 1.
"6:\n"
- "subs %[tails], %[tails], #1\n"
- "beq 3f\n"
+ "subs %[tails], %[tails], #1\n"
+ "beq 3f\n"
// Detached final iteration - for now adapt the generic
// tails rather than reimplementing for A53.
// Unroll 0
- "vmov d2, r0, r1\n"
- "add %[a_ptr], %[a_ptr], #0x18\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "add %[b_ptr], %[b_ptr], #0x10\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 4f\n"
+ "vmov d2, r0, r1\n"
+ "add %[a_ptr], %[a_ptr], #0x18\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "add %[b_ptr], %[b_ptr], #0x10\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 4f\n"
// Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 5f\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 5f\n"
// Unroll 2
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
// Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
// tails==1 final tail
"3:\n"
- "vmov d2, r0, r1\n"
- "add %[b_ptr], %[b_ptr], #0x10\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "add %[a_ptr], %[a_ptr], #0x18\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
+ "vmov d2, r0, r1\n"
+ "add %[b_ptr], %[b_ptr], #0x10\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "add %[a_ptr], %[a_ptr], #0x18\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
// tails==2 final tail
"4:\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
// tails==3 final tail
"5:\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vld1.32 {d0}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vld1.32 {d0}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
"2:\n"
- "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
: [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
index 54e0a26843..972b66ed50 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
@@ -62,345 +62,345 @@ void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel
a_ptr = a_ptr0;
__asm __volatile (
- "vldr d0, [%[a_ptr]]\n"
- "vmov.i32 q4, #0\n"
- "vldr d1, [%[a_ptr], #0x08]\n"
- "vmov.i32 q5, #0\n"
- "vldr d4, [%[b_ptr]]\n"
- "vmov.i32 q6, #0\n"
- "vldr d5, [%[b_ptr], #0x08]\n"
- "vmov.i32 q7, #0\n"
- "vldr d2, [%[a_ptr], #0x10]\n"
- "vmov.i32 q8, #0\n"
+ "vldr d0, [%[a_ptr]]\n"
+ "vmov.i32 q4, #0\n"
+ "vldr d1, [%[a_ptr], #0x08]\n"
+ "vmov.i32 q5, #0\n"
+ "vldr d4, [%[b_ptr]]\n"
+ "vmov.i32 q6, #0\n"
+ "vldr d5, [%[b_ptr], #0x08]\n"
+ "vmov.i32 q7, #0\n"
+ "vldr d2, [%[a_ptr], #0x10]\n"
+ "vmov.i32 q8, #0\n"
ASM_PREFETCH("[%[b_ptr], #0x40]")
- "vmov.i32 q9, #0\n"
+ "vmov.i32 q9, #0\n"
ASM_PREFETCH("[%[a_ptr], #0x40]")
- "vmov.i32 q10, #0\n"
+ "vmov.i32 q10, #0\n"
ASM_PREFETCH("[%[b_ptr], #0x80]")
- "vmov.i32 q11, #0\n"
+ "vmov.i32 q11, #0\n"
ASM_PREFETCH("[%[a_ptr], #0x80]")
- "vmov.i32 q12, #0\n"
+ "vmov.i32 q12, #0\n"
ASM_PREFETCH("[%[b_ptr], #0XC0]")
- "vmov.i32 q13, #0\n"
+ "vmov.i32 q13, #0\n"
ASM_PREFETCH("[%[a_ptr], #0xC0]")
- "vmov.i32 q14, #0\n"
+ "vmov.i32 q14, #0\n"
ASM_PREFETCH("[%[b_ptr], #0x100]")
- "vmov.i32 q15, #0\n"
+ "vmov.i32 q15, #0\n"
ASM_PREFETCH("[%[a_ptr], #0x100]")
- "cmp %[k], #0\n"
+ "cmp %[k], #0\n"
ASM_PREFETCH("[%[b_ptr], #0x140]")
- "beq 6f\n"
+ "beq 6f\n"
ASM_PREFETCH("[%[b_ptr], #0x180]")
"1:\n"
// Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vldr d7, [%[b_ptr], #0x18]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vldr d3, [%[a_ptr], #0x18]\n"
- "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vldr d7, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vldr d3, [%[a_ptr], #0x18]\n"
+ "vmla.f32 q7, q2, d1[1]\n"
ASM_PREFETCH("[%[a_ptr], #0x140]")
- "vmla.f32 q8, q2, d2[0]\n"
- "subs %[k], %[k], #1\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vldr d4, [%[b_ptr], #0x20]\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vldr d5, [%[b_ptr], #0x28]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x20]\n"
- "vmla.f32 q12, q3, d1[0]\n"
-
- "vmla.f32 q13, q3, d1[1]\n"
- "vldr d1, [%[a_ptr], #0x28]\n"
- "vmla.f32 q14, q3, d2[0]\n"
-
- "vmla.f32 q15, q3, d2[1]\n"
- "vldr d6, [%[b_ptr], #0x30]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "subs %[k], %[k], #1\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vldr d4, [%[b_ptr], #0x20]\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vldr d5, [%[b_ptr], #0x28]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x20]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vldr d1, [%[a_ptr], #0x28]\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vldr d6, [%[b_ptr], #0x30]\n"
// Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vldr d7, [%[b_ptr], #0x38]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vldr d2, [%[a_ptr], #0x30]\n"
- "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vldr d7, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vldr d2, [%[a_ptr], #0x30]\n"
+ "vmla.f32 q6, q2, d0[0]\n"
- "vmla.f32 q7, q2, d0[1]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
ASM_PREFETCH("[%[b_ptr], #0x1C0]")
- "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vldr d4, [%[b_ptr], #0x40]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vldr d5, [%[b_ptr], #0x48]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vldr d3, [%[a_ptr], #0x38]\n"
- "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vldr d4, [%[b_ptr], #0x40]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vldr d5, [%[b_ptr], #0x48]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vldr d3, [%[a_ptr], #0x38]\n"
+ "vmla.f32 q12, q3, d0[0]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x40]\n"
- "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x40]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vldr d6, [%[b_ptr], #0x50]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vldr d6, [%[b_ptr], #0x50]\n"
// Unroll 2
- "vmla.f32 q4, q2, d2[0]\n"
- "vldr d7, [%[b_ptr], #0x58]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vldr d1, [%[a_ptr], #0x48]\n"
- "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vldr d7, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vldr d1, [%[a_ptr], #0x48]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
- "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
ASM_PREFETCH("[%[a_ptr], #0x180]")
- "vmla.f32 q8, q2, d0[0]\n"
-
- "vmla.f32 q9, q2, d0[1]\n"
- "vldr d4, [%[b_ptr], #0x60]\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vldr d5, [%[b_ptr], #0x68]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vldr d2, [%[a_ptr], #0x50]\n"
- "vmla.f32 q12, q3, d3[0]\n"
-
- "vmla.f32 q13, q3, d3[1]\n"
- "vldr d3, [%[a_ptr], #0x58]\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "add %[a_ptr], %[a_ptr], #0x60\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vldr d6, [%[b_ptr], #0x70]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vldr d4, [%[b_ptr], #0x60]\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vldr d5, [%[b_ptr], #0x68]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vldr d2, [%[a_ptr], #0x50]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vldr d3, [%[a_ptr], #0x58]\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "add %[a_ptr], %[a_ptr], #0x60\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vldr d6, [%[b_ptr], #0x70]\n"
// Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vldr d7, [%[b_ptr], #0x78]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vldr d0, [%[a_ptr], #0x00]\n"
- "vmla.f32 q7, q2, d2[1]\n"
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vldr d7, [%[b_ptr], #0x78]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vldr d0, [%[a_ptr], #0x00]\n"
+ "vmla.f32 q7, q2, d2[1]\n"
ASM_PREFETCH("[%[b_ptr], #0x180]")
- "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q8, q2, d3[0]\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vldr d4, [%[b_ptr], #0x00]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vldr d5, [%[b_ptr], #0x08]\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vldr d1, [%[a_ptr], #0x08]\n"
- "vmla.f32 q12, q3, d2[0]\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vldr d4, [%[b_ptr], #0x00]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vldr d5, [%[b_ptr], #0x08]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vldr d1, [%[a_ptr], #0x08]\n"
+ "vmla.f32 q12, q3, d2[0]\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vldr d2, [%[a_ptr], #0x10]\n"
- "vmla.f32 q14, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vldr d2, [%[a_ptr], #0x10]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "bne 1b\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
// "Tails" shows how many multiply blocks are needed at the
// end, must be 1-4 inclusive. Bail out to alternative tail
// immediately if it's 1.
"6:\n"
- "subs %[tails], %[tails], #1\n"
- "beq 3f\n"
+ "subs %[tails], %[tails], #1\n"
+ "beq 3f\n"
// Detached final iteration
// Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vldr d7, [%[b_ptr], #0x18]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vldr d3, [%[a_ptr], #0x18]\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vldr d4, [%[b_ptr], #0x20]\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vldr d5, [%[b_ptr], #0x28]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x20]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vldr d1, [%[a_ptr], #0x28]\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "beq 4f\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vldr d7, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vldr d3, [%[a_ptr], #0x18]\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vldr d4, [%[b_ptr], #0x20]\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vldr d5, [%[b_ptr], #0x28]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x20]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vldr d1, [%[a_ptr], #0x28]\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "beq 4f\n"
// Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vldr d6, [%[b_ptr], #0x30]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vldr d7, [%[b_ptr], #0x38]\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vldr d2, [%[a_ptr], #0x30]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q8, q2, d1[0]\n"
-
- "vmla.f32 q9, q2, d1[1]\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vldr d4, [%[b_ptr], #0x40]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vldr d5, [%[b_ptr], #0x48]\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vldr d3, [%[a_ptr], #0x38]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x40]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "beq 5f\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vldr d6, [%[b_ptr], #0x30]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vldr d7, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vldr d2, [%[a_ptr], #0x30]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+
+ "vmla.f32 q9, q2, d1[1]\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vldr d4, [%[b_ptr], #0x40]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vldr d5, [%[b_ptr], #0x48]\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vldr d3, [%[a_ptr], #0x38]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x40]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "beq 5f\n"
// Unroll 2
- "vmla.f32 q4, q2, d2[0]\n"
- "vldr d6, [%[b_ptr], #0x50]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vldr d7, [%[b_ptr], #0x58]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vldr d1, [%[a_ptr], #0x48]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vldr d4, [%[b_ptr], #0x60]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vldr d5, [%[b_ptr], #0x68]\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vldr d2, [%[a_ptr], #0x50]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vldr d3, [%[a_ptr], #0x58]\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vldr d6, [%[b_ptr], #0x50]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vldr d7, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vldr d1, [%[a_ptr], #0x48]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vldr d4, [%[b_ptr], #0x60]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vldr d5, [%[b_ptr], #0x68]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vldr d2, [%[a_ptr], #0x50]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vldr d3, [%[a_ptr], #0x58]\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
// Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vldr d6, [%[b_ptr], #0x70]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vldr d7, [%[b_ptr], #0x78]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x60\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- "b 2f\n"
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vldr d6, [%[b_ptr], #0x70]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vldr d7, [%[b_ptr], #0x78]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x60\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
+ "b 2f\n"
// tails==1 final tail
"3:\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vldr d7, [%[b_ptr], #0x18]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x18\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x20\n"
- "b 2f\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vldr d7, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x18\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x20\n"
+ "b 2f\n"
// tails==2 final tail
"4:\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "vldr d6, [%[b_ptr], #0x30]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vldr d7, [%[b_ptr], #0x38]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x30\n"
- "b 2f\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vldr d6, [%[b_ptr], #0x30]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vldr d7, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x40\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x30\n"
+ "b 2f\n"
// tails==3 final tail
"5:\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vldr d6, [%[b_ptr], #0x50]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vldr d7, [%[b_ptr], #0x58]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x48\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vldr d6, [%[b_ptr], #0x50]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vldr d7, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x48\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x60\n"
"2:\n"
- "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
: [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
index b230dc1fb7..877247e052 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
@@ -56,110 +56,110 @@ void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int
int k = ((K+3)/4) - 1;
__asm __volatile (
- "vmov.i32 q4, #0\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmov.i32 q5, #0\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
- "vmov.i32 q6, #0\n"
+ "vmov.i32 q4, #0\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmov.i32 q5, #0\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+ "vmov.i32 q6, #0\n"
ASM_PREFETCH("[%[a_ptr], #48]")
- "vmov.i32 q7, #0\n"
+ "vmov.i32 q7, #0\n"
ASM_PREFETCH("[%[b_ptr], #48]")
- "vmov.i32 q8, #0\n"
+ "vmov.i32 q8, #0\n"
ASM_PREFETCH("[%[a_ptr], #112]")
- "vmov.i32 q9, #0\n"
+ "vmov.i32 q9, #0\n"
ASM_PREFETCH("[%[b_ptr], #112]")
- "vmov.i32 q10, #0\n"
- "vmov.i32 q11, #0\n"
- "vmov.i32 q12, #0\n"
- "vmov.i32 q13, #0\n"
+ "vmov.i32 q10, #0\n"
+ "vmov.i32 q11, #0\n"
+ "vmov.i32 q12, #0\n"
+ "vmov.i32 q13, #0\n"
ASM_PREFETCH("[%[a_ptr], #176]")
- "vmov.i32 q14, #0\n"
+ "vmov.i32 q14, #0\n"
ASM_PREFETCH("[%[b_ptr], #176]")
- "vmov.i32 q15, #0\n"
+ "vmov.i32 q15, #0\n"
- "cmp %[k], #0\n"
- "beq 6f\n"
+ "cmp %[k], #0\n"
+ "beq 6f\n"
"1:\n"
// Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
// Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "subs %[k], %[k], #1\n"
- "vmla.f32 q5, q2, d3[1]\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "subs %[k], %[k], #1\n"
+ "vmla.f32 q5, q2, d3[1]\n"
ASM_PREFETCH("[%[a_ptr], #208]")
- "vmla.f32 q6, q2, d0[0]\n"
- "vmla.f32 q7, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
// Unroll 2
- "vmla.f32 q4, q2, d2[0]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
ASM_PREFETCH("[%[a_ptr], #240]")
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vmla.f32 q11, q3, d2[1]\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
ASM_PREFETCH("[%[b_ptr], #208]")
- "vmla.f32 q12, q3, d3[0]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
// Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d1[0]\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "bne 1b\n"
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
// Branch here if we never execute main loop.
"6:\n"
@@ -167,182 +167,182 @@ void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int
// "Tails" shows how many multiply blocks are needed at the
// end, must be 1-4 inclusive. Bail out to alternative tail
// immediately if it's 1.
- "subs %[tails], %[tails], #1\n"
- "beq 3f\n"
+ "subs %[tails], %[tails], #1\n"
+ "beq 3f\n"
// Detached final iteration
// Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 4f\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 4f\n"
// Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 5f\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 5f\n"
// Unroll 2
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
// Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
// tails==1 final tail
"3:\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d2}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
// tails==2 final tail
"4:\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
// tails==3 final tail
"5:\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vld1.32 {d0}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vld1.32 {d0}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
"2:\n"
- "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
: [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
index fe939b1084..8d4146ab3a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
@@ -82,7 +82,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
index 022a34fdcd..d713ca5d53 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
@@ -50,19 +50,18 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -83,7 +82,6 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
ka.B_stride = B_stride;
switch(act.type) {
default:
@@ -108,19 +106,19 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"bgt 77f\n"
"beq 39f\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 3f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -133,15 +131,15 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cbz x15, 4f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"b 16f\n"
@@ -232,8 +230,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"17:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 18f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -253,11 +251,7 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q6, [x12, #0x10]\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Main loop head
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x10\n"
- "add x26, x26, #0x10\n"
"trn1 v20.2d, v1.2d, v21.2d\n"
- "trn2 v1.2d, v1.2d, v21.2d\n"
".inst 0x6e47ee88 // bfmmla v8.4s, v20.8h, v7.8h\n"
"ldr q17, [x11, #0x0]\n"
".inst 0x6e46ee8c // bfmmla v12.4s, v20.8h, v6.8h\n"
@@ -270,37 +264,38 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x9, #0x0]\n"
".inst 0x6e51ee8e // bfmmla v14.4s, v20.8h, v17.8h\n"
"ldr q17, [x9, #0x10]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
".inst 0x6e52ee8b // bfmmla v11.4s, v20.8h, v18.8h\n"
"ldr q18, [x12, #0x20]\n"
".inst 0x6e51ee8f // bfmmla v15.4s, v20.8h, v17.8h\n"
"ldr q17, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
"ldr q18, [x11, #0x20]\n"
".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
"ldr q17, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
"ldr q17, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
"ldr q18, [x9, #0x20]\n"
".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
"ldr q17, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
- "ldr q7, [x12, #0x0]\n"
".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"ldr q1, [x26, #0x0]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x12, #0x0]\n"
+ "add x11, x11, #0x40\n"
"ldr q6, [x12, #0x10]\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
"bge 20b\n"
"21:" // Height 1: Multiply loop: Single iteration only
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "trn1 v19.2d, v1.2d, v17.2d\n"
- "trn2 v1.2d, v1.2d, v17.2d\n"
+ "trn1 v19.2d, v1.2d, v20.2d\n"
".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
"ldr q17, [x11, #0x0]\n"
".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
@@ -312,58 +307,61 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
"ldr q17, [x9, #0x0]\n"
".inst 0x6e52ee6e // bfmmla v14.4s, v19.8h, v18.8h\n"
- "ldr q25, [x9, #0x10]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
".inst 0x6e51ee6b // bfmmla v11.4s, v19.8h, v17.8h\n"
- "ldr q17, [x12, #0x20]\n"
- ".inst 0x6e59ee6f // bfmmla v15.4s, v19.8h, v25.8h\n"
- "ldr q3, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
- ".inst 0x6e51ec28 // bfmmla v8.4s, v1.8h, v17.8h\n"
+ "ldr q18, [x12, #0x20]\n"
+ ".inst 0x6e58ee6f // bfmmla v15.4s, v19.8h, v24.8h\n"
+ "ldr q17, [x12, #0x30]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
"ldr q19, [x11, #0x20]\n"
- ".inst 0x6e43ec2c // bfmmla v12.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
"ldr q17, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e53ec29 // bfmmla v9.4s, v1.8h, v19.8h\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
"ldr q17, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
"ldr q18, [x9, #0x20]\n"
".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
"ldr q17, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "add x26, x26, #0x10\n"
+ "add x12, x12, #0x40\n"
+ "add x11, x11, #0x40\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
"22:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 27f\n"
"cmp x27, #0x4\n"
"blt 24f\n"
"23:" // Height 1: Multiply loop: Odd block loop
"ldr d19, [x26], #0x8\n"
- "ldr q20, [x12, #0x0]\n"
- "sub x27, x27, #0x4\n"
+ "ldr q18, [x12, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
"ldr q17, [x12, #0x10]\n"
- "cmp x27, #0x4\n"
- "add x12, x12, #0x20\n"
- "trn1 v19.2d, v19.2d, v18.2d\n"
- ".inst 0x6e54ee68 // bfmmla v8.4s, v19.8h, v20.8h\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
"ldr q18, [x11, #0x0]\n"
".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
"ldr q17, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
"ldr q18, [x10, #0x0]\n"
".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
"ldr q17, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
"ldr q18, [x9, #0x0]\n"
".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
"ldr q17, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
"bge 23b\n"
"24:" // Height 1: Multiply loop: Skip odd blocks
"cbz x27, 27f\n"
@@ -377,25 +375,25 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"26:" // Height 1: Multiply loop: Ragged operand read: Done
"ldr q20, [x12, #0x0]\n"
"ldr q18, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
"trn1 v19.2d, v1.2d, v17.2d\n"
".inst 0x6e54ee68 // bfmmla v8.4s, v19.8h, v20.8h\n"
"ldr q17, [x11, #0x0]\n"
".inst 0x6e52ee6c // bfmmla v12.4s, v19.8h, v18.8h\n"
"ldr q18, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n"
"ldr q17, [x10, #0x0]\n"
".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n"
- "ldr q2, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
+ "ldr q6, [x10, #0x10]\n"
".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
"ldr q18, [x9, #0x0]\n"
- ".inst 0x6e42ee6e // bfmmla v14.4s, v19.8h, v2.8h\n"
+ ".inst 0x6e46ee6e // bfmmla v14.4s, v19.8h, v6.8h\n"
"ldr q17, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
"27:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -406,9 +404,9 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"uzp1 v10.2d, v10.2d, v14.2d\n"
"uzp1 v11.2d, v11.2d, v15.2d\n"
"tbz %x[flags], #1, 28f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v18.4s }, [x21]\n"
"ld1r { v17.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v18.4s\n"
"fmin v9.4s, v9.4s, v18.4s\n"
@@ -479,19 +477,19 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"b 230f\n"
"39:" // Height 2
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"40:" // Height 2: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 41f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -504,15 +502,15 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cbz x15, 42f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"b 54f\n"
@@ -520,75 +518,75 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"tbz %x[flags], #0, 53f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x14, #0x10\n"
- "add x26, x13, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"bge 51f\n"
"tbz x14, #3, 46f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
"tbz x14, #2, 44f\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
"tbz x14, #1, 43f\n"
"ldr d16, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
"tbz x14, #0, 50f\n"
"ld1 { v16.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x25]\n"
"b 50f\n"
"43:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 50f\n"
"ldr s16, [x13, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
"b 50f\n"
"44:" // Height 2: Partial accumulate: partial_2_8
"tbz x14, #1, 45f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
"tbz x14, #0, 50f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x25]\n"
"b 50f\n"
"45:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 50f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
"b 50f\n"
"46:" // Height 2: Partial accumulate: partial_4_0
"tbz x14, #2, 48f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"tbz x14, #1, 47f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
"tbz x14, #0, 50f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v13.s }[2], [x25]\n"
"b 50f\n"
"47:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 50f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
"b 50f\n"
"48:" // Height 2: Partial accumulate: partial_2_0
"tbz x14, #1, 49f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
"tbz x14, #0, 50f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v12.s }[2], [x25]\n"
"b 50f\n"
"49:" // Height 2: Partial accumulate: partial_1_0
"ldr s9, [x13, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
"50:" // Height 2: Partial accumulate: Done
"sub x13, x13, x20\n"
@@ -598,10 +596,10 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q10, [x13, #0x10]\n"
"ldr q11, [x13, #0x20]\n"
"ldr q16, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"52:" // Height 2: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -625,8 +623,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"55:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 56f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -651,12 +649,6 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"blt 59f\n"
"58:" // Height 2: Multiply loop: Main loop head
"trn1 v19.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x10\n"
- "add x25, x25, #0x10\n"
- "ldr q2, [x25, #0x0]\n"
".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
"ldr q18, [x11, #0x0]\n"
".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
@@ -669,38 +661,40 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x9, #0x0]\n"
".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
"ldr q17, [x9, #0x10]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
"ldr q18, [x12, #0x20]\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
"ldr q17, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
"ldr q18, [x11, #0x20]\n"
".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
"ldr q17, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
"ldr q17, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
"ldr q18, [x9, #0x20]\n"
".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
"ldr q17, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "ldr q2, [x25, #0x0]\n"
".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ "add x12, x12, #0x40\n"
"ldr q7, [x12, #0x0]\n"
".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"ldr q1, [x26, #0x0]\n"
"ldr q6, [x12, #0x10]\n"
+ "add x11, x11, #0x40\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
"bge 58b\n"
"59:" // Height 2: Multiply loop: Single iteration only
"trn1 v19.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
"ldr q18, [x11, #0x0]\n"
".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
@@ -713,58 +707,62 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x9, #0x0]\n"
".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
"ldr q17, [x9, #0x10]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
"ldr q18, [x12, #0x20]\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
"ldr q17, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
"ldr q18, [x11, #0x20]\n"
".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
"ldr q17, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
"ldr q17, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
"ldr q18, [x9, #0x20]\n"
".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
"ldr q17, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x12, x12, #0x40\n"
+ "add x11, x11, #0x40\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
"60:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 65f\n"
"cmp x27, #0x4\n"
"blt 62f\n"
"61:" // Height 2: Multiply loop: Odd block loop
- "ldr d20, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
"sub x27, x27, #0x4\n"
"ldr q18, [x12, #0x0]\n"
"ldr q17, [x12, #0x10]\n"
- "cmp x27, #0x4\n"
- "add x12, x12, #0x20\n"
- "trn1 v19.2d, v20.2d, v19.2d\n"
".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
- "ldr q26, [x11, #0x0]\n"
".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q26, [x11, #0x0]\n"
"ldr q6, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
".inst 0x6e5aee69 // bfmmla v9.4s, v19.8h, v26.8h\n"
- "ldr q18, [x10, #0x0]\n"
".inst 0x6e46ee6d // bfmmla v13.4s, v19.8h, v6.8h\n"
+ "ldr q18, [x10, #0x0]\n"
"ldr q17, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
- "ldr q18, [x9, #0x0]\n"
".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x9, #0x0]\n"
"ldr q17, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
+ "cmp x27, #0x4\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
"bge 61b\n"
"62:" // Height 2: Multiply loop: Skip odd blocks
"cbz x27, 65f\n"
@@ -782,24 +780,24 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x12, #0x0]\n"
"ldr q17, [x12, #0x10]\n"
"trn1 v19.2d, v1.2d, v2.2d\n"
- "add x12, x12, #0x20\n"
".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
"ldr q18, [x11, #0x0]\n"
".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
"ldr q17, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
- "ldr q30, [x10, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
- "ldr q31, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
- ".inst 0x6e5eee6a // bfmmla v10.4s, v19.8h, v30.8h\n"
+ "ldr q27, [x10, #0x10]\n"
+ ".inst 0x6e43ee6a // bfmmla v10.4s, v19.8h, v3.8h\n"
"ldr q18, [x9, #0x0]\n"
- ".inst 0x6e5fee6e // bfmmla v14.4s, v19.8h, v31.8h\n"
+ ".inst 0x6e5bee6e // bfmmla v14.4s, v19.8h, v27.8h\n"
"ldr q17, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
"65:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -808,17 +806,17 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
+ "add x25, x13, x20, LSL #2\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "add x26, x13, x20, LSL #2\n"
"tbz %x[flags], #1, 66f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v18.4s }, [x21]\n"
"ld1r { v17.4s }, [x20]\n"
"fmin v7.4s, v7.4s, v18.4s\n"
"fmin v12.4s, v12.4s, v18.4s\n"
@@ -842,63 +840,63 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"tbz x14, #3, 70f\n"
"st1 { v7.4s }, [x13], #0x10\n"
"st1 { v12.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
"tbz x14, #2, 68f\n"
"st1 { v13.4s }, [x13], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
"tbz x14, #1, 67f\n"
"str d14, [x13], #0x8\n"
- "str d11, [x26], #0x8\n"
+ "str d11, [x25], #0x8\n"
"tbz x14, #0, 74f\n"
"st1 { v14.s }[2], [x13]\n"
- "st1 { v11.s }[2], [x26]\n"
+ "st1 { v11.s }[2], [x25]\n"
"b 74f\n"
"67:" // Height 2: Partial direct writeback: partial_1_12
"tbz x14, #0, 74f\n"
"str s14, [x13, #0x0]\n"
- "str s11, [x26, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
"b 74f\n"
"68:" // Height 2: Partial direct writeback: partial_2_8
"tbz x14, #1, 69f\n"
"str d13, [x13], #0x8\n"
- "str d10, [x26], #0x8\n"
+ "str d10, [x25], #0x8\n"
"tbz x14, #0, 74f\n"
"st1 { v13.s }[2], [x13]\n"
- "st1 { v10.s }[2], [x26]\n"
+ "st1 { v10.s }[2], [x25]\n"
"b 74f\n"
"69:" // Height 2: Partial direct writeback: partial_1_8
"tbz x14, #0, 74f\n"
"str s13, [x13, #0x0]\n"
- "str s10, [x26, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
"b 74f\n"
"70:" // Height 2: Partial direct writeback: partial_4_0
"tbz x14, #2, 72f\n"
"st1 { v7.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
"tbz x14, #1, 71f\n"
"str d12, [x13], #0x8\n"
- "str d9, [x26], #0x8\n"
+ "str d9, [x25], #0x8\n"
"tbz x14, #0, 74f\n"
"st1 { v12.s }[2], [x13]\n"
- "st1 { v9.s }[2], [x26]\n"
+ "st1 { v9.s }[2], [x25]\n"
"b 74f\n"
"71:" // Height 2: Partial direct writeback: partial_1_4
"tbz x14, #0, 74f\n"
"str s12, [x13, #0x0]\n"
- "str s9, [x26, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
"b 74f\n"
"72:" // Height 2: Partial direct writeback: partial_2_0
"tbz x14, #1, 73f\n"
"str d7, [x13], #0x8\n"
- "str d8, [x26], #0x8\n"
+ "str d8, [x25], #0x8\n"
"tbz x14, #0, 74f\n"
"st1 { v7.s }[2], [x13]\n"
- "st1 { v8.s }[2], [x26]\n"
+ "st1 { v8.s }[2], [x25]\n"
"b 74f\n"
"73:" // Height 2: Partial direct writeback: partial_1_0
"str s7, [x13, #0x0]\n"
- "str s8, [x26, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
"74:" // Height 2: Partial direct writeback: Done
"b 76f\n"
"75:" // Height 2: Full writeback
@@ -907,29 +905,29 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"str q13, [x13, #0x20]\n"
"str q14, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
"76:" // Height 2: Writeback done
"subs x14, x14, #0x10\n"
"bgt 40b\n"
"b 230f\n"
"77:" // Height 3
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"78:" // Height 3: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 79f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -942,15 +940,15 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cbz x15, 80f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -965,94 +963,94 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"80:" // Height 3: no bias
"tbz %x[flags], #0, 91f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
"cmp x14, #0x10\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 89f\n"
"tbz x14, #3, 84f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
"tbz x14, #2, 82f\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
"tbz x14, #1, 81f\n"
"ldr d16, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
"tbz x14, #0, 88f\n"
"ld1 { v16.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
"b 88f\n"
"81:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 88f\n"
"ldr s16, [x13, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
"b 88f\n"
"82:" // Height 3: Partial accumulate: partial_2_8
"tbz x14, #1, 83f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x14, #0, 88f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
"b 88f\n"
"83:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 88f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
"b 88f\n"
"84:" // Height 3: Partial accumulate: partial_4_0
"tbz x14, #2, 86f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"tbz x14, #1, 85f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x14, #0, 88f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
"b 88f\n"
"85:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 88f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
"b 88f\n"
"86:" // Height 3: Partial accumulate: partial_2_0
"tbz x14, #1, 87f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x14, #0, 88f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 88f\n"
"87:" // Height 3: Partial accumulate: partial_1_0
"ldr s9, [x13, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
"88:" // Height 3: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 90f\n"
@@ -1061,14 +1059,14 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q10, [x13, #0x10]\n"
"ldr q11, [x13, #0x20]\n"
"ldr q16, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
"90:" // Height 3: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -1108,8 +1106,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"93:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 94f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1139,54 +1137,54 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"96:" // Height 3: Multiply loop: Main loop head
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
- ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
- "trn2 v3.2d, v3.2d, v25.2d\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
"ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
"ldr q25, [x11, #0x10]\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x0]\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x10]\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "cmp x27, #0x10\n"
".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
"ldr q26, [x9, #0x0]\n"
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x9, #0x10]\n"
".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
"ldr q26, [x12, #0x20]\n"
".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"ldr q25, [x12, #0x30]\n"
"ldr q2, [x25, #0x0]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
"ldr q26, [x11, #0x20]\n"
".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ "add x12, x12, #0x40\n"
".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
"ldr q25, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "add x11, x11, #0x40\n"
".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0x20]\n"
".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ "add x10, x10, #0x40\n"
".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
"ldr q26, [x9, #0x20]\n"
".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
@@ -1205,52 +1203,52 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"97:" // Height 3: Multiply loop: Single iteration only
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
- ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
- "trn2 v3.2d, v3.2d, v25.2d\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
"ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
"ldr q25, [x11, #0x10]\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x0]\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x10]\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
"ldr q26, [x9, #0x0]\n"
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x9, #0x10]\n"
".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
"ldr q26, [x12, #0x20]\n"
".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"ldr q25, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ "add x12, x12, #0x40\n"
".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
"ldr q26, [x11, #0x20]\n"
".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
"ldr q25, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "add x11, x11, #0x40\n"
".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0x20]\n"
".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ "add x10, x10, #0x40\n"
".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
"ldr q26, [x9, #0x20]\n"
".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
@@ -1266,40 +1264,40 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 100f\n"
"99:" // Height 3: Multiply loop: Odd block loop
- "ldr d30, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "sub x27, x27, #0x4\n"
- "ldr d27, [x24], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
"ldr q26, [x12, #0x0]\n"
- "cmp x27, #0x4\n"
- "ldr q25, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
- "trn1 v28.2d, v30.2d, v28.2d\n"
- "trn1 v27.2d, v27.2d, v29.2d\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
"ldr q26, [x11, #0x0]\n"
".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
"ldr q25, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "sub x27, x27, #0x4\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x0]\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "cmp x27, #0x4\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x12, x12, #0x20\n"
".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
"ldr q26, [x9, #0x0]\n"
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x10, x10, #0x20\n"
".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "add x9, x9, #0x20\n"
".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"bge 99b\n"
"100:" // Height 3: Multiply loop: Skip odd blocks
@@ -1321,18 +1319,18 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q26, [x12, #0x0]\n"
"ldr q29, [x12, #0x10]\n"
"trn1 v28.2d, v1.2d, v2.2d\n"
- "add x12, x12, #0x20\n"
"trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
- ".inst 0x6e5def8c // bfmmla v12.4s, v28.8h, v29.8h\n"
".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
"ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e5def8c // bfmmla v12.4s, v28.8h, v29.8h\n"
".inst 0x6e5def74 // bfmmla v20.4s, v27.8h, v29.8h\n"
"ldr q25, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
+ "add x12, x12, #0x20\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x0]\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x10]\n"
@@ -1354,24 +1352,24 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 93b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x13, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "add x25, x26, x20, LSL #2\n"
"uzp1 v16.2d, v16.2d, v20.2d\n"
"uzp1 v17.2d, v17.2d, v21.2d\n"
"uzp1 v18.2d, v18.2d, v22.2d\n"
"uzp1 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 104f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v26.4s }, [x21]\n"
"ld1r { v25.4s }, [x20]\n"
"fmin v7.4s, v7.4s, v26.4s\n"
"fmin v12.4s, v12.4s, v26.4s\n"
@@ -1403,79 +1401,79 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"tbz x14, #3, 108f\n"
"st1 { v7.4s }, [x13], #0x10\n"
"st1 { v12.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
"tbz x14, #2, 106f\n"
"st1 { v13.4s }, [x13], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
"tbz x14, #1, 105f\n"
"str d14, [x13], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x14, #0, 112f\n"
"st1 { v14.s }[2], [x13]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
"b 112f\n"
"105:" // Height 3: Partial direct writeback: partial_1_12
"tbz x14, #0, 112f\n"
"str s14, [x13, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
"b 112f\n"
"106:" // Height 3: Partial direct writeback: partial_2_8
"tbz x14, #1, 107f\n"
"str d13, [x13], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x14, #0, 112f\n"
"st1 { v13.s }[2], [x13]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
"b 112f\n"
"107:" // Height 3: Partial direct writeback: partial_1_8
"tbz x14, #0, 112f\n"
"str s13, [x13, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
"b 112f\n"
"108:" // Height 3: Partial direct writeback: partial_4_0
"tbz x14, #2, 110f\n"
"st1 { v7.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
"tbz x14, #1, 109f\n"
"str d12, [x13], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x14, #0, 112f\n"
"st1 { v12.s }[2], [x13]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
"b 112f\n"
"109:" // Height 3: Partial direct writeback: partial_1_4
"tbz x14, #0, 112f\n"
"str s12, [x13, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
"b 112f\n"
"110:" // Height 3: Partial direct writeback: partial_2_0
"tbz x14, #1, 111f\n"
"str d7, [x13], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x14, #0, 112f\n"
"st1 { v7.s }[2], [x13]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
"b 112f\n"
"111:" // Height 3: Partial direct writeback: partial_1_0
"str s7, [x13, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
"112:" // Height 3: Partial direct writeback: Done
"b 114f\n"
"113:" // Height 3: Full writeback
@@ -1484,33 +1482,33 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"str q13, [x13, #0x20]\n"
"str q14, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"114:" // Height 3: Writeback done
"subs x14, x14, #0x10\n"
"bgt 78b\n"
"b 230f\n"
"115:" // Height 4
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"116:" // Height 4: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 117f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -1523,15 +1521,15 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cbz x15, 118f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -1546,111 +1544,111 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"118:" // Height 4: no bias
"tbz %x[flags], #0, 129f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x14, #0x10\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 127f\n"
"tbz x14, #3, 122f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
"tbz x14, #2, 120f\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
"tbz x14, #1, 119f\n"
"ldr d16, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x14, #0, 126f\n"
"ld1 { v16.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
"b 126f\n"
"119:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 126f\n"
"ldr s16, [x13, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
"b 126f\n"
"120:" // Height 4: Partial accumulate: partial_2_8
"tbz x14, #1, 121f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x14, #0, 126f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
"b 126f\n"
"121:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 126f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
"b 126f\n"
"122:" // Height 4: Partial accumulate: partial_4_0
"tbz x14, #2, 124f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"tbz x14, #1, 123f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x14, #0, 126f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
"b 126f\n"
"123:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 126f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
"b 126f\n"
"124:" // Height 4: Partial accumulate: partial_2_0
"tbz x14, #1, 125f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x14, #0, 126f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
"b 126f\n"
"125:" // Height 4: Partial accumulate: partial_1_0
"ldr s9, [x13, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
"126:" // Height 4: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 128f\n"
@@ -1659,18 +1657,18 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q10, [x13, #0x10]\n"
"ldr q11, [x13, #0x20]\n"
"ldr q16, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"128:" // Height 4: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -1710,8 +1708,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"131:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 132f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1745,56 +1743,56 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"134:" // Height 4: Multiply loop: Main loop head
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
"trn1 v27.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "cmp x27, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ldr q4, [x23, #0x0]\n"
- ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
- ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
"ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
"ldr q25, [x11, #0x10]\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x0]\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "cmp x27, #0x10\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x10]\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
"ldr q26, [x9, #0x0]\n"
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x9, #0x10]\n"
".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
"ldr q26, [x12, #0x20]\n"
".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "add x23, x23, #0x10\n"
+ "ldr q4, [x23, #0x0]\n"
".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"ldr q25, [x12, #0x30]\n"
- "ldr q2, [x25, #0x0]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ "ldr q2, [x25, #0x0]\n"
".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
"ldr q26, [x11, #0x20]\n"
".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
"ldr q25, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "add x12, x12, #0x40\n"
".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0x20]\n"
".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ "add x11, x11, #0x40\n"
".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ "add x10, x10, #0x40\n"
".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
"ldr q26, [x9, #0x20]\n"
".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
@@ -1813,53 +1811,53 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"135:" // Height 4: Multiply loop: Single iteration only
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
"trn1 v27.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
- ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
"ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
"ldr q25, [x11, #0x10]\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x0]\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x10]\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
"ldr q26, [x9, #0x0]\n"
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x9, #0x10]\n"
".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
"ldr q26, [x12, #0x20]\n"
".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"ldr q25, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ "add x12, x12, #0x40\n"
".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
"ldr q26, [x11, #0x20]\n"
".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
"ldr q25, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "add x11, x11, #0x40\n"
".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0x20]\n"
".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ "add x10, x10, #0x40\n"
".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
"ldr q26, [x9, #0x20]\n"
".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
@@ -1875,34 +1873,34 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 138f\n"
"137:" // Height 4: Multiply loop: Odd block loop
- "ldr d30, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
"cmp x27, #0x4\n"
"ldr q26, [x12, #0x0]\n"
"ldr q25, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
- "trn1 v28.2d, v30.2d, v28.2d\n"
- "trn1 v27.2d, v29.2d, v27.2d\n"
".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
- ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
"ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
"ldr q25, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x0]\n"
+ "add x12, x12, #0x20\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
"ldr q26, [x9, #0x0]\n"
+ "add x10, x10, #0x20\n"
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x9, #0x10]\n"
@@ -1935,10 +1933,10 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q25, [x12, #0x10]\n"
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn1 v27.2d, v3.2d, v4.2d\n"
- "add x12, x12, #0x20\n"
".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
"ldr q26, [x11, #0x0]\n"
+ "add x12, x12, #0x20\n"
".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
"ldr q25, [x11, #0x10]\n"
@@ -1967,17 +1965,17 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 131b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
@@ -1987,9 +1985,9 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 142f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v26.4s }, [x21]\n"
"ld1r { v25.4s }, [x20]\n"
"fmin v7.4s, v7.4s, v26.4s\n"
"fmin v12.4s, v12.4s, v26.4s\n"
@@ -2029,95 +2027,95 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"tbz x14, #3, 146f\n"
"st1 { v7.4s }, [x13], #0x10\n"
"st1 { v12.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
"tbz x14, #2, 144f\n"
"st1 { v13.4s }, [x13], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
"tbz x14, #1, 143f\n"
"str d14, [x13], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d19, [x24], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
"tbz x14, #0, 150f\n"
"st1 { v14.s }[2], [x13]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v19.s }[2], [x24]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
"b 150f\n"
"143:" // Height 4: Partial direct writeback: partial_1_12
"tbz x14, #0, 150f\n"
"str s14, [x13, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s19, [x24, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
"b 150f\n"
"144:" // Height 4: Partial direct writeback: partial_2_8
"tbz x14, #1, 145f\n"
"str d13, [x13], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d18, [x24], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
"tbz x14, #0, 150f\n"
"st1 { v13.s }[2], [x13]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v18.s }[2], [x24]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
"b 150f\n"
"145:" // Height 4: Partial direct writeback: partial_1_8
"tbz x14, #0, 150f\n"
"str s13, [x13, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s18, [x24, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
"b 150f\n"
"146:" // Height 4: Partial direct writeback: partial_4_0
"tbz x14, #2, 148f\n"
"st1 { v7.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
"tbz x14, #1, 147f\n"
"str d12, [x13], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d17, [x24], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
"tbz x14, #0, 150f\n"
"st1 { v12.s }[2], [x13]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v17.s }[2], [x24]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
"b 150f\n"
"147:" // Height 4: Partial direct writeback: partial_1_4
"tbz x14, #0, 150f\n"
"str s12, [x13, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s17, [x24, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
"b 150f\n"
"148:" // Height 4: Partial direct writeback: partial_2_0
"tbz x14, #1, 149f\n"
"str d7, [x13], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
"tbz x14, #0, 150f\n"
"st1 { v7.s }[2], [x13]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v15.s }[2], [x25]\n"
- "st1 { v16.s }[2], [x24]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
"b 150f\n"
"149:" // Height 4: Partial direct writeback: partial_1_0
"str s7, [x13, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s15, [x25, #0x0]\n"
- "str s16, [x24, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
"150:" // Height 4: Partial direct writeback: Done
"b 152f\n"
"151:" // Height 4: Full writeback
@@ -2126,37 +2124,37 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"str q13, [x13, #0x20]\n"
"str q14, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q15, [x25, #0x0]\n"
- "str q20, [x25, #0x10]\n"
- "str q21, [x25, #0x20]\n"
- "str q22, [x25, #0x30]\n"
- "str q16, [x24, #0x0]\n"
- "str q17, [x24, #0x10]\n"
- "str q18, [x24, #0x20]\n"
- "str q19, [x24, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
"152:" // Height 4: Writeback done
"subs x14, x14, #0x10\n"
"bgt 116b\n"
"b 230f\n"
"153:" // Height 5
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"154:" // Height 5: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 155f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -2169,15 +2167,15 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cbz x15, 156f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -2200,128 +2198,128 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"156:" // Height 5: no bias
"tbz %x[flags], #0, 167f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x14, #0x10\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 165f\n"
"tbz x14, #3, 160f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
"tbz x14, #2, 158f\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v27.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
"tbz x14, #1, 157f\n"
"ldr d16, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d6, [x23], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
"tbz x14, #0, 164f\n"
"ld1 { v16.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v6.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
"b 164f\n"
"157:" // Height 5: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 164f\n"
"ldr s16, [x13, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s6, [x23, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
"b 164f\n"
"158:" // Height 5: Partial accumulate: partial_2_8
"tbz x14, #1, 159f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x14, #0, 164f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
"b 164f\n"
"159:" // Height 5: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 164f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
"b 164f\n"
"160:" // Height 5: Partial accumulate: partial_4_0
"tbz x14, #2, 162f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x14, #1, 161f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x14, #0, 164f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
"b 164f\n"
"161:" // Height 5: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 164f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
"b 164f\n"
"162:" // Height 5: Partial accumulate: partial_2_0
"tbz x14, #1, 163f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x14, #0, 164f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 164f\n"
"163:" // Height 5: Partial accumulate: partial_1_0
"ldr s9, [x13, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"164:" // Height 5: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 166f\n"
@@ -2330,22 +2328,22 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q10, [x13, #0x10]\n"
"ldr q11, [x13, #0x20]\n"
"ldr q16, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q25, [x23, #0x0]\n"
- "ldr q26, [x23, #0x10]\n"
- "ldr q27, [x23, #0x20]\n"
- "ldr q6, [x23, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
"166:" // Height 5: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -2401,8 +2399,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"169:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 170f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2439,35 +2437,35 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"172:" // Height 5: Multiply loop: Main loop head
"trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x8\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
"trn1 v4.2d, v5.2d, v0.2d\n"
"trn2 v5.2d, v5.2d, v0.2d\n"
"ldr q0, [x12, #0x10]\n"
- ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x11, #0x0]\n"
".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
+ "cmp x27, #0x10\n"
".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
"ldr q0, [x11, #0x10]\n"
".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x0]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
"ldr q0, [x10, #0x10]\n"
".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
+ "add x22, x22, #0x10\n"
".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
"ldr q7, [x9, #0x0]\n"
@@ -2485,17 +2483,17 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
"ldr q0, [x12, #0x30]\n"
"ldr q4, [x23, #0x0]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
"ldr q6, [x11, #0x20]\n"
+ "add x12, x12, #0x40\n"
".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n"
".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n"
"ldr q0, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n"
+ "add x11, x11, #0x40\n"
".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n"
".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n"
"ldr q6, [x10, #0x20]\n"
@@ -2503,8 +2501,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n"
"ldr q0, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n"
+ "add x10, x10, #0x40\n"
".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n"
".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n"
"ldr q6, [x9, #0x20]\n"
@@ -2527,31 +2525,31 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"173:" // Height 5: Multiply loop: Single iteration only
"trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x8\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
"trn1 v4.2d, v5.2d, v0.2d\n"
"trn2 v5.2d, v5.2d, v0.2d\n"
"ldr q0, [x12, #0x10]\n"
- ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
- "add x22, x22, #0x10\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x11, #0x0]\n"
".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
"ldr q0, [x11, #0x10]\n"
".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x0]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
"ldr q0, [x10, #0x10]\n"
".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
@@ -2570,8 +2568,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n"
".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
"ldr q2, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ "add x12, x12, #0x40\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
"ldr q0, [x11, #0x20]\n"
@@ -2579,8 +2577,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n"
".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n"
"ldr q2, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n"
+ "add x11, x11, #0x40\n"
".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
"ldr q0, [x10, #0x20]\n"
@@ -2588,8 +2586,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n"
".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n"
"ldr q2, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n"
+ "add x10, x10, #0x40\n"
".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n"
"ldr q0, [x9, #0x20]\n"
@@ -2609,29 +2607,29 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 176f\n"
"175:" // Height 5: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "cmp x27, #0x4\n"
"ldr d0, [x22], #0x8\n"
"ldr q1, [x12, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v3.2d, v2.2d\n"
- "trn1 v2.2d, v0.2d, v5.2d\n"
- "ldr q0, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
+ "ldr q0, [x12, #0x10]\n"
".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
"ldr q1, [x11, #0x0]\n"
".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ "cmp x27, #0x4\n"
+ "add x12, x12, #0x20\n"
".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
"ldr q0, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n"
".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n"
"ldr q1, [x10, #0x0]\n"
@@ -2639,8 +2637,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n"
"ldr q0, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n"
+ "add x10, x10, #0x20\n"
".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n"
".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n"
"ldr q6, [x9, #0x0]\n"
@@ -2648,8 +2646,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n"
"ldr q0, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n"
+ "add x9, x9, #0x20\n"
".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n"
@@ -2683,12 +2681,12 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"trn1 v3.2d, v3.2d, v4.2d\n"
"trn1 v2.2d, v5.2d, v0.2d\n"
"ldr q1, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
".inst 0x6e46ece8 // bfmmla v8.4s, v7.8h, v6.8h\n"
".inst 0x6e46ec70 // bfmmla v16.4s, v3.8h, v6.8h\n"
".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
"ldr q0, [x11, #0x0]\n"
".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n"
+ "add x12, x12, #0x20\n"
".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n"
".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n"
"ldr q1, [x11, #0x10]\n"
@@ -2723,20 +2721,20 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 169b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
@@ -2748,9 +2746,9 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"uzp1 v26.2d, v26.2d, v30.2d\n"
"uzp1 v27.2d, v27.2d, v31.2d\n"
"tbz %x[flags], #1, 180f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v7.4s, v7.4s, v1.4s\n"
"fmin v12.4s, v12.4s, v1.4s\n"
@@ -2798,111 +2796,111 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"tbz x14, #3, 184f\n"
"st1 { v7.4s }, [x13], #0x10\n"
"st1 { v12.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
"tbz x14, #2, 182f\n"
"st1 { v13.4s }, [x13], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
"tbz x14, #1, 181f\n"
"str d14, [x13], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d19, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x14, #0, 188f\n"
"st1 { v14.s }[2], [x13]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v19.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
"b 188f\n"
"181:" // Height 5: Partial direct writeback: partial_1_12
"tbz x14, #0, 188f\n"
"str s14, [x13, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s19, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
"b 188f\n"
"182:" // Height 5: Partial direct writeback: partial_2_8
"tbz x14, #1, 183f\n"
"str d13, [x13], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d18, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x14, #0, 188f\n"
"st1 { v13.s }[2], [x13]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v18.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
"b 188f\n"
"183:" // Height 5: Partial direct writeback: partial_1_8
"tbz x14, #0, 188f\n"
"str s13, [x13, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s18, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
"b 188f\n"
"184:" // Height 5: Partial direct writeback: partial_4_0
"tbz x14, #2, 186f\n"
"st1 { v7.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x14, #1, 185f\n"
"str d12, [x13], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d17, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x14, #0, 188f\n"
"st1 { v12.s }[2], [x13]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v17.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 188f\n"
"185:" // Height 5: Partial direct writeback: partial_1_4
"tbz x14, #0, 188f\n"
"str s12, [x13, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s17, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 188f\n"
"186:" // Height 5: Partial direct writeback: partial_2_0
"tbz x14, #1, 187f\n"
"str d7, [x13], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x14, #0, 188f\n"
"st1 { v7.s }[2], [x13]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v15.s }[2], [x25]\n"
- "st1 { v16.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 188f\n"
"187:" // Height 5: Partial direct writeback: partial_1_0
"str s7, [x13, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s15, [x25, #0x0]\n"
- "str s16, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"188:" // Height 5: Partial direct writeback: Done
"b 190f\n"
"189:" // Height 5: Full writeback
@@ -2911,45 +2909,44 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"str q13, [x13, #0x20]\n"
"str q14, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q15, [x25, #0x0]\n"
- "str q20, [x25, #0x10]\n"
- "str q21, [x25, #0x20]\n"
- "str q22, [x25, #0x30]\n"
- "str q16, [x24, #0x0]\n"
- "str q17, [x24, #0x10]\n"
- "str q18, [x24, #0x20]\n"
- "str q19, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"190:" // Height 5: Writeback done
"subs x14, x14, #0x10\n"
"bgt 154b\n"
"b 230f\n"
"191:" // Height 6
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
- "mov x21, #0x18\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x18\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "madd x21, x20, x21, x13\n"
- "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
"192:" // Height 6: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 193f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -2962,15 +2959,15 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cbz x15, 194f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -2993,145 +2990,145 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"194:" // Height 6: no bias
"tbz %x[flags], #0, 205f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x14, #0x10\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 203f\n"
"tbz x14, #3, 198f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x14, #2, 196f\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v27.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x14, #1, 195f\n"
"ldr d16, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d6, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x14, #0, 202f\n"
"ld1 { v16.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v6.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 202f\n"
"195:" // Height 6: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 202f\n"
"ldr s16, [x13, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s6, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 202f\n"
"196:" // Height 6: Partial accumulate: partial_2_8
"tbz x14, #1, 197f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x14, #0, 202f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 202f\n"
"197:" // Height 6: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 202f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 202f\n"
"198:" // Height 6: Partial accumulate: partial_4_0
"tbz x14, #2, 200f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x14, #1, 199f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x14, #0, 202f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 202f\n"
"199:" // Height 6: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 202f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 202f\n"
"200:" // Height 6: Partial accumulate: partial_2_0
"tbz x14, #1, 201f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x14, #0, 202f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 202f\n"
"201:" // Height 6: Partial accumulate: partial_1_0
"ldr s9, [x13, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"202:" // Height 6: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 204f\n"
@@ -3140,26 +3137,26 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q10, [x13, #0x10]\n"
"ldr q11, [x13, #0x20]\n"
"ldr q16, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q25, [x23, #0x0]\n"
- "ldr q26, [x23, #0x10]\n"
- "ldr q27, [x23, #0x20]\n"
- "ldr q6, [x23, #0x30]\n"
- "ldr q28, [x22, #0x0]\n"
- "ldr q29, [x22, #0x10]\n"
- "ldr q30, [x22, #0x20]\n"
- "ldr q31, [x22, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"204:" // Height 6: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -3215,8 +3212,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"207:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 208f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -3257,36 +3254,36 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"210:" // Height 6: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
"cmp x27, #0x10\n"
- "add x25, x25, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
"ldr q6, [x12, #0x10]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "add x21, x21, #0x10\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x11, #0x0]\n"
".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
"ldr q6, [x11, #0x10]\n"
".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x0]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x10]\n"
".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "add x21, x21, #0x10\n"
".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
"ldr q7, [x9, #0x0]\n"
@@ -3304,17 +3301,17 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
"ldr q0, [x12, #0x30]\n"
"ldr q4, [x23, #0x0]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
"ldr q6, [x11, #0x20]\n"
+ "add x12, x12, #0x40\n"
".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n"
".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n"
"ldr q0, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n"
+ "add x11, x11, #0x40\n"
".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n"
".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n"
"ldr q6, [x10, #0x20]\n"
@@ -3322,8 +3319,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n"
"ldr q0, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n"
+ "add x10, x10, #0x40\n"
".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n"
".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n"
"ldr q6, [x9, #0x20]\n"
@@ -3347,32 +3344,32 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"211:" // Height 6: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "add x26, x26, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
"ldr q6, [x12, #0x10]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x11, #0x0]\n"
".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
"ldr q6, [x11, #0x10]\n"
".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x0]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x10]\n"
".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
@@ -3391,8 +3388,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
"ldr q2, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ "add x12, x12, #0x40\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
"ldr q0, [x11, #0x20]\n"
@@ -3400,8 +3397,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n"
".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n"
"ldr q2, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n"
+ "add x11, x11, #0x40\n"
".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
"ldr q0, [x10, #0x20]\n"
@@ -3409,8 +3406,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n"
".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n"
"ldr q2, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n"
+ "add x10, x10, #0x40\n"
".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n"
"ldr q0, [x9, #0x20]\n"
@@ -3430,25 +3427,25 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 214f\n"
"213:" // Height 6: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d5, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"cmp x27, #0x4\n"
- "ldr d2, [x22], #0x8\n"
+ "ldr d1, [x22], #0x8\n"
"ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
"ldr q1, [x12, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v5.2d, v3.2d\n"
- "trn1 v2.2d, v2.2d, v0.2d\n"
"ldr q0, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
"ldr q1, [x11, #0x0]\n"
".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
+ "add x12, x12, #0x20\n"
".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
"ldr q0, [x11, #0x10]\n"
@@ -3506,19 +3503,19 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q0, [x12, #0x0]\n"
"trn1 v7.2d, v1.2d, v2.2d\n"
"trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e40ece8 // bfmmla v8.4s, v7.8h, v0.8h\n"
"trn1 v2.2d, v5.2d, v6.2d\n"
"ldr q1, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
- ".inst 0x6e40ece8 // bfmmla v8.4s, v7.8h, v0.8h\n"
".inst 0x6e40ec70 // bfmmla v16.4s, v3.8h, v0.8h\n"
".inst 0x6e40ec58 // bfmmla v24.4s, v2.8h, v0.8h\n"
"ldr q0, [x11, #0x0]\n"
".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n"
".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n"
+ "add x12, x12, #0x20\n"
".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n"
"ldr q1, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n"
"ldr q0, [x10, #0x0]\n"
@@ -3526,8 +3523,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n"
".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
"ldr q1, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n"
+ "add x10, x10, #0x20\n"
".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n"
"ldr q0, [x9, #0x0]\n"
@@ -3535,8 +3532,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n"
".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n"
"ldr q6, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ "add x9, x9, #0x20\n"
".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n"
".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n"
@@ -3548,21 +3545,21 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 207b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
@@ -3578,9 +3575,9 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"uzp1 v30.2d, v27.2d, v31.2d\n"
"uzp2 v27.2d, v27.2d, v31.2d\n"
"tbz %x[flags], #1, 218f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v7.4s, v7.4s, v1.4s\n"
"fmin v12.4s, v12.4s, v1.4s\n"
@@ -3636,127 +3633,127 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"tbz x14, #3, 222f\n"
"st1 { v7.4s }, [x13], #0x10\n"
"st1 { v12.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x23], #0x10\n"
- "st1 { v24.4s }, [x22], #0x10\n"
- "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
"tbz x14, #2, 220f\n"
"st1 { v13.4s }, [x13], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
- "st1 { v29.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
"tbz x14, #1, 219f\n"
"str d14, [x13], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d19, [x24], #0x8\n"
- "str d30, [x23], #0x8\n"
- "str d27, [x22], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
"tbz x14, #0, 226f\n"
"st1 { v14.s }[2], [x13]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v19.s }[2], [x24]\n"
- "st1 { v30.s }[2], [x23]\n"
- "st1 { v27.s }[2], [x22]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
"b 226f\n"
"219:" // Height 6: Partial direct writeback: partial_1_12
"tbz x14, #0, 226f\n"
"str s14, [x13, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s19, [x24, #0x0]\n"
- "str s30, [x23, #0x0]\n"
- "str s27, [x22, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
"b 226f\n"
"220:" // Height 6: Partial direct writeback: partial_2_8
"tbz x14, #1, 221f\n"
"str d13, [x13], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d18, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "str d26, [x22], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
"tbz x14, #0, 226f\n"
"st1 { v13.s }[2], [x13]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v18.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
- "st1 { v26.s }[2], [x22]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
"b 226f\n"
"221:" // Height 6: Partial direct writeback: partial_1_8
"tbz x14, #0, 226f\n"
"str s13, [x13, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s18, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
- "str s26, [x22, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
"b 226f\n"
"222:" // Height 6: Partial direct writeback: partial_4_0
"tbz x14, #2, 224f\n"
"st1 { v7.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
- "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
"tbz x14, #1, 223f\n"
"str d12, [x13], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d17, [x24], #0x8\n"
- "str d28, [x23], #0x8\n"
- "str d25, [x22], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d28, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
"tbz x14, #0, 226f\n"
"st1 { v12.s }[2], [x13]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v17.s }[2], [x24]\n"
- "st1 { v28.s }[2], [x23]\n"
- "st1 { v25.s }[2], [x22]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
"b 226f\n"
"223:" // Height 6: Partial direct writeback: partial_1_4
"tbz x14, #0, 226f\n"
"str s12, [x13, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s17, [x24, #0x0]\n"
- "str s28, [x23, #0x0]\n"
- "str s25, [x22, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s28, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
"b 226f\n"
"224:" // Height 6: Partial direct writeback: partial_2_0
"tbz x14, #1, 225f\n"
"str d7, [x13], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
- "str d24, [x22], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
"tbz x14, #0, 226f\n"
"st1 { v7.s }[2], [x13]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v15.s }[2], [x25]\n"
- "st1 { v16.s }[2], [x24]\n"
- "st1 { v23.s }[2], [x23]\n"
- "st1 { v24.s }[2], [x22]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
"b 226f\n"
"225:" // Height 6: Partial direct writeback: partial_1_0
"str s7, [x13, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s15, [x25, #0x0]\n"
- "str s16, [x24, #0x0]\n"
- "str s23, [x23, #0x0]\n"
- "str s24, [x22, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
"226:" // Height 6: Partial direct writeback: Done
"b 228f\n"
"227:" // Height 6: Full writeback
@@ -3765,26 +3762,26 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"str q13, [x13, #0x20]\n"
"str q14, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q15, [x25, #0x0]\n"
- "str q20, [x25, #0x10]\n"
- "str q21, [x25, #0x20]\n"
- "str q22, [x25, #0x30]\n"
- "str q16, [x24, #0x0]\n"
- "str q17, [x24, #0x10]\n"
- "str q18, [x24, #0x20]\n"
- "str q19, [x24, #0x30]\n"
- "str q23, [x23, #0x0]\n"
- "str q28, [x23, #0x10]\n"
- "str q29, [x23, #0x20]\n"
- "str q30, [x23, #0x30]\n"
- "str q24, [x22, #0x0]\n"
- "str q25, [x22, #0x10]\n"
- "str q26, [x22, #0x20]\n"
- "str q27, [x22, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q28, [x22, #0x10]\n"
+ "str q29, [x22, #0x20]\n"
+ "str q30, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
"228:" // Height 6: Writeback done
"subs x14, x14, #0x10\n"
"bgt 192b\n"
@@ -3800,8 +3797,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"230:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
index 20138ffe7e..f6a7461740 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
@@ -81,7 +81,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 32, 1> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 32, 1> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
index 93e5e051f8..2686b98092 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
@@ -49,19 +49,18 @@ void a64_ffhybrid_fp16_mla_6x32 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const __fp16 *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -82,7 +81,6 @@ void a64_ffhybrid_fp16_mla_6x32 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
ka.B_stride = B_stride;
switch(act.type) {
default:
@@ -107,19 +105,19 @@ void a64_ffhybrid_fp16_mla_6x32 (
"bgt 101f\n"
"beq 51f\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0x18\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
"bgt 3f\n"
"cmp x14, #0x10\n"
"mov x9, x12\n"
@@ -266,8 +264,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"24:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 25f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -291,9 +289,6 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr q17, [x10, #0x0]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"ldr q16, [x9, #0x0]\n"
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x10\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
"ldr q17, [x12, #0x10]\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
@@ -344,29 +339,30 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr q16, [x9, #0x60]\n"
"fmla v10.8h, v17.8h, v0.h[6]\n"
"ldr q17, [x12, #0x70]\n"
- "add x12, x12, #0x80\n"
"fmla v11.8h, v16.8h, v0.h[6]\n"
"ldr q16, [x11, #0x70]\n"
- "add x11, x11, #0x80\n"
"fmla v8.8h, v17.8h, v0.h[7]\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
"fmla v9.8h, v16.8h, v0.h[7]\n"
"ldr q16, [x9, #0x70]\n"
- "add x9, x9, #0x80\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x10\n"
"fmla v10.8h, v17.8h, v0.h[7]\n"
- "ldr q6, [x12, #0x0]\n"
"fmla v11.8h, v16.8h, v0.h[7]\n"
+ "add x26, x26, #0x10\n"
"ldr q0, [x26, #0x0]\n"
+ "add x12, x12, #0x80\n"
+ "ldr q6, [x12, #0x0]\n"
+ "add x11, x11, #0x80\n"
"ldr q7, [x11, #0x0]\n"
+ "add x10, x10, #0x80\n"
+ "add x9, x9, #0x80\n"
"bge 27b\n"
"28:" // Height 1: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
"ldr q17, [x10, #0x0]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"ldr q16, [x9, #0x0]\n"
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
"ldr q17, [x12, #0x10]\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
@@ -417,35 +413,37 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr q16, [x9, #0x60]\n"
"fmla v10.8h, v17.8h, v0.h[6]\n"
"ldr q17, [x12, #0x70]\n"
- "add x12, x12, #0x80\n"
"fmla v11.8h, v16.8h, v0.h[6]\n"
"ldr q16, [x11, #0x70]\n"
- "add x11, x11, #0x80\n"
"fmla v8.8h, v17.8h, v0.h[7]\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
"fmla v9.8h, v16.8h, v0.h[7]\n"
"ldr q16, [x9, #0x70]\n"
- "add x9, x9, #0x80\n"
+ "sub x27, x27, #0x8\n"
"fmla v10.8h, v17.8h, v0.h[7]\n"
"fmla v11.8h, v16.8h, v0.h[7]\n"
+ "add x26, x26, #0x10\n"
+ "add x12, x12, #0x80\n"
+ "add x11, x11, #0x80\n"
+ "add x10, x10, #0x80\n"
+ "add x9, x9, #0x80\n"
"29:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 31f\n"
"30:" // Height 1: Multiply loop: Odd block loop
"ldr h0, [x26], #0x2\n"
- "ldr q17, [x12, #0x0]\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v8.8h, v16.8h, v0.h[0]\n"
"sub x27, x27, #0x1\n"
+ "ldr q17, [x11, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v9.8h, v17.8h, v0.h[0]\n"
+ "fmla v10.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
"add x12, x12, #0x10\n"
- "ldr q16, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "fmla v8.8h, v17.8h, v0.h[0]\n"
- "ldr q17, [x10, #0x0]\n"
"add x10, x10, #0x10\n"
- "fmla v9.8h, v16.8h, v0.h[0]\n"
- "ldr q16, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "fmla v10.8h, v17.8h, v0.h[0]\n"
- "fmla v11.8h, v16.8h, v0.h[0]\n"
"cbnz x27, 30b\n"
"31:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -453,9 +451,9 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 24b\n"
"tbz %x[flags], #1, 32f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.8h }, [x21]\n"
"ld1r { v16.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v17.8h\n"
"fmin v9.8h, v9.8h, v17.8h\n"
@@ -574,19 +572,19 @@ void a64_ffhybrid_fp16_mla_6x32 (
"b 302f\n"
"51:" // Height 2
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"52:" // Height 2: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0x18\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
"bgt 53f\n"
"cmp x14, #0x10\n"
"mov x9, x12\n"
@@ -599,159 +597,159 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cbz x15, 54f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "add x15, x15, #0x40\n"
"b 73f\n"
"54:" // Height 2: no bias
"tbz %x[flags], #0, 72f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x14, #0x20\n"
- "add x26, x13, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"bge 71f\n"
"tbz x14, #4, 62f\n"
"ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v9.8h }, [x13], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
"tbz x14, #3, 58f\n"
"ld1 { v10.8h }, [x13], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
"tbz x14, #2, 56f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"tbz x14, #1, 55f\n"
"ld1 { v11.s }[2], [x13], #0x4\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"mov x20, #0x3c\n"
"tbz x14, #0, 70f\n"
"ld1 { v11.h }[6], [x13]\n"
- "ld1 { v15.h }[6], [x26]\n"
+ "ld1 { v15.h }[6], [x25]\n"
"b 70f\n"
"55:" // Height 2: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x14, #0, 70f\n"
"ld1 { v11.h }[4], [x13]\n"
- "ld1 { v15.h }[4], [x26]\n"
+ "ld1 { v15.h }[4], [x25]\n"
"b 70f\n"
"56:" // Height 2: Partial accumulate: partial_2_24
"tbz x14, #1, 57f\n"
"ldr s11, [x13], #0x4\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"mov x20, #0x34\n"
"tbz x14, #0, 70f\n"
"ld1 { v11.h }[2], [x13]\n"
- "ld1 { v15.h }[2], [x26]\n"
+ "ld1 { v15.h }[2], [x25]\n"
"b 70f\n"
"57:" // Height 2: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x14, #0, 70f\n"
"ldr h11, [x13, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
"b 70f\n"
"58:" // Height 2: Partial accumulate: partial_4_16
"tbz x14, #2, 60f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"tbz x14, #1, 59f\n"
"ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"mov x20, #0x2c\n"
"tbz x14, #0, 70f\n"
"ld1 { v10.h }[6], [x13]\n"
- "ld1 { v14.h }[6], [x26]\n"
+ "ld1 { v14.h }[6], [x25]\n"
"b 70f\n"
"59:" // Height 2: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x14, #0, 70f\n"
"ld1 { v10.h }[4], [x13]\n"
- "ld1 { v14.h }[4], [x26]\n"
+ "ld1 { v14.h }[4], [x25]\n"
"b 70f\n"
"60:" // Height 2: Partial accumulate: partial_2_16
"tbz x14, #1, 61f\n"
"ldr s10, [x13], #0x4\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"mov x20, #0x24\n"
"tbz x14, #0, 70f\n"
"ld1 { v10.h }[2], [x13]\n"
- "ld1 { v14.h }[2], [x26]\n"
+ "ld1 { v14.h }[2], [x25]\n"
"b 70f\n"
"61:" // Height 2: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x14, #0, 70f\n"
"ldr h10, [x13, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
"b 70f\n"
"62:" // Height 2: Partial accumulate: partial_8_0
"tbz x14, #3, 66f\n"
"ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
"tbz x14, #2, 64f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"tbz x14, #1, 63f\n"
"ld1 { v9.s }[2], [x13], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"mov x20, #0x1c\n"
"tbz x14, #0, 70f\n"
"ld1 { v9.h }[6], [x13]\n"
- "ld1 { v13.h }[6], [x26]\n"
+ "ld1 { v13.h }[6], [x25]\n"
"b 70f\n"
"63:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x14, #0, 70f\n"
"ld1 { v9.h }[4], [x13]\n"
- "ld1 { v13.h }[4], [x26]\n"
+ "ld1 { v13.h }[4], [x25]\n"
"b 70f\n"
"64:" // Height 2: Partial accumulate: partial_2_8
"tbz x14, #1, 65f\n"
"ldr s9, [x13], #0x4\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"mov x20, #0x14\n"
"tbz x14, #0, 70f\n"
"ld1 { v9.h }[2], [x13]\n"
- "ld1 { v13.h }[2], [x26]\n"
+ "ld1 { v13.h }[2], [x25]\n"
"b 70f\n"
"65:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x14, #0, 70f\n"
"ldr h9, [x13, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
"b 70f\n"
"66:" // Height 2: Partial accumulate: partial_4_0
"tbz x14, #2, 68f\n"
"ldr d8, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"tbz x14, #1, 67f\n"
"ld1 { v8.s }[2], [x13], #0x4\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"mov x20, #0xc\n"
"tbz x14, #0, 70f\n"
"ld1 { v8.h }[6], [x13]\n"
- "ld1 { v12.h }[6], [x26]\n"
+ "ld1 { v12.h }[6], [x25]\n"
"b 70f\n"
"67:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x14, #0, 70f\n"
"ld1 { v8.h }[4], [x13]\n"
- "ld1 { v12.h }[4], [x26]\n"
+ "ld1 { v12.h }[4], [x25]\n"
"b 70f\n"
"68:" // Height 2: Partial accumulate: partial_2_0
"tbz x14, #1, 69f\n"
"ldr s8, [x13], #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"mov x20, #0x4\n"
"tbz x14, #0, 70f\n"
"ld1 { v8.h }[2], [x13]\n"
- "ld1 { v12.h }[2], [x26]\n"
+ "ld1 { v12.h }[2], [x25]\n"
"b 70f\n"
"69:" // Height 2: Partial accumulate: partial_1_0
"ldr h8, [x13, #0x0]\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"mov x20, #0x0\n"
"70:" // Height 2: Partial accumulate: Done
"sub x13, x13, x20\n"
@@ -761,10 +759,10 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n"
"ldr q11, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"b 73f\n"
"72:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -779,8 +777,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"74:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 75f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -811,15 +809,15 @@ void a64_ffhybrid_fp16_mla_6x32 (
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"ldr q16, [x9, #0x0]\n"
- "add x26, x26, #0x10\n"
"cmp x27, #0x10\n"
- "add x25, x25, #0x10\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
"fmla v14.8h, v17.8h, v1.h[0]\n"
"ldr q17, [x12, #0x10]\n"
+ "add x26, x26, #0x10\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
"fmla v15.8h, v16.8h, v1.h[0]\n"
"ldr q16, [x11, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v8.8h, v17.8h, v0.h[1]\n"
"fmla v12.8h, v17.8h, v1.h[1]\n"
"ldr q17, [x10, #0x10]\n"
@@ -920,10 +918,10 @@ void a64_ffhybrid_fp16_mla_6x32 (
"fmla v13.8h, v7.8h, v1.h[0]\n"
"ldr q16, [x9, #0x0]\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
"fmla v14.8h, v17.8h, v1.h[0]\n"
"ldr q17, [x12, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
"fmla v15.8h, v16.8h, v1.h[0]\n"
"ldr q16, [x11, #0x10]\n"
@@ -1021,20 +1019,20 @@ void a64_ffhybrid_fp16_mla_6x32 (
"sub x27, x27, #0x1\n"
"ldr q17, [x12, #0x0]\n"
"ldr q16, [x11, #0x0]\n"
- "add x12, x12, #0x10\n"
- "add x11, x11, #0x10\n"
"fmla v8.8h, v17.8h, v1.h[0]\n"
"fmla v12.8h, v17.8h, v0.h[0]\n"
"ldr q17, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
"fmla v9.8h, v16.8h, v1.h[0]\n"
"fmla v13.8h, v16.8h, v0.h[0]\n"
"ldr q16, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
"fmla v10.8h, v17.8h, v1.h[0]\n"
"fmla v14.8h, v17.8h, v0.h[0]\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v11.8h, v16.8h, v1.h[0]\n"
"fmla v15.8h, v16.8h, v0.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
"cbnz x27, 80b\n"
"81:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1042,11 +1040,11 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 74b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"tbz %x[flags], #1, 82f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.8h }, [x21]\n"
"ld1r { v16.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v17.8h\n"
"fmin v9.8h, v9.8h, v17.8h\n"
@@ -1070,127 +1068,127 @@ void a64_ffhybrid_fp16_mla_6x32 (
"tbz x14, #4, 90f\n"
"st1 { v8.8h }, [x13], #0x10\n"
"st1 { v9.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
"tbz x14, #3, 86f\n"
"st1 { v10.8h }, [x13], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
"tbz x14, #2, 84f\n"
"str d11, [x13], #0x8\n"
- "str d15, [x26], #0x8\n"
+ "str d15, [x25], #0x8\n"
"tbz x14, #1, 83f\n"
"st1 { v11.s }[2], [x13], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
"tbz x14, #0, 98f\n"
"st1 { v11.h }[6], [x13]\n"
- "st1 { v15.h }[6], [x26]\n"
+ "st1 { v15.h }[6], [x25]\n"
"b 98f\n"
"83:" // Height 2: Partial direct writeback: partial_1_28
"tbz x14, #0, 98f\n"
"st1 { v11.h }[4], [x13]\n"
- "st1 { v15.h }[4], [x26]\n"
+ "st1 { v15.h }[4], [x25]\n"
"b 98f\n"
"84:" // Height 2: Partial direct writeback: partial_2_24
"tbz x14, #1, 85f\n"
"str s11, [x13], #0x4\n"
- "str s15, [x26], #0x4\n"
+ "str s15, [x25], #0x4\n"
"tbz x14, #0, 98f\n"
"st1 { v11.h }[2], [x13]\n"
- "st1 { v15.h }[2], [x26]\n"
+ "st1 { v15.h }[2], [x25]\n"
"b 98f\n"
"85:" // Height 2: Partial direct writeback: partial_1_24
"tbz x14, #0, 98f\n"
"str h11, [x13, #0x0]\n"
- "str h15, [x26, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
"b 98f\n"
"86:" // Height 2: Partial direct writeback: partial_4_16
"tbz x14, #2, 88f\n"
"str d10, [x13], #0x8\n"
- "str d14, [x26], #0x8\n"
+ "str d14, [x25], #0x8\n"
"tbz x14, #1, 87f\n"
"st1 { v10.s }[2], [x13], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
"tbz x14, #0, 98f\n"
"st1 { v10.h }[6], [x13]\n"
- "st1 { v14.h }[6], [x26]\n"
+ "st1 { v14.h }[6], [x25]\n"
"b 98f\n"
"87:" // Height 2: Partial direct writeback: partial_1_20
"tbz x14, #0, 98f\n"
"st1 { v10.h }[4], [x13]\n"
- "st1 { v14.h }[4], [x26]\n"
+ "st1 { v14.h }[4], [x25]\n"
"b 98f\n"
"88:" // Height 2: Partial direct writeback: partial_2_16
"tbz x14, #1, 89f\n"
"str s10, [x13], #0x4\n"
- "str s14, [x26], #0x4\n"
+ "str s14, [x25], #0x4\n"
"tbz x14, #0, 98f\n"
"st1 { v10.h }[2], [x13]\n"
- "st1 { v14.h }[2], [x26]\n"
+ "st1 { v14.h }[2], [x25]\n"
"b 98f\n"
"89:" // Height 2: Partial direct writeback: partial_1_16
"tbz x14, #0, 98f\n"
"str h10, [x13, #0x0]\n"
- "str h14, [x26, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
"b 98f\n"
"90:" // Height 2: Partial direct writeback: partial_8_0
"tbz x14, #3, 94f\n"
"st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
"tbz x14, #2, 92f\n"
"str d9, [x13], #0x8\n"
- "str d13, [x26], #0x8\n"
+ "str d13, [x25], #0x8\n"
"tbz x14, #1, 91f\n"
"st1 { v9.s }[2], [x13], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
"tbz x14, #0, 98f\n"
"st1 { v9.h }[6], [x13]\n"
- "st1 { v13.h }[6], [x26]\n"
+ "st1 { v13.h }[6], [x25]\n"
"b 98f\n"
"91:" // Height 2: Partial direct writeback: partial_1_12
"tbz x14, #0, 98f\n"
"st1 { v9.h }[4], [x13]\n"
- "st1 { v13.h }[4], [x26]\n"
+ "st1 { v13.h }[4], [x25]\n"
"b 98f\n"
"92:" // Height 2: Partial direct writeback: partial_2_8
"tbz x14, #1, 93f\n"
"str s9, [x13], #0x4\n"
- "str s13, [x26], #0x4\n"
+ "str s13, [x25], #0x4\n"
"tbz x14, #0, 98f\n"
"st1 { v9.h }[2], [x13]\n"
- "st1 { v13.h }[2], [x26]\n"
+ "st1 { v13.h }[2], [x25]\n"
"b 98f\n"
"93:" // Height 2: Partial direct writeback: partial_1_8
"tbz x14, #0, 98f\n"
"str h9, [x13, #0x0]\n"
- "str h13, [x26, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
"b 98f\n"
"94:" // Height 2: Partial direct writeback: partial_4_0
"tbz x14, #2, 96f\n"
"str d8, [x13], #0x8\n"
- "str d12, [x26], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x14, #1, 95f\n"
"st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
"tbz x14, #0, 98f\n"
"st1 { v8.h }[6], [x13]\n"
- "st1 { v12.h }[6], [x26]\n"
+ "st1 { v12.h }[6], [x25]\n"
"b 98f\n"
"95:" // Height 2: Partial direct writeback: partial_1_4
"tbz x14, #0, 98f\n"
"st1 { v8.h }[4], [x13]\n"
- "st1 { v12.h }[4], [x26]\n"
+ "st1 { v12.h }[4], [x25]\n"
"b 98f\n"
"96:" // Height 2: Partial direct writeback: partial_2_0
"tbz x14, #1, 97f\n"
"str s8, [x13], #0x4\n"
- "str s12, [x26], #0x4\n"
+ "str s12, [x25], #0x4\n"
"tbz x14, #0, 98f\n"
"st1 { v8.h }[2], [x13]\n"
- "st1 { v12.h }[2], [x26]\n"
+ "st1 { v12.h }[2], [x25]\n"
"b 98f\n"
"97:" // Height 2: Partial direct writeback: partial_1_0
"str h8, [x13, #0x0]\n"
- "str h12, [x26, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
"98:" // Height 2: Partial direct writeback: Done
"b 100f\n"
"99:" // Height 2: Full writeback
@@ -1199,29 +1197,29 @@ void a64_ffhybrid_fp16_mla_6x32 (
"str q10, [x13, #0x20]\n"
"str q11, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
"100:" // Height 2: Writeback done
"subs x14, x14, #0x20\n"
"bgt 52b\n"
"b 302f\n"
"101:" // Height 3
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"102:" // Height 3: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0x18\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
"bgt 103f\n"
"cmp x14, #0x10\n"
"mov x9, x12\n"
@@ -1234,197 +1232,197 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cbz x15, 104f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 123f\n"
"104:" // Height 3: no bias
"tbz %x[flags], #0, 122f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
"cmp x14, #0x20\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"bge 121f\n"
"tbz x14, #4, 112f\n"
"ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
"ld1 { v9.8h }, [x13], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
"tbz x14, #3, 108f\n"
"ld1 { v10.8h }, [x13], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
"tbz x14, #2, 106f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x14, #1, 105f\n"
"ld1 { v11.s }[2], [x13], #0x4\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
"tbz x14, #0, 120f\n"
"ld1 { v11.h }[6], [x13]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
"b 120f\n"
"105:" // Height 3: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x14, #0, 120f\n"
"ld1 { v11.h }[4], [x13]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
"b 120f\n"
"106:" // Height 3: Partial accumulate: partial_2_24
"tbz x14, #1, 107f\n"
"ldr s11, [x13], #0x4\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"mov x20, #0x34\n"
- "ldr s19, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
"tbz x14, #0, 120f\n"
"ld1 { v11.h }[2], [x13]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
"b 120f\n"
"107:" // Height 3: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x14, #0, 120f\n"
"ldr h11, [x13, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
"b 120f\n"
"108:" // Height 3: Partial accumulate: partial_4_16
"tbz x14, #2, 110f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x14, #1, 109f\n"
"ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
"tbz x14, #0, 120f\n"
"ld1 { v10.h }[6], [x13]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
"b 120f\n"
"109:" // Height 3: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x14, #0, 120f\n"
"ld1 { v10.h }[4], [x13]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
"b 120f\n"
"110:" // Height 3: Partial accumulate: partial_2_16
"tbz x14, #1, 111f\n"
"ldr s10, [x13], #0x4\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"mov x20, #0x24\n"
- "ldr s18, [x25], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
"tbz x14, #0, 120f\n"
"ld1 { v10.h }[2], [x13]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
"b 120f\n"
"111:" // Height 3: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x14, #0, 120f\n"
"ldr h10, [x13, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
"b 120f\n"
"112:" // Height 3: Partial accumulate: partial_8_0
"tbz x14, #3, 116f\n"
"ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
"tbz x14, #2, 114f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x14, #1, 113f\n"
"ld1 { v9.s }[2], [x13], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
"tbz x14, #0, 120f\n"
"ld1 { v9.h }[6], [x13]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
"b 120f\n"
"113:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x14, #0, 120f\n"
"ld1 { v9.h }[4], [x13]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
"b 120f\n"
"114:" // Height 3: Partial accumulate: partial_2_8
"tbz x14, #1, 115f\n"
"ldr s9, [x13], #0x4\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"mov x20, #0x14\n"
- "ldr s17, [x25], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
"tbz x14, #0, 120f\n"
"ld1 { v9.h }[2], [x13]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
"b 120f\n"
"115:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x14, #0, 120f\n"
"ldr h9, [x13, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
"b 120f\n"
"116:" // Height 3: Partial accumulate: partial_4_0
"tbz x14, #2, 118f\n"
"ldr d8, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
"tbz x14, #1, 117f\n"
"ld1 { v8.s }[2], [x13], #0x4\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
"tbz x14, #0, 120f\n"
"ld1 { v8.h }[6], [x13]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
"b 120f\n"
"117:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x14, #0, 120f\n"
"ld1 { v8.h }[4], [x13]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
"b 120f\n"
"118:" // Height 3: Partial accumulate: partial_2_0
"tbz x14, #1, 119f\n"
"ldr s8, [x13], #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"mov x20, #0x4\n"
- "ldr s16, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
"tbz x14, #0, 120f\n"
"ld1 { v8.h }[2], [x13]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
"b 120f\n"
"119:" // Height 3: Partial accumulate: partial_1_0
"ldr h8, [x13, #0x0]\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h16, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
"120:" // Height 3: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 123f\n"
@@ -1433,14 +1431,14 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n"
"ldr q11, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
"b 123f\n"
"122:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -1459,8 +1457,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"124:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 125f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1491,18 +1489,18 @@ void a64_ffhybrid_fp16_mla_6x32 (
"fmla v8.8h, v6.8h, v0.h[0]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
"sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
+ "cmp x27, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"ldr q21, [x10, #0x0]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"ldr q20, [x9, #0x0]\n"
"add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
"fmla v10.8h, v21.8h, v0.h[0]\n"
"fmla v14.8h, v21.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
"fmla v18.8h, v21.8h, v2.h[0]\n"
"ldr q21, [x12, #0x10]\n"
"fmla v11.8h, v20.8h, v0.h[0]\n"
@@ -1601,8 +1599,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"fmla v14.8h, v21.8h, v1.h[6]\n"
"fmla v18.8h, v21.8h, v2.h[6]\n"
"ldr q21, [x12, #0x70]\n"
- "add x12, x12, #0x80\n"
"fmla v11.8h, v20.8h, v0.h[6]\n"
+ "add x12, x12, #0x80\n"
"fmla v15.8h, v20.8h, v1.h[6]\n"
"fmla v19.8h, v20.8h, v2.h[6]\n"
"ldr q20, [x11, #0x70]\n"
@@ -1611,8 +1609,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"fmla v12.8h, v21.8h, v1.h[7]\n"
"fmla v16.8h, v21.8h, v2.h[7]\n"
"ldr q21, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
"fmla v9.8h, v20.8h, v0.h[7]\n"
+ "add x10, x10, #0x80\n"
"fmla v13.8h, v20.8h, v1.h[7]\n"
"fmla v17.8h, v20.8h, v2.h[7]\n"
"ldr q20, [x9, #0x70]\n"
@@ -1742,8 +1740,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"fmla v14.8h, v21.8h, v1.h[6]\n"
"fmla v18.8h, v21.8h, v2.h[6]\n"
"ldr q21, [x12, #0x70]\n"
- "add x12, x12, #0x80\n"
"fmla v11.8h, v20.8h, v0.h[6]\n"
+ "add x12, x12, #0x80\n"
"fmla v15.8h, v20.8h, v1.h[6]\n"
"fmla v19.8h, v20.8h, v2.h[6]\n"
"ldr q20, [x11, #0x70]\n"
@@ -1752,8 +1750,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"fmla v12.8h, v21.8h, v1.h[7]\n"
"fmla v16.8h, v21.8h, v2.h[7]\n"
"ldr q21, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
"fmla v9.8h, v20.8h, v0.h[7]\n"
+ "add x10, x10, #0x80\n"
"fmla v13.8h, v20.8h, v1.h[7]\n"
"fmla v17.8h, v20.8h, v2.h[7]\n"
"ldr q20, [x9, #0x70]\n"
@@ -1772,23 +1770,23 @@ void a64_ffhybrid_fp16_mla_6x32 (
"sub x27, x27, #0x1\n"
"ldr h0, [x24], #0x2\n"
"ldr q21, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
- "ldr q20, [x11, #0x0]\n"
- "add x11, x11, #0x10\n"
"fmla v8.8h, v21.8h, v2.h[0]\n"
"fmla v12.8h, v21.8h, v1.h[0]\n"
+ "ldr q20, [x11, #0x0]\n"
"fmla v16.8h, v21.8h, v0.h[0]\n"
"ldr q21, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
"fmla v9.8h, v20.8h, v2.h[0]\n"
"fmla v13.8h, v20.8h, v1.h[0]\n"
"fmla v17.8h, v20.8h, v0.h[0]\n"
"ldr q20, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v10.8h, v21.8h, v2.h[0]\n"
"fmla v14.8h, v21.8h, v1.h[0]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v18.8h, v21.8h, v0.h[0]\n"
"fmla v11.8h, v20.8h, v2.h[0]\n"
+ "add x9, x9, #0x10\n"
"fmla v15.8h, v20.8h, v1.h[0]\n"
"fmla v19.8h, v20.8h, v0.h[0]\n"
"cbnz x27, 130b\n"
@@ -1798,12 +1796,12 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 124b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 132f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v21.8h }, [x21]\n"
"ld1r { v20.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v21.8h\n"
"fmin v9.8h, v9.8h, v21.8h\n"
@@ -1835,159 +1833,159 @@ void a64_ffhybrid_fp16_mla_6x32 (
"tbz x14, #4, 140f\n"
"st1 { v8.8h }, [x13], #0x10\n"
"st1 { v9.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
"tbz x14, #3, 136f\n"
"st1 { v10.8h }, [x13], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
"tbz x14, #2, 134f\n"
"str d11, [x13], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x14, #1, 133f\n"
"st1 { v11.s }[2], [x13], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
"tbz x14, #0, 148f\n"
"st1 { v11.h }[6], [x13]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
"b 148f\n"
"133:" // Height 3: Partial direct writeback: partial_1_28
"tbz x14, #0, 148f\n"
"st1 { v11.h }[4], [x13]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
"b 148f\n"
"134:" // Height 3: Partial direct writeback: partial_2_24
"tbz x14, #1, 135f\n"
"str s11, [x13], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
"tbz x14, #0, 148f\n"
"st1 { v11.h }[2], [x13]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
"b 148f\n"
"135:" // Height 3: Partial direct writeback: partial_1_24
"tbz x14, #0, 148f\n"
"str h11, [x13, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
"b 148f\n"
"136:" // Height 3: Partial direct writeback: partial_4_16
"tbz x14, #2, 138f\n"
"str d10, [x13], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x14, #1, 137f\n"
"st1 { v10.s }[2], [x13], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
"tbz x14, #0, 148f\n"
"st1 { v10.h }[6], [x13]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
"b 148f\n"
"137:" // Height 3: Partial direct writeback: partial_1_20
"tbz x14, #0, 148f\n"
"st1 { v10.h }[4], [x13]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
"b 148f\n"
"138:" // Height 3: Partial direct writeback: partial_2_16
"tbz x14, #1, 139f\n"
"str s10, [x13], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
"tbz x14, #0, 148f\n"
"st1 { v10.h }[2], [x13]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
"b 148f\n"
"139:" // Height 3: Partial direct writeback: partial_1_16
"tbz x14, #0, 148f\n"
"str h10, [x13, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
"b 148f\n"
"140:" // Height 3: Partial direct writeback: partial_8_0
"tbz x14, #3, 144f\n"
"st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
"tbz x14, #2, 142f\n"
"str d9, [x13], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x14, #1, 141f\n"
"st1 { v9.s }[2], [x13], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
"tbz x14, #0, 148f\n"
"st1 { v9.h }[6], [x13]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
"b 148f\n"
"141:" // Height 3: Partial direct writeback: partial_1_12
"tbz x14, #0, 148f\n"
"st1 { v9.h }[4], [x13]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
"b 148f\n"
"142:" // Height 3: Partial direct writeback: partial_2_8
"tbz x14, #1, 143f\n"
"str s9, [x13], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
"tbz x14, #0, 148f\n"
"st1 { v9.h }[2], [x13]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
"b 148f\n"
"143:" // Height 3: Partial direct writeback: partial_1_8
"tbz x14, #0, 148f\n"
"str h9, [x13, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
"b 148f\n"
"144:" // Height 3: Partial direct writeback: partial_4_0
"tbz x14, #2, 146f\n"
"str d8, [x13], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x14, #1, 145f\n"
"st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
"tbz x14, #0, 148f\n"
"st1 { v8.h }[6], [x13]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
"b 148f\n"
"145:" // Height 3: Partial direct writeback: partial_1_4
"tbz x14, #0, 148f\n"
"st1 { v8.h }[4], [x13]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
"b 148f\n"
"146:" // Height 3: Partial direct writeback: partial_2_0
"tbz x14, #1, 147f\n"
"str s8, [x13], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
"tbz x14, #0, 148f\n"
"st1 { v8.h }[2], [x13]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
"b 148f\n"
"147:" // Height 3: Partial direct writeback: partial_1_0
"str h8, [x13, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
"148:" // Height 3: Partial direct writeback: Done
"b 150f\n"
"149:" // Height 3: Full writeback
@@ -1996,33 +1994,33 @@ void a64_ffhybrid_fp16_mla_6x32 (
"str q10, [x13, #0x20]\n"
"str q11, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"150:" // Height 3: Writeback done
"subs x14, x14, #0x20\n"
"bgt 102b\n"
"b 302f\n"
"151:" // Height 4
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"152:" // Height 4: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0x18\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
"bgt 153f\n"
"cmp x14, #0x10\n"
"mov x9, x12\n"
@@ -2035,18 +2033,18 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cbz x15, 154f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -2054,215 +2052,215 @@ void a64_ffhybrid_fp16_mla_6x32 (
"154:" // Height 4: no bias
"tbz %x[flags], #0, 172f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x14, #0x20\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
+ "cmp x14, #0x20\n"
+ "add x23, x24, x20, LSL #1\n"
"bge 171f\n"
"tbz x14, #4, 162f\n"
"ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
"ld1 { v9.8h }, [x13], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
- "ld1 { v21.8h }, [x24], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
"tbz x14, #3, 158f\n"
"ld1 { v10.8h }, [x13], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
- "ld1 { v22.8h }, [x24], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
"tbz x14, #2, 156f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x14, #1, 155f\n"
"ld1 { v11.s }[2], [x13], #0x4\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
"tbz x14, #0, 170f\n"
"ld1 { v11.h }[6], [x13]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
"b 170f\n"
"155:" // Height 4: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x14, #0, 170f\n"
"ld1 { v11.h }[4], [x13]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
"b 170f\n"
"156:" // Height 4: Partial accumulate: partial_2_24
"tbz x14, #1, 157f\n"
"ldr s11, [x13], #0x4\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"mov x20, #0x34\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
"tbz x14, #0, 170f\n"
"ld1 { v11.h }[2], [x13]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
"b 170f\n"
"157:" // Height 4: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x14, #0, 170f\n"
"ldr h11, [x13, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
"b 170f\n"
"158:" // Height 4: Partial accumulate: partial_4_16
"tbz x14, #2, 160f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x14, #1, 159f\n"
"ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
"tbz x14, #0, 170f\n"
"ld1 { v10.h }[6], [x13]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
- "ld1 { v22.h }[6], [x24]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
"b 170f\n"
"159:" // Height 4: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x14, #0, 170f\n"
"ld1 { v10.h }[4], [x13]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
- "ld1 { v22.h }[4], [x24]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
"b 170f\n"
"160:" // Height 4: Partial accumulate: partial_2_16
"tbz x14, #1, 161f\n"
"ldr s10, [x13], #0x4\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"mov x20, #0x24\n"
- "ldr s18, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
"tbz x14, #0, 170f\n"
"ld1 { v10.h }[2], [x13]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
- "ld1 { v22.h }[2], [x24]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
"b 170f\n"
"161:" // Height 4: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x14, #0, 170f\n"
"ldr h10, [x13, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
- "ldr h22, [x24, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
"b 170f\n"
"162:" // Height 4: Partial accumulate: partial_8_0
"tbz x14, #3, 166f\n"
"ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
"tbz x14, #2, 164f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x14, #1, 163f\n"
"ld1 { v9.s }[2], [x13], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
"tbz x14, #0, 170f\n"
"ld1 { v9.h }[6], [x13]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
"b 170f\n"
"163:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x14, #0, 170f\n"
"ld1 { v9.h }[4], [x13]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
"b 170f\n"
"164:" // Height 4: Partial accumulate: partial_2_8
"tbz x14, #1, 165f\n"
"ldr s9, [x13], #0x4\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"mov x20, #0x14\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
"tbz x14, #0, 170f\n"
"ld1 { v9.h }[2], [x13]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
"b 170f\n"
"165:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x14, #0, 170f\n"
"ldr h9, [x13, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
"b 170f\n"
"166:" // Height 4: Partial accumulate: partial_4_0
"tbz x14, #2, 168f\n"
"ldr d8, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x14, #1, 167f\n"
"ld1 { v8.s }[2], [x13], #0x4\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
"tbz x14, #0, 170f\n"
"ld1 { v8.h }[6], [x13]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
- "ld1 { v20.h }[6], [x24]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
"b 170f\n"
"167:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x14, #0, 170f\n"
"ld1 { v8.h }[4], [x13]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
- "ld1 { v20.h }[4], [x24]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
"b 170f\n"
"168:" // Height 4: Partial accumulate: partial_2_0
"tbz x14, #1, 169f\n"
"ldr s8, [x13], #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"mov x20, #0x4\n"
- "ldr s16, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
"tbz x14, #0, 170f\n"
"ld1 { v8.h }[2], [x13]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
- "ld1 { v20.h }[2], [x24]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
"b 170f\n"
"169:" // Height 4: Partial accumulate: partial_1_0
"ldr h8, [x13, #0x0]\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h16, [x25, #0x0]\n"
- "ldr h20, [x24, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
"170:" // Height 4: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 173f\n"
@@ -2271,18 +2269,18 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n"
"ldr q11, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"b 173f\n"
"172:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -2305,8 +2303,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"174:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 175f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2341,11 +2339,11 @@ void a64_ffhybrid_fp16_mla_6x32 (
"fmla v8.8h, v6.8h, v0.h[0]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
"sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
+ "cmp x27, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
"ldr q25, [x10, #0x0]\n"
- "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"add x25, x25, #0x10\n"
@@ -2691,16 +2689,16 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr h0, [x23], #0x2\n"
"ldr q25, [x12, #0x0]\n"
"ldr q24, [x11, #0x0]\n"
- "add x12, x12, #0x10\n"
- "add x11, x11, #0x10\n"
"fmla v8.8h, v25.8h, v3.h[0]\n"
"fmla v12.8h, v25.8h, v2.h[0]\n"
"fmla v16.8h, v25.8h, v1.h[0]\n"
"fmla v20.8h, v25.8h, v0.h[0]\n"
"ldr q25, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v9.8h, v24.8h, v3.h[0]\n"
"fmla v13.8h, v24.8h, v2.h[0]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v17.8h, v24.8h, v1.h[0]\n"
"fmla v21.8h, v24.8h, v0.h[0]\n"
"ldr q24, [x9, #0x0]\n"
@@ -2720,13 +2718,13 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 174b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 182f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v25.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v25.8h }, [x21]\n"
"ld1r { v24.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v25.8h\n"
"fmin v9.8h, v9.8h, v25.8h\n"
@@ -2766,191 +2764,191 @@ void a64_ffhybrid_fp16_mla_6x32 (
"tbz x14, #4, 190f\n"
"st1 { v8.8h }, [x13], #0x10\n"
"st1 { v9.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v21.8h }, [x24], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
"tbz x14, #3, 186f\n"
"st1 { v10.8h }, [x13], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
- "st1 { v22.8h }, [x24], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
"tbz x14, #2, 184f\n"
"str d11, [x13], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
"tbz x14, #1, 183f\n"
"st1 { v11.s }[2], [x13], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
- "st1 { v23.s }[2], [x24], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
"tbz x14, #0, 198f\n"
"st1 { v11.h }[6], [x13]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
- "st1 { v23.h }[6], [x24]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
"b 198f\n"
"183:" // Height 4: Partial direct writeback: partial_1_28
"tbz x14, #0, 198f\n"
"st1 { v11.h }[4], [x13]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
- "st1 { v23.h }[4], [x24]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
"b 198f\n"
"184:" // Height 4: Partial direct writeback: partial_2_24
"tbz x14, #1, 185f\n"
"str s11, [x13], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
- "str s23, [x24], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
"tbz x14, #0, 198f\n"
"st1 { v11.h }[2], [x13]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
- "st1 { v23.h }[2], [x24]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
"b 198f\n"
"185:" // Height 4: Partial direct writeback: partial_1_24
"tbz x14, #0, 198f\n"
"str h11, [x13, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
- "str h23, [x24, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
"b 198f\n"
"186:" // Height 4: Partial direct writeback: partial_4_16
"tbz x14, #2, 188f\n"
"str d10, [x13], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
"tbz x14, #1, 187f\n"
"st1 { v10.s }[2], [x13], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
- "st1 { v22.s }[2], [x24], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
"tbz x14, #0, 198f\n"
"st1 { v10.h }[6], [x13]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
- "st1 { v22.h }[6], [x24]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
"b 198f\n"
"187:" // Height 4: Partial direct writeback: partial_1_20
"tbz x14, #0, 198f\n"
"st1 { v10.h }[4], [x13]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
- "st1 { v22.h }[4], [x24]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
"b 198f\n"
"188:" // Height 4: Partial direct writeback: partial_2_16
"tbz x14, #1, 189f\n"
"str s10, [x13], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
- "str s22, [x24], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
"tbz x14, #0, 198f\n"
"st1 { v10.h }[2], [x13]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
- "st1 { v22.h }[2], [x24]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
"b 198f\n"
"189:" // Height 4: Partial direct writeback: partial_1_16
"tbz x14, #0, 198f\n"
"str h10, [x13, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
- "str h22, [x24, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
"b 198f\n"
"190:" // Height 4: Partial direct writeback: partial_8_0
"tbz x14, #3, 194f\n"
"st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
"tbz x14, #2, 192f\n"
"str d9, [x13], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
"tbz x14, #1, 191f\n"
"st1 { v9.s }[2], [x13], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
"tbz x14, #0, 198f\n"
"st1 { v9.h }[6], [x13]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
- "st1 { v21.h }[6], [x24]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
"b 198f\n"
"191:" // Height 4: Partial direct writeback: partial_1_12
"tbz x14, #0, 198f\n"
"st1 { v9.h }[4], [x13]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
- "st1 { v21.h }[4], [x24]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
"b 198f\n"
"192:" // Height 4: Partial direct writeback: partial_2_8
"tbz x14, #1, 193f\n"
"str s9, [x13], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
- "str s21, [x24], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
"tbz x14, #0, 198f\n"
"st1 { v9.h }[2], [x13]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
- "st1 { v21.h }[2], [x24]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
"b 198f\n"
"193:" // Height 4: Partial direct writeback: partial_1_8
"tbz x14, #0, 198f\n"
"str h9, [x13, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
- "str h21, [x24, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
"b 198f\n"
"194:" // Height 4: Partial direct writeback: partial_4_0
"tbz x14, #2, 196f\n"
"str d8, [x13], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x14, #1, 195f\n"
"st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x14, #0, 198f\n"
"st1 { v8.h }[6], [x13]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
- "st1 { v20.h }[6], [x24]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
"b 198f\n"
"195:" // Height 4: Partial direct writeback: partial_1_4
"tbz x14, #0, 198f\n"
"st1 { v8.h }[4], [x13]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
- "st1 { v20.h }[4], [x24]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
"b 198f\n"
"196:" // Height 4: Partial direct writeback: partial_2_0
"tbz x14, #1, 197f\n"
"str s8, [x13], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x14, #0, 198f\n"
"st1 { v8.h }[2], [x13]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
- "st1 { v20.h }[2], [x24]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
"b 198f\n"
"197:" // Height 4: Partial direct writeback: partial_1_0
"str h8, [x13, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
- "str h20, [x24, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
"198:" // Height 4: Partial direct writeback: Done
"b 200f\n"
"199:" // Height 4: Full writeback
@@ -2959,37 +2957,37 @@ void a64_ffhybrid_fp16_mla_6x32 (
"str q10, [x13, #0x20]\n"
"str q11, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
"200:" // Height 4: Writeback done
"subs x14, x14, #0x20\n"
"bgt 152b\n"
"b 302f\n"
"201:" // Height 5
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"202:" // Height 5: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0x18\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
"bgt 203f\n"
"cmp x14, #0x10\n"
"mov x9, x12\n"
@@ -3002,18 +3000,18 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cbz x15, 204f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -3025,248 +3023,248 @@ void a64_ffhybrid_fp16_mla_6x32 (
"204:" // Height 5: no bias
"tbz %x[flags], #0, 222f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x14, #0x20\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "cmp x14, #0x20\n"
+ "add x22, x23, x20, LSL #1\n"
"bge 221f\n"
"tbz x14, #4, 212f\n"
"ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
"ld1 { v9.8h }, [x13], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
- "ld1 { v21.8h }, [x24], #0x10\n"
- "ld1 { v25.8h }, [x23], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
"tbz x14, #3, 208f\n"
"ld1 { v10.8h }, [x13], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
- "ld1 { v22.8h }, [x24], #0x10\n"
- "ld1 { v26.8h }, [x23], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
"tbz x14, #2, 206f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x14, #1, 205f\n"
"ld1 { v11.s }[2], [x13], #0x4\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
"tbz x14, #0, 220f\n"
"ld1 { v11.h }[6], [x13]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
- "ld1 { v27.h }[6], [x23]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
"b 220f\n"
"205:" // Height 5: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x14, #0, 220f\n"
"ld1 { v11.h }[4], [x13]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
- "ld1 { v27.h }[4], [x23]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
"b 220f\n"
"206:" // Height 5: Partial accumulate: partial_2_24
"tbz x14, #1, 207f\n"
"ldr s11, [x13], #0x4\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"mov x20, #0x34\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
"tbz x14, #0, 220f\n"
"ld1 { v11.h }[2], [x13]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
- "ld1 { v27.h }[2], [x23]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
"b 220f\n"
"207:" // Height 5: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x14, #0, 220f\n"
"ldr h11, [x13, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
"b 220f\n"
"208:" // Height 5: Partial accumulate: partial_4_16
"tbz x14, #2, 210f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x14, #1, 209f\n"
"ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
"tbz x14, #0, 220f\n"
"ld1 { v10.h }[6], [x13]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
- "ld1 { v22.h }[6], [x24]\n"
- "ld1 { v26.h }[6], [x23]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
"b 220f\n"
"209:" // Height 5: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x14, #0, 220f\n"
"ld1 { v10.h }[4], [x13]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
- "ld1 { v22.h }[4], [x24]\n"
- "ld1 { v26.h }[4], [x23]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
"b 220f\n"
"210:" // Height 5: Partial accumulate: partial_2_16
"tbz x14, #1, 211f\n"
"ldr s10, [x13], #0x4\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"mov x20, #0x24\n"
- "ldr s18, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
"tbz x14, #0, 220f\n"
"ld1 { v10.h }[2], [x13]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
- "ld1 { v22.h }[2], [x24]\n"
- "ld1 { v26.h }[2], [x23]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
"b 220f\n"
"211:" // Height 5: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x14, #0, 220f\n"
"ldr h10, [x13, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
- "ldr h22, [x24, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
"b 220f\n"
"212:" // Height 5: Partial accumulate: partial_8_0
"tbz x14, #3, 216f\n"
"ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
"tbz x14, #2, 214f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x14, #1, 213f\n"
"ld1 { v9.s }[2], [x13], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
"tbz x14, #0, 220f\n"
"ld1 { v9.h }[6], [x13]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
- "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
"b 220f\n"
"213:" // Height 5: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x14, #0, 220f\n"
"ld1 { v9.h }[4], [x13]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
- "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
"b 220f\n"
"214:" // Height 5: Partial accumulate: partial_2_8
"tbz x14, #1, 215f\n"
"ldr s9, [x13], #0x4\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"mov x20, #0x14\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
"tbz x14, #0, 220f\n"
"ld1 { v9.h }[2], [x13]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
- "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
"b 220f\n"
"215:" // Height 5: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x14, #0, 220f\n"
"ldr h9, [x13, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
"b 220f\n"
"216:" // Height 5: Partial accumulate: partial_4_0
"tbz x14, #2, 218f\n"
"ldr d8, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x14, #1, 217f\n"
"ld1 { v8.s }[2], [x13], #0x4\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
"tbz x14, #0, 220f\n"
"ld1 { v8.h }[6], [x13]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
- "ld1 { v20.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
"b 220f\n"
"217:" // Height 5: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x14, #0, 220f\n"
"ld1 { v8.h }[4], [x13]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
- "ld1 { v20.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
"b 220f\n"
"218:" // Height 5: Partial accumulate: partial_2_0
"tbz x14, #1, 219f\n"
"ldr s8, [x13], #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"mov x20, #0x4\n"
- "ldr s16, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
"tbz x14, #0, 220f\n"
"ld1 { v8.h }[2], [x13]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
- "ld1 { v20.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
"b 220f\n"
"219:" // Height 5: Partial accumulate: partial_1_0
"ldr h8, [x13, #0x0]\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h16, [x25, #0x0]\n"
- "ldr h20, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
"220:" // Height 5: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 223f\n"
@@ -3275,22 +3273,22 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n"
"ldr q11, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
"b 223f\n"
"222:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
@@ -3317,8 +3315,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"224:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 225f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -3357,10 +3355,10 @@ void a64_ffhybrid_fp16_mla_6x32 (
"fmla v8.8h, v6.8h, v0.h[0]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
"sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
+ "cmp x27, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
"ldr q29, [x10, #0x0]\n"
@@ -3774,19 +3772,19 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr h1, [x23], #0x2\n"
"ldr h0, [x22], #0x2\n"
"ldr q29, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
- "ldr q28, [x11, #0x0]\n"
- "add x11, x11, #0x10\n"
"fmla v8.8h, v29.8h, v4.h[0]\n"
"fmla v12.8h, v29.8h, v3.h[0]\n"
+ "ldr q28, [x11, #0x0]\n"
"fmla v16.8h, v29.8h, v2.h[0]\n"
"fmla v20.8h, v29.8h, v1.h[0]\n"
+ "add x12, x12, #0x10\n"
"fmla v24.8h, v29.8h, v0.h[0]\n"
"ldr q29, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
"fmla v9.8h, v28.8h, v4.h[0]\n"
+ "add x11, x11, #0x10\n"
"fmla v13.8h, v28.8h, v3.h[0]\n"
"fmla v17.8h, v28.8h, v2.h[0]\n"
+ "add x10, x10, #0x10\n"
"fmla v21.8h, v28.8h, v1.h[0]\n"
"fmla v25.8h, v28.8h, v0.h[0]\n"
"ldr q28, [x9, #0x0]\n"
@@ -3808,14 +3806,14 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 224b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 232f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v29.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v29.8h }, [x21]\n"
"ld1r { v28.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v29.8h\n"
"fmin v9.8h, v9.8h, v29.8h\n"
@@ -3863,223 +3861,223 @@ void a64_ffhybrid_fp16_mla_6x32 (
"tbz x14, #4, 240f\n"
"st1 { v8.8h }, [x13], #0x10\n"
"st1 { v9.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v21.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v25.8h }, [x23], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
"tbz x14, #3, 236f\n"
"st1 { v10.8h }, [x13], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
- "st1 { v22.8h }, [x24], #0x10\n"
- "st1 { v26.8h }, [x23], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
"tbz x14, #2, 234f\n"
"str d11, [x13], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x14, #1, 233f\n"
"st1 { v11.s }[2], [x13], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
- "st1 { v23.s }[2], [x24], #0x4\n"
- "st1 { v27.s }[2], [x23], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
"tbz x14, #0, 248f\n"
"st1 { v11.h }[6], [x13]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
- "st1 { v23.h }[6], [x24]\n"
- "st1 { v27.h }[6], [x23]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
"b 248f\n"
"233:" // Height 5: Partial direct writeback: partial_1_28
"tbz x14, #0, 248f\n"
"st1 { v11.h }[4], [x13]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
- "st1 { v23.h }[4], [x24]\n"
- "st1 { v27.h }[4], [x23]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
"b 248f\n"
"234:" // Height 5: Partial direct writeback: partial_2_24
"tbz x14, #1, 235f\n"
"str s11, [x13], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
- "str s23, [x24], #0x4\n"
- "str s27, [x23], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
"tbz x14, #0, 248f\n"
"st1 { v11.h }[2], [x13]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
- "st1 { v23.h }[2], [x24]\n"
- "st1 { v27.h }[2], [x23]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
"b 248f\n"
"235:" // Height 5: Partial direct writeback: partial_1_24
"tbz x14, #0, 248f\n"
"str h11, [x13, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
- "str h23, [x24, #0x0]\n"
- "str h27, [x23, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
"b 248f\n"
"236:" // Height 5: Partial direct writeback: partial_4_16
"tbz x14, #2, 238f\n"
"str d10, [x13], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x14, #1, 237f\n"
"st1 { v10.s }[2], [x13], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
- "st1 { v22.s }[2], [x24], #0x4\n"
- "st1 { v26.s }[2], [x23], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
"tbz x14, #0, 248f\n"
"st1 { v10.h }[6], [x13]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
- "st1 { v22.h }[6], [x24]\n"
- "st1 { v26.h }[6], [x23]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
"b 248f\n"
"237:" // Height 5: Partial direct writeback: partial_1_20
"tbz x14, #0, 248f\n"
"st1 { v10.h }[4], [x13]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
- "st1 { v22.h }[4], [x24]\n"
- "st1 { v26.h }[4], [x23]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
"b 248f\n"
"238:" // Height 5: Partial direct writeback: partial_2_16
"tbz x14, #1, 239f\n"
"str s10, [x13], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
- "str s22, [x24], #0x4\n"
- "str s26, [x23], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
"tbz x14, #0, 248f\n"
"st1 { v10.h }[2], [x13]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
- "st1 { v22.h }[2], [x24]\n"
- "st1 { v26.h }[2], [x23]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
"b 248f\n"
"239:" // Height 5: Partial direct writeback: partial_1_16
"tbz x14, #0, 248f\n"
"str h10, [x13, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
- "str h22, [x24, #0x0]\n"
- "str h26, [x23, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
"b 248f\n"
"240:" // Height 5: Partial direct writeback: partial_8_0
"tbz x14, #3, 244f\n"
"st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
"tbz x14, #2, 242f\n"
"str d9, [x13], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x14, #1, 241f\n"
"st1 { v9.s }[2], [x13], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
- "st1 { v25.s }[2], [x23], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
"tbz x14, #0, 248f\n"
"st1 { v9.h }[6], [x13]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
- "st1 { v21.h }[6], [x24]\n"
- "st1 { v25.h }[6], [x23]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
"b 248f\n"
"241:" // Height 5: Partial direct writeback: partial_1_12
"tbz x14, #0, 248f\n"
"st1 { v9.h }[4], [x13]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
- "st1 { v21.h }[4], [x24]\n"
- "st1 { v25.h }[4], [x23]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
"b 248f\n"
"242:" // Height 5: Partial direct writeback: partial_2_8
"tbz x14, #1, 243f\n"
"str s9, [x13], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
- "str s21, [x24], #0x4\n"
- "str s25, [x23], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
"tbz x14, #0, 248f\n"
"st1 { v9.h }[2], [x13]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
- "st1 { v21.h }[2], [x24]\n"
- "st1 { v25.h }[2], [x23]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
"b 248f\n"
"243:" // Height 5: Partial direct writeback: partial_1_8
"tbz x14, #0, 248f\n"
"str h9, [x13, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
- "str h21, [x24, #0x0]\n"
- "str h25, [x23, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
"b 248f\n"
"244:" // Height 5: Partial direct writeback: partial_4_0
"tbz x14, #2, 246f\n"
"str d8, [x13], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x14, #1, 245f\n"
"st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x14, #0, 248f\n"
"st1 { v8.h }[6], [x13]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
- "st1 { v20.h }[6], [x24]\n"
- "st1 { v24.h }[6], [x23]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
"b 248f\n"
"245:" // Height 5: Partial direct writeback: partial_1_4
"tbz x14, #0, 248f\n"
"st1 { v8.h }[4], [x13]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
- "st1 { v20.h }[4], [x24]\n"
- "st1 { v24.h }[4], [x23]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
"b 248f\n"
"246:" // Height 5: Partial direct writeback: partial_2_0
"tbz x14, #1, 247f\n"
"str s8, [x13], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x14, #0, 248f\n"
"st1 { v8.h }[2], [x13]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
- "st1 { v20.h }[2], [x24]\n"
- "st1 { v24.h }[2], [x23]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
"b 248f\n"
"247:" // Height 5: Partial direct writeback: partial_1_0
"str h8, [x13, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
- "str h20, [x24, #0x0]\n"
- "str h24, [x23, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
"248:" // Height 5: Partial direct writeback: Done
"b 250f\n"
"249:" // Height 5: Full writeback
@@ -4088,45 +4086,44 @@ void a64_ffhybrid_fp16_mla_6x32 (
"str q10, [x13, #0x20]\n"
"str q11, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"250:" // Height 5: Writeback done
"subs x14, x14, #0x20\n"
"bgt 202b\n"
"b 302f\n"
"251:" // Height 6
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
- "mov x21, #0xc\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0xc\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "madd x21, x20, x21, x13\n"
- "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
"252:" // Height 6: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0x18\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
"bgt 253f\n"
"cmp x14, #0x10\n"
"mov x9, x12\n"
@@ -4139,18 +4136,18 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cbz x15, 254f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -4166,281 +4163,281 @@ void a64_ffhybrid_fp16_mla_6x32 (
"254:" // Height 6: no bias
"tbz %x[flags], #0, 272f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x14, #0x20\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "cmp x14, #0x20\n"
+ "add x21, x22, x20, LSL #1\n"
"bge 271f\n"
"tbz x14, #4, 262f\n"
"ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
- "ld1 { v28.8h }, [x22], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
"ld1 { v9.8h }, [x13], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
- "ld1 { v21.8h }, [x24], #0x10\n"
- "ld1 { v25.8h }, [x23], #0x10\n"
- "ld1 { v29.8h }, [x22], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
+ "ld1 { v29.8h }, [x21], #0x10\n"
"tbz x14, #3, 258f\n"
"ld1 { v10.8h }, [x13], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
- "ld1 { v22.8h }, [x24], #0x10\n"
- "ld1 { v26.8h }, [x23], #0x10\n"
- "ld1 { v30.8h }, [x22], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
+ "ld1 { v30.8h }, [x21], #0x10\n"
"tbz x14, #2, 256f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x14, #1, 255f\n"
"ld1 { v11.s }[2], [x13], #0x4\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz x14, #0, 270f\n"
"ld1 { v11.h }[6], [x13]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
- "ld1 { v27.h }[6], [x23]\n"
- "ld1 { v31.h }[6], [x22]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
+ "ld1 { v31.h }[6], [x21]\n"
"b 270f\n"
"255:" // Height 6: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x14, #0, 270f\n"
"ld1 { v11.h }[4], [x13]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
- "ld1 { v27.h }[4], [x23]\n"
- "ld1 { v31.h }[4], [x22]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
+ "ld1 { v31.h }[4], [x21]\n"
"b 270f\n"
"256:" // Height 6: Partial accumulate: partial_2_24
"tbz x14, #1, 257f\n"
"ldr s11, [x13], #0x4\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"mov x20, #0x34\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz x14, #0, 270f\n"
"ld1 { v11.h }[2], [x13]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
- "ld1 { v27.h }[2], [x23]\n"
- "ld1 { v31.h }[2], [x22]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
+ "ld1 { v31.h }[2], [x21]\n"
"b 270f\n"
"257:" // Height 6: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x14, #0, 270f\n"
"ldr h11, [x13, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
- "ldr h31, [x22, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
+ "ldr h31, [x21, #0x0]\n"
"b 270f\n"
"258:" // Height 6: Partial accumulate: partial_4_16
"tbz x14, #2, 260f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x14, #1, 259f\n"
"ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
- "ld1 { v30.s }[2], [x22], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
+ "ld1 { v30.s }[2], [x21], #0x4\n"
"tbz x14, #0, 270f\n"
"ld1 { v10.h }[6], [x13]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
- "ld1 { v22.h }[6], [x24]\n"
- "ld1 { v26.h }[6], [x23]\n"
- "ld1 { v30.h }[6], [x22]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
+ "ld1 { v30.h }[6], [x21]\n"
"b 270f\n"
"259:" // Height 6: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x14, #0, 270f\n"
"ld1 { v10.h }[4], [x13]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
- "ld1 { v22.h }[4], [x24]\n"
- "ld1 { v26.h }[4], [x23]\n"
- "ld1 { v30.h }[4], [x22]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
+ "ld1 { v30.h }[4], [x21]\n"
"b 270f\n"
"260:" // Height 6: Partial accumulate: partial_2_16
"tbz x14, #1, 261f\n"
"ldr s10, [x13], #0x4\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"mov x20, #0x24\n"
- "ldr s18, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s30, [x22], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
+ "ldr s30, [x21], #0x4\n"
"tbz x14, #0, 270f\n"
"ld1 { v10.h }[2], [x13]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
- "ld1 { v22.h }[2], [x24]\n"
- "ld1 { v26.h }[2], [x23]\n"
- "ld1 { v30.h }[2], [x22]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
+ "ld1 { v30.h }[2], [x21]\n"
"b 270f\n"
"261:" // Height 6: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x14, #0, 270f\n"
"ldr h10, [x13, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
- "ldr h22, [x24, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
- "ldr h30, [x22, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
+ "ldr h30, [x21, #0x0]\n"
"b 270f\n"
"262:" // Height 6: Partial accumulate: partial_8_0
"tbz x14, #3, 266f\n"
"ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
- "ld1 { v28.8h }, [x22], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
"tbz x14, #2, 264f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x14, #1, 263f\n"
"ld1 { v9.s }[2], [x13], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v25.s }[2], [x23], #0x4\n"
- "ld1 { v29.s }[2], [x22], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v29.s }[2], [x21], #0x4\n"
"tbz x14, #0, 270f\n"
"ld1 { v9.h }[6], [x13]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
- "ld1 { v25.h }[6], [x23]\n"
- "ld1 { v29.h }[6], [x22]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "ld1 { v29.h }[6], [x21]\n"
"b 270f\n"
"263:" // Height 6: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x14, #0, 270f\n"
"ld1 { v9.h }[4], [x13]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
- "ld1 { v25.h }[4], [x23]\n"
- "ld1 { v29.h }[4], [x22]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "ld1 { v29.h }[4], [x21]\n"
"b 270f\n"
"264:" // Height 6: Partial accumulate: partial_2_8
"tbz x14, #1, 265f\n"
"ldr s9, [x13], #0x4\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"mov x20, #0x14\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
- "ldr s29, [x22], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s29, [x21], #0x4\n"
"tbz x14, #0, 270f\n"
"ld1 { v9.h }[2], [x13]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
- "ld1 { v25.h }[2], [x23]\n"
- "ld1 { v29.h }[2], [x22]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "ld1 { v29.h }[2], [x21]\n"
"b 270f\n"
"265:" // Height 6: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x14, #0, 270f\n"
"ldr h9, [x13, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
- "ldr h29, [x22, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "ldr h29, [x21, #0x0]\n"
"b 270f\n"
"266:" // Height 6: Partial accumulate: partial_4_0
"tbz x14, #2, 268f\n"
"ldr d8, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x14, #1, 267f\n"
"ld1 { v8.s }[2], [x13], #0x4\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
"tbz x14, #0, 270f\n"
"ld1 { v8.h }[6], [x13]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
- "ld1 { v20.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
- "ld1 { v28.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v28.h }[6], [x21]\n"
"b 270f\n"
"267:" // Height 6: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x14, #0, 270f\n"
"ld1 { v8.h }[4], [x13]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
- "ld1 { v20.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
- "ld1 { v28.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v28.h }[4], [x21]\n"
"b 270f\n"
"268:" // Height 6: Partial accumulate: partial_2_0
"tbz x14, #1, 269f\n"
"ldr s8, [x13], #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"mov x20, #0x4\n"
- "ldr s16, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
"tbz x14, #0, 270f\n"
"ld1 { v8.h }[2], [x13]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
- "ld1 { v20.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "ld1 { v28.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v28.h }[2], [x21]\n"
"b 270f\n"
"269:" // Height 6: Partial accumulate: partial_1_0
"ldr h8, [x13, #0x0]\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h16, [x25, #0x0]\n"
- "ldr h20, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "ldr h28, [x22, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h28, [x21, #0x0]\n"
"270:" // Height 6: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 273f\n"
@@ -4449,26 +4446,26 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n"
"ldr q11, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x22, #0x0]\n"
- "ldr q29, [x22, #0x10]\n"
- "ldr q30, [x22, #0x20]\n"
- "ldr q31, [x22, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"b 273f\n"
"272:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
@@ -4499,8 +4496,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"274:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 275f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -4543,10 +4540,10 @@ void a64_ffhybrid_fp16_mla_6x32 (
"fmla v8.8h, v6.8h, v0.h[0]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
"sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
+ "cmp x27, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
"fmla v28.8h, v6.8h, v5.h[0]\n"
@@ -5029,12 +5026,12 @@ void a64_ffhybrid_fp16_mla_6x32 (
"ldr h2, [x21], #0x2\n"
"ldr q1, [x12, #0x0]\n"
"ldr q0, [x11, #0x0]\n"
- "add x12, x12, #0x10\n"
- "add x11, x11, #0x10\n"
"fmla v8.8h, v1.8h, v7.h[0]\n"
"fmla v12.8h, v1.8h, v6.h[0]\n"
"fmla v16.8h, v1.8h, v5.h[0]\n"
"fmla v20.8h, v1.8h, v4.h[0]\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v24.8h, v1.8h, v3.h[0]\n"
"fmla v28.8h, v1.8h, v2.h[0]\n"
"ldr q1, [x10, #0x0]\n"
@@ -5066,15 +5063,15 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 274b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"tbz %x[flags], #1, 282f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.8h }, [x21]\n"
"ld1r { v0.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v1.8h\n"
"fmin v9.8h, v9.8h, v1.8h\n"
@@ -5130,255 +5127,255 @@ void a64_ffhybrid_fp16_mla_6x32 (
"tbz x14, #4, 290f\n"
"st1 { v8.8h }, [x13], #0x10\n"
"st1 { v9.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v21.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v25.8h }, [x23], #0x10\n"
- "st1 { v28.8h }, [x22], #0x10\n"
- "st1 { v29.8h }, [x22], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "st1 { v29.8h }, [x21], #0x10\n"
"tbz x14, #3, 286f\n"
"st1 { v10.8h }, [x13], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
- "st1 { v22.8h }, [x24], #0x10\n"
- "st1 { v26.8h }, [x23], #0x10\n"
- "st1 { v30.8h }, [x22], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
+ "st1 { v30.8h }, [x21], #0x10\n"
"tbz x14, #2, 284f\n"
"str d11, [x13], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x14, #1, 283f\n"
"st1 { v11.s }[2], [x13], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
- "st1 { v23.s }[2], [x24], #0x4\n"
- "st1 { v27.s }[2], [x23], #0x4\n"
- "st1 { v31.s }[2], [x22], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
+ "st1 { v31.s }[2], [x21], #0x4\n"
"tbz x14, #0, 298f\n"
"st1 { v11.h }[6], [x13]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
- "st1 { v23.h }[6], [x24]\n"
- "st1 { v27.h }[6], [x23]\n"
- "st1 { v31.h }[6], [x22]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
+ "st1 { v31.h }[6], [x21]\n"
"b 298f\n"
"283:" // Height 6: Partial direct writeback: partial_1_28
"tbz x14, #0, 298f\n"
"st1 { v11.h }[4], [x13]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
- "st1 { v23.h }[4], [x24]\n"
- "st1 { v27.h }[4], [x23]\n"
- "st1 { v31.h }[4], [x22]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
+ "st1 { v31.h }[4], [x21]\n"
"b 298f\n"
"284:" // Height 6: Partial direct writeback: partial_2_24
"tbz x14, #1, 285f\n"
"str s11, [x13], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
- "str s23, [x24], #0x4\n"
- "str s27, [x23], #0x4\n"
- "str s31, [x22], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
+ "str s31, [x21], #0x4\n"
"tbz x14, #0, 298f\n"
"st1 { v11.h }[2], [x13]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
- "st1 { v23.h }[2], [x24]\n"
- "st1 { v27.h }[2], [x23]\n"
- "st1 { v31.h }[2], [x22]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "st1 { v31.h }[2], [x21]\n"
"b 298f\n"
"285:" // Height 6: Partial direct writeback: partial_1_24
"tbz x14, #0, 298f\n"
"str h11, [x13, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
- "str h23, [x24, #0x0]\n"
- "str h27, [x23, #0x0]\n"
- "str h31, [x22, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
+ "str h31, [x21, #0x0]\n"
"b 298f\n"
"286:" // Height 6: Partial direct writeback: partial_4_16
"tbz x14, #2, 288f\n"
"str d10, [x13], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x14, #1, 287f\n"
"st1 { v10.s }[2], [x13], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
- "st1 { v22.s }[2], [x24], #0x4\n"
- "st1 { v26.s }[2], [x23], #0x4\n"
- "st1 { v30.s }[2], [x22], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
"tbz x14, #0, 298f\n"
"st1 { v10.h }[6], [x13]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
- "st1 { v22.h }[6], [x24]\n"
- "st1 { v26.h }[6], [x23]\n"
- "st1 { v30.h }[6], [x22]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
"b 298f\n"
"287:" // Height 6: Partial direct writeback: partial_1_20
"tbz x14, #0, 298f\n"
"st1 { v10.h }[4], [x13]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
- "st1 { v22.h }[4], [x24]\n"
- "st1 { v26.h }[4], [x23]\n"
- "st1 { v30.h }[4], [x22]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
"b 298f\n"
"288:" // Height 6: Partial direct writeback: partial_2_16
"tbz x14, #1, 289f\n"
"str s10, [x13], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
- "str s22, [x24], #0x4\n"
- "str s26, [x23], #0x4\n"
- "str s30, [x22], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
+ "str s30, [x21], #0x4\n"
"tbz x14, #0, 298f\n"
"st1 { v10.h }[2], [x13]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
- "st1 { v22.h }[2], [x24]\n"
- "st1 { v26.h }[2], [x23]\n"
- "st1 { v30.h }[2], [x22]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
"b 298f\n"
"289:" // Height 6: Partial direct writeback: partial_1_16
"tbz x14, #0, 298f\n"
"str h10, [x13, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
- "str h22, [x24, #0x0]\n"
- "str h26, [x23, #0x0]\n"
- "str h30, [x22, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
+ "str h30, [x21, #0x0]\n"
"b 298f\n"
"290:" // Height 6: Partial direct writeback: partial_8_0
"tbz x14, #3, 294f\n"
"st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v28.8h }, [x22], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
"tbz x14, #2, 292f\n"
"str d9, [x13], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x14, #1, 291f\n"
"st1 { v9.s }[2], [x13], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
- "st1 { v25.s }[2], [x23], #0x4\n"
- "st1 { v29.s }[2], [x22], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
+ "st1 { v29.s }[2], [x21], #0x4\n"
"tbz x14, #0, 298f\n"
"st1 { v9.h }[6], [x13]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
- "st1 { v21.h }[6], [x24]\n"
- "st1 { v25.h }[6], [x23]\n"
- "st1 { v29.h }[6], [x22]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "st1 { v29.h }[6], [x21]\n"
"b 298f\n"
"291:" // Height 6: Partial direct writeback: partial_1_12
"tbz x14, #0, 298f\n"
"st1 { v9.h }[4], [x13]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
- "st1 { v21.h }[4], [x24]\n"
- "st1 { v25.h }[4], [x23]\n"
- "st1 { v29.h }[4], [x22]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "st1 { v29.h }[4], [x21]\n"
"b 298f\n"
"292:" // Height 6: Partial direct writeback: partial_2_8
"tbz x14, #1, 293f\n"
"str s9, [x13], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
- "str s21, [x24], #0x4\n"
- "str s25, [x23], #0x4\n"
- "str s29, [x22], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
+ "str s29, [x21], #0x4\n"
"tbz x14, #0, 298f\n"
"st1 { v9.h }[2], [x13]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
- "st1 { v21.h }[2], [x24]\n"
- "st1 { v25.h }[2], [x23]\n"
- "st1 { v29.h }[2], [x22]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "st1 { v29.h }[2], [x21]\n"
"b 298f\n"
"293:" // Height 6: Partial direct writeback: partial_1_8
"tbz x14, #0, 298f\n"
"str h9, [x13, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
- "str h21, [x24, #0x0]\n"
- "str h25, [x23, #0x0]\n"
- "str h29, [x22, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
+ "str h29, [x21, #0x0]\n"
"b 298f\n"
"294:" // Height 6: Partial direct writeback: partial_4_0
"tbz x14, #2, 296f\n"
"str d8, [x13], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x14, #1, 295f\n"
"st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x22], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x14, #0, 298f\n"
"st1 { v8.h }[6], [x13]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
- "st1 { v20.h }[6], [x24]\n"
- "st1 { v24.h }[6], [x23]\n"
- "st1 { v28.h }[6], [x22]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
+ "st1 { v28.h }[6], [x21]\n"
"b 298f\n"
"295:" // Height 6: Partial direct writeback: partial_1_4
"tbz x14, #0, 298f\n"
"st1 { v8.h }[4], [x13]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
- "st1 { v20.h }[4], [x24]\n"
- "st1 { v24.h }[4], [x23]\n"
- "st1 { v28.h }[4], [x22]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
+ "st1 { v28.h }[4], [x21]\n"
"b 298f\n"
"296:" // Height 6: Partial direct writeback: partial_2_0
"tbz x14, #1, 297f\n"
"str s8, [x13], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x22], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x14, #0, 298f\n"
"st1 { v8.h }[2], [x13]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
- "st1 { v20.h }[2], [x24]\n"
- "st1 { v24.h }[2], [x23]\n"
- "st1 { v28.h }[2], [x22]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
+ "st1 { v28.h }[2], [x21]\n"
"b 298f\n"
"297:" // Height 6: Partial direct writeback: partial_1_0
"str h8, [x13, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
- "str h20, [x24, #0x0]\n"
- "str h24, [x23, #0x0]\n"
- "str h28, [x22, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
+ "str h28, [x21, #0x0]\n"
"298:" // Height 6: Partial direct writeback: Done
"b 300f\n"
"299:" // Height 6: Full writeback
@@ -5387,26 +5384,26 @@ void a64_ffhybrid_fp16_mla_6x32 (
"str q10, [x13, #0x20]\n"
"str q11, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x22, #0x0]\n"
- "str q29, [x22, #0x10]\n"
- "str q30, [x22, #0x20]\n"
- "str q31, [x22, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
"300:" // Height 6: Writeback done
"subs x14, x14, #0x20\n"
"bgt 252b\n"
@@ -5422,8 +5419,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"302:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
index 658850e12c..8f2c7fdec5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
@@ -81,7 +81,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 1> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 1> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
index 5dcaa9e5d5..c6c5b047ca 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
@@ -49,19 +49,18 @@ void a64_ffhybrid_fp32_mla_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -82,7 +81,6 @@ void a64_ffhybrid_fp32_mla_6x16 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
ka.B_stride = B_stride;
switch(act.type) {
default:
@@ -107,19 +105,19 @@ void a64_ffhybrid_fp32_mla_6x16 (
"bgt 69f\n"
"beq 35f\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #2\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 3f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -210,8 +208,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"16:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -235,9 +233,6 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr q17, [x10, #0x0]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"ldr q16, [x9, #0x0]\n"
- "sub x27, x27, #0x4\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x8\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
"ldr q17, [x12, #0x10]\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
@@ -256,29 +251,30 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr q16, [x9, #0x20]\n"
"fmla v10.4s, v17.4s, v0.s[2]\n"
"ldr q17, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"fmla v11.4s, v16.4s, v0.s[2]\n"
"ldr q16, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
"fmla v8.4s, v17.4s, v0.s[3]\n"
"ldr q17, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
"fmla v9.4s, v16.4s, v0.s[3]\n"
"ldr q16, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
"fmla v10.4s, v17.4s, v0.s[3]\n"
- "ldr q6, [x12, #0x0]\n"
"fmla v11.4s, v16.4s, v0.s[3]\n"
+ "add x26, x26, #0x10\n"
"ldr q0, [x26, #0x0]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q6, [x12, #0x0]\n"
+ "add x11, x11, #0x40\n"
"ldr q7, [x11, #0x0]\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
"bge 19b\n"
"20:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
"ldr q17, [x10, #0x0]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"ldr q16, [x9, #0x0]\n"
- "sub x27, x27, #0x4\n"
- "add x26, x26, #0x10\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
"ldr q17, [x12, #0x10]\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
@@ -297,35 +293,37 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr q16, [x9, #0x20]\n"
"fmla v10.4s, v17.4s, v0.s[2]\n"
"ldr q17, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"fmla v11.4s, v16.4s, v0.s[2]\n"
"ldr q16, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
"fmla v8.4s, v17.4s, v0.s[3]\n"
"ldr q17, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
"fmla v9.4s, v16.4s, v0.s[3]\n"
"ldr q16, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "sub x27, x27, #0x4\n"
"fmla v10.4s, v17.4s, v0.s[3]\n"
"fmla v11.4s, v16.4s, v0.s[3]\n"
+ "add x26, x26, #0x10\n"
+ "add x12, x12, #0x40\n"
+ "add x11, x11, #0x40\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
"21:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 23f\n"
"22:" // Height 1: Multiply loop: Odd block loop
"ldr s18, [x26], #0x4\n"
- "ldr q17, [x12, #0x0]\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v8.4s, v16.4s, v18.s[0]\n"
"sub x27, x27, #0x1\n"
+ "ldr q17, [x11, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v9.4s, v17.4s, v18.s[0]\n"
+ "fmla v10.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v11.4s, v16.4s, v18.s[0]\n"
"add x12, x12, #0x10\n"
- "ldr q16, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "fmla v8.4s, v17.4s, v18.s[0]\n"
- "ldr q17, [x10, #0x0]\n"
"add x10, x10, #0x10\n"
- "fmla v9.4s, v16.4s, v18.s[0]\n"
- "ldr q16, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "fmla v10.4s, v17.4s, v18.s[0]\n"
- "fmla v11.4s, v16.4s, v18.s[0]\n"
"cbnz x27, 22b\n"
"23:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -333,9 +331,9 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 16b\n"
"tbz %x[flags], #1, 24f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v17.4s\n"
"fmin v9.4s, v9.4s, v17.4s\n"
@@ -406,19 +404,19 @@ void a64_ffhybrid_fp32_mla_6x16 (
"b 206f\n"
"35:" // Height 2
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"36:" // Height 2: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #2\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 37f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -431,87 +429,87 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cbz x15, 38f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "add x15, x15, #0x40\n"
"b 49f\n"
"38:" // Height 2: no bias
"tbz %x[flags], #0, 48f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x14, #0x10\n"
- "add x26, x13, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"bge 47f\n"
"tbz x14, #3, 42f\n"
"ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
"tbz x14, #2, 40f\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
"tbz x14, #1, 39f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
"tbz x14, #0, 46f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x25]\n"
"b 46f\n"
"39:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 46f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
"b 46f\n"
"40:" // Height 2: Partial accumulate: partial_2_8
"tbz x14, #1, 41f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
"tbz x14, #0, 46f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x25]\n"
"b 46f\n"
"41:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 46f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
"b 46f\n"
"42:" // Height 2: Partial accumulate: partial_4_0
"tbz x14, #2, 44f\n"
"ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"tbz x14, #1, 43f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
"tbz x14, #0, 46f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v13.s }[2], [x25]\n"
"b 46f\n"
"43:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 46f\n"
"ldr s9, [x13, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
"b 46f\n"
"44:" // Height 2: Partial accumulate: partial_2_0
"tbz x14, #1, 45f\n"
"ldr d8, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
"tbz x14, #0, 46f\n"
"ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v12.s }[2], [x25]\n"
"b 46f\n"
"45:" // Height 2: Partial accumulate: partial_1_0
"ldr s8, [x13, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
"46:" // Height 2: Partial accumulate: Done
"sub x13, x13, x20\n"
@@ -521,10 +519,10 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n"
"ldr q11, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"b 49f\n"
"48:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -539,8 +537,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"50:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 51f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -571,15 +569,15 @@ void a64_ffhybrid_fp32_mla_6x16 (
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"ldr q16, [x9, #0x0]\n"
- "add x26, x26, #0x10\n"
"cmp x27, #0x8\n"
- "add x25, x25, #0x10\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
"fmla v14.4s, v17.4s, v1.s[0]\n"
"ldr q17, [x12, #0x10]\n"
+ "add x26, x26, #0x10\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
"fmla v15.4s, v16.4s, v1.s[0]\n"
"ldr q16, [x11, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v8.4s, v17.4s, v0.s[1]\n"
"fmla v12.4s, v17.4s, v1.s[1]\n"
"ldr q17, [x10, #0x10]\n"
@@ -632,10 +630,10 @@ void a64_ffhybrid_fp32_mla_6x16 (
"fmla v13.4s, v7.4s, v1.s[0]\n"
"ldr q16, [x9, #0x0]\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
"fmla v14.4s, v17.4s, v1.s[0]\n"
"ldr q17, [x12, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
"fmla v15.4s, v16.4s, v1.s[0]\n"
"ldr q16, [x11, #0x10]\n"
@@ -685,20 +683,20 @@ void a64_ffhybrid_fp32_mla_6x16 (
"sub x27, x27, #0x1\n"
"ldr q17, [x12, #0x0]\n"
"ldr q16, [x11, #0x0]\n"
- "add x12, x12, #0x10\n"
- "add x11, x11, #0x10\n"
"fmla v8.4s, v17.4s, v19.s[0]\n"
"fmla v12.4s, v17.4s, v18.s[0]\n"
"ldr q17, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
"fmla v9.4s, v16.4s, v19.s[0]\n"
"fmla v13.4s, v16.4s, v18.s[0]\n"
"ldr q16, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
"fmla v10.4s, v17.4s, v19.s[0]\n"
"fmla v14.4s, v17.4s, v18.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v11.4s, v16.4s, v19.s[0]\n"
"fmla v15.4s, v16.4s, v18.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
"cbnz x27, 56b\n"
"57:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -706,11 +704,11 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 50b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"tbz %x[flags], #1, 58f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v17.4s\n"
"fmin v9.4s, v9.4s, v17.4s\n"
@@ -734,63 +732,63 @@ void a64_ffhybrid_fp32_mla_6x16 (
"tbz x14, #3, 62f\n"
"st1 { v8.4s }, [x13], #0x10\n"
"st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
"tbz x14, #2, 60f\n"
"st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
"tbz x14, #1, 59f\n"
"str d11, [x13], #0x8\n"
- "str d15, [x26], #0x8\n"
+ "str d15, [x25], #0x8\n"
"tbz x14, #0, 66f\n"
"st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x26]\n"
+ "st1 { v15.s }[2], [x25]\n"
"b 66f\n"
"59:" // Height 2: Partial direct writeback: partial_1_12
"tbz x14, #0, 66f\n"
"str s11, [x13, #0x0]\n"
- "str s15, [x26, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
"b 66f\n"
"60:" // Height 2: Partial direct writeback: partial_2_8
"tbz x14, #1, 61f\n"
"str d10, [x13], #0x8\n"
- "str d14, [x26], #0x8\n"
+ "str d14, [x25], #0x8\n"
"tbz x14, #0, 66f\n"
"st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x26]\n"
+ "st1 { v14.s }[2], [x25]\n"
"b 66f\n"
"61:" // Height 2: Partial direct writeback: partial_1_8
"tbz x14, #0, 66f\n"
"str s10, [x13, #0x0]\n"
- "str s14, [x26, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
"b 66f\n"
"62:" // Height 2: Partial direct writeback: partial_4_0
"tbz x14, #2, 64f\n"
"st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
"tbz x14, #1, 63f\n"
"str d9, [x13], #0x8\n"
- "str d13, [x26], #0x8\n"
+ "str d13, [x25], #0x8\n"
"tbz x14, #0, 66f\n"
"st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x26]\n"
+ "st1 { v13.s }[2], [x25]\n"
"b 66f\n"
"63:" // Height 2: Partial direct writeback: partial_1_4
"tbz x14, #0, 66f\n"
"str s9, [x13, #0x0]\n"
- "str s13, [x26, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
"b 66f\n"
"64:" // Height 2: Partial direct writeback: partial_2_0
"tbz x14, #1, 65f\n"
"str d8, [x13], #0x8\n"
- "str d12, [x26], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x14, #0, 66f\n"
"st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x26]\n"
+ "st1 { v12.s }[2], [x25]\n"
"b 66f\n"
"65:" // Height 2: Partial direct writeback: partial_1_0
"str s8, [x13, #0x0]\n"
- "str s12, [x26, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
"66:" // Height 2: Partial direct writeback: Done
"b 68f\n"
"67:" // Height 2: Full writeback
@@ -799,29 +797,29 @@ void a64_ffhybrid_fp32_mla_6x16 (
"str q10, [x13, #0x20]\n"
"str q11, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
"68:" // Height 2: Writeback done
"subs x14, x14, #0x10\n"
"bgt 36b\n"
"b 206f\n"
"69:" // Height 3
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"70:" // Height 3: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #2\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 71f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -834,109 +832,109 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cbz x15, 72f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 83f\n"
"72:" // Height 3: no bias
"tbz %x[flags], #0, 82f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
"cmp x14, #0x10\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 81f\n"
"tbz x14, #3, 76f\n"
"ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"tbz x14, #2, 74f\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
"tbz x14, #1, 73f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x14, #0, 80f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
"b 80f\n"
"73:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 80f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
"b 80f\n"
"74:" // Height 3: Partial accumulate: partial_2_8
"tbz x14, #1, 75f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x14, #0, 80f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
"b 80f\n"
"75:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 80f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
"b 80f\n"
"76:" // Height 3: Partial accumulate: partial_4_0
"tbz x14, #2, 78f\n"
"ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
"tbz x14, #1, 77f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x14, #0, 80f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 80f\n"
"77:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 80f\n"
"ldr s9, [x13, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
"b 80f\n"
"78:" // Height 3: Partial accumulate: partial_2_0
"tbz x14, #1, 79f\n"
"ldr d8, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
"tbz x14, #0, 80f\n"
"ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
"b 80f\n"
"79:" // Height 3: Partial accumulate: partial_1_0
"ldr s8, [x13, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
"80:" // Height 3: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 83f\n"
@@ -945,14 +943,14 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n"
"ldr q11, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
"b 83f\n"
"82:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -971,8 +969,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"84:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 85f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1003,18 +1001,18 @@ void a64_ffhybrid_fp32_mla_6x16 (
"fmla v8.4s, v6.4s, v0.s[0]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
"sub x27, x27, #0x4\n"
- "add x26, x26, #0x10\n"
+ "cmp x27, #0x8\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"ldr q21, [x10, #0x0]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"ldr q20, [x9, #0x0]\n"
"add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
"fmla v10.4s, v21.4s, v0.s[0]\n"
"fmla v14.4s, v21.4s, v1.s[0]\n"
+ "add x24, x24, #0x10\n"
"fmla v18.4s, v21.4s, v2.s[0]\n"
"ldr q21, [x12, #0x10]\n"
"fmla v11.4s, v20.4s, v0.s[0]\n"
@@ -1049,8 +1047,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"fmla v14.4s, v21.4s, v1.s[2]\n"
"fmla v18.4s, v21.4s, v2.s[2]\n"
"ldr q21, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"fmla v11.4s, v20.4s, v0.s[2]\n"
+ "add x12, x12, #0x40\n"
"fmla v15.4s, v20.4s, v1.s[2]\n"
"fmla v19.4s, v20.4s, v2.s[2]\n"
"ldr q20, [x11, #0x30]\n"
@@ -1059,8 +1057,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"fmla v12.4s, v21.4s, v1.s[3]\n"
"fmla v16.4s, v21.4s, v2.s[3]\n"
"ldr q21, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
"fmla v9.4s, v20.4s, v0.s[3]\n"
+ "add x10, x10, #0x40\n"
"fmla v13.4s, v20.4s, v1.s[3]\n"
"fmla v17.4s, v20.4s, v2.s[3]\n"
"ldr q20, [x9, #0x30]\n"
@@ -1126,8 +1124,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"fmla v14.4s, v21.4s, v1.s[2]\n"
"fmla v18.4s, v21.4s, v2.s[2]\n"
"ldr q21, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"fmla v11.4s, v20.4s, v0.s[2]\n"
+ "add x12, x12, #0x40\n"
"fmla v15.4s, v20.4s, v1.s[2]\n"
"fmla v19.4s, v20.4s, v2.s[2]\n"
"ldr q20, [x11, #0x30]\n"
@@ -1136,8 +1134,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"fmla v12.4s, v21.4s, v1.s[3]\n"
"fmla v16.4s, v21.4s, v2.s[3]\n"
"ldr q21, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
"fmla v9.4s, v20.4s, v0.s[3]\n"
+ "add x10, x10, #0x40\n"
"fmla v13.4s, v20.4s, v1.s[3]\n"
"fmla v17.4s, v20.4s, v2.s[3]\n"
"ldr q20, [x9, #0x30]\n"
@@ -1156,23 +1154,23 @@ void a64_ffhybrid_fp32_mla_6x16 (
"sub x27, x27, #0x1\n"
"ldr s22, [x24], #0x4\n"
"ldr q21, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
- "ldr q20, [x11, #0x0]\n"
- "add x11, x11, #0x10\n"
"fmla v8.4s, v21.4s, v24.s[0]\n"
"fmla v12.4s, v21.4s, v23.s[0]\n"
+ "ldr q20, [x11, #0x0]\n"
"fmla v16.4s, v21.4s, v22.s[0]\n"
"ldr q21, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
"fmla v9.4s, v20.4s, v24.s[0]\n"
"fmla v13.4s, v20.4s, v23.s[0]\n"
"fmla v17.4s, v20.4s, v22.s[0]\n"
"ldr q20, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v10.4s, v21.4s, v24.s[0]\n"
"fmla v14.4s, v21.4s, v23.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v18.4s, v21.4s, v22.s[0]\n"
"fmla v11.4s, v20.4s, v24.s[0]\n"
+ "add x9, x9, #0x10\n"
"fmla v15.4s, v20.4s, v23.s[0]\n"
"fmla v19.4s, v20.4s, v22.s[0]\n"
"cbnz x27, 90b\n"
@@ -1182,12 +1180,12 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 84b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 92f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v21.4s }, [x21]\n"
"ld1r { v20.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v21.4s\n"
"fmin v9.4s, v9.4s, v21.4s\n"
@@ -1219,79 +1217,79 @@ void a64_ffhybrid_fp32_mla_6x16 (
"tbz x14, #3, 96f\n"
"st1 { v8.4s }, [x13], #0x10\n"
"st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
"tbz x14, #2, 94f\n"
"st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
"tbz x14, #1, 93f\n"
"str d11, [x13], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x14, #0, 100f\n"
"st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
"b 100f\n"
"93:" // Height 3: Partial direct writeback: partial_1_12
"tbz x14, #0, 100f\n"
"str s11, [x13, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
"b 100f\n"
"94:" // Height 3: Partial direct writeback: partial_2_8
"tbz x14, #1, 95f\n"
"str d10, [x13], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x14, #0, 100f\n"
"st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
"b 100f\n"
"95:" // Height 3: Partial direct writeback: partial_1_8
"tbz x14, #0, 100f\n"
"str s10, [x13, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
"b 100f\n"
"96:" // Height 3: Partial direct writeback: partial_4_0
"tbz x14, #2, 98f\n"
"st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
"tbz x14, #1, 97f\n"
"str d9, [x13], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x14, #0, 100f\n"
"st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
"b 100f\n"
"97:" // Height 3: Partial direct writeback: partial_1_4
"tbz x14, #0, 100f\n"
"str s9, [x13, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
"b 100f\n"
"98:" // Height 3: Partial direct writeback: partial_2_0
"tbz x14, #1, 99f\n"
"str d8, [x13], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x14, #0, 100f\n"
"st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
"b 100f\n"
"99:" // Height 3: Partial direct writeback: partial_1_0
"str s8, [x13, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
"100:" // Height 3: Partial direct writeback: Done
"b 102f\n"
"101:" // Height 3: Full writeback
@@ -1300,33 +1298,33 @@ void a64_ffhybrid_fp32_mla_6x16 (
"str q10, [x13, #0x20]\n"
"str q11, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"102:" // Height 3: Writeback done
"subs x14, x14, #0x10\n"
"bgt 70b\n"
"b 206f\n"
"103:" // Height 4
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"104:" // Height 4: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #2\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 105f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -1339,18 +1337,18 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cbz x15, 106f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -1358,111 +1356,111 @@ void a64_ffhybrid_fp32_mla_6x16 (
"106:" // Height 4: no bias
"tbz %x[flags], #0, 116f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x14, #0x10\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 115f\n"
"tbz x14, #3, 110f\n"
"ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
"tbz x14, #2, 108f\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
"tbz x14, #1, 107f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x14, #0, 114f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
"b 114f\n"
"107:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 114f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
"b 114f\n"
"108:" // Height 4: Partial accumulate: partial_2_8
"tbz x14, #1, 109f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x14, #0, 114f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
"b 114f\n"
"109:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 114f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
"b 114f\n"
"110:" // Height 4: Partial accumulate: partial_4_0
"tbz x14, #2, 112f\n"
"ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"tbz x14, #1, 111f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x14, #0, 114f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
"b 114f\n"
"111:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 114f\n"
"ldr s9, [x13, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
"b 114f\n"
"112:" // Height 4: Partial accumulate: partial_2_0
"tbz x14, #1, 113f\n"
"ldr d8, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x14, #0, 114f\n"
"ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
"b 114f\n"
"113:" // Height 4: Partial accumulate: partial_1_0
"ldr s8, [x13, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
"114:" // Height 4: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 117f\n"
@@ -1471,18 +1469,18 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n"
"ldr q11, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"b 117f\n"
"116:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -1505,8 +1503,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"118:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 119f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1541,11 +1539,11 @@ void a64_ffhybrid_fp32_mla_6x16 (
"fmla v8.4s, v6.4s, v0.s[0]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
"sub x27, x27, #0x4\n"
- "add x26, x26, #0x10\n"
+ "cmp x27, #0x8\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
"ldr q25, [x10, #0x0]\n"
- "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"add x25, x25, #0x10\n"
@@ -1731,16 +1729,16 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr s26, [x23], #0x4\n"
"ldr q25, [x12, #0x0]\n"
"ldr q24, [x11, #0x0]\n"
- "add x12, x12, #0x10\n"
- "add x11, x11, #0x10\n"
"fmla v8.4s, v25.4s, v29.s[0]\n"
"fmla v12.4s, v25.4s, v28.s[0]\n"
"fmla v16.4s, v25.4s, v27.s[0]\n"
"fmla v20.4s, v25.4s, v26.s[0]\n"
"ldr q25, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v9.4s, v24.4s, v29.s[0]\n"
"fmla v13.4s, v24.4s, v28.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v17.4s, v24.4s, v27.s[0]\n"
"fmla v21.4s, v24.4s, v26.s[0]\n"
"ldr q24, [x9, #0x0]\n"
@@ -1760,13 +1758,13 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 118b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 126f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v25.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v25.4s }, [x21]\n"
"ld1r { v24.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v25.4s\n"
"fmin v9.4s, v9.4s, v25.4s\n"
@@ -1806,95 +1804,95 @@ void a64_ffhybrid_fp32_mla_6x16 (
"tbz x14, #3, 130f\n"
"st1 { v8.4s }, [x13], #0x10\n"
"st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
"tbz x14, #2, 128f\n"
"st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
"tbz x14, #1, 127f\n"
"str d11, [x13], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
"tbz x14, #0, 134f\n"
"st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
"b 134f\n"
"127:" // Height 4: Partial direct writeback: partial_1_12
"tbz x14, #0, 134f\n"
"str s11, [x13, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
"b 134f\n"
"128:" // Height 4: Partial direct writeback: partial_2_8
"tbz x14, #1, 129f\n"
"str d10, [x13], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
"tbz x14, #0, 134f\n"
"st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
"b 134f\n"
"129:" // Height 4: Partial direct writeback: partial_1_8
"tbz x14, #0, 134f\n"
"str s10, [x13, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
"b 134f\n"
"130:" // Height 4: Partial direct writeback: partial_4_0
"tbz x14, #2, 132f\n"
"st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
"tbz x14, #1, 131f\n"
"str d9, [x13], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
"tbz x14, #0, 134f\n"
"st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
"b 134f\n"
"131:" // Height 4: Partial direct writeback: partial_1_4
"tbz x14, #0, 134f\n"
"str s9, [x13, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
"b 134f\n"
"132:" // Height 4: Partial direct writeback: partial_2_0
"tbz x14, #1, 133f\n"
"str d8, [x13], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x14, #0, 134f\n"
"st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
"b 134f\n"
"133:" // Height 4: Partial direct writeback: partial_1_0
"str s8, [x13, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
"134:" // Height 4: Partial direct writeback: Done
"b 136f\n"
"135:" // Height 4: Full writeback
@@ -1903,37 +1901,37 @@ void a64_ffhybrid_fp32_mla_6x16 (
"str q10, [x13, #0x20]\n"
"str q11, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
"136:" // Height 4: Writeback done
"subs x14, x14, #0x10\n"
"bgt 104b\n"
"b 206f\n"
"137:" // Height 5
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"138:" // Height 5: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #2\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 139f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -1946,18 +1944,18 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cbz x15, 140f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -1969,128 +1967,128 @@ void a64_ffhybrid_fp32_mla_6x16 (
"140:" // Height 5: no bias
"tbz %x[flags], #0, 150f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x14, #0x10\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 149f\n"
"tbz x14, #3, 144f\n"
"ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x14, #2, 142f\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
"tbz x14, #1, 141f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x14, #0, 148f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
"b 148f\n"
"141:" // Height 5: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 148f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
"b 148f\n"
"142:" // Height 5: Partial accumulate: partial_2_8
"tbz x14, #1, 143f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x14, #0, 148f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
"b 148f\n"
"143:" // Height 5: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 148f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
"b 148f\n"
"144:" // Height 5: Partial accumulate: partial_4_0
"tbz x14, #2, 146f\n"
"ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"tbz x14, #1, 145f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x14, #0, 148f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 148f\n"
"145:" // Height 5: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 148f\n"
"ldr s9, [x13, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"b 148f\n"
"146:" // Height 5: Partial accumulate: partial_2_0
"tbz x14, #1, 147f\n"
"ldr d8, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x14, #0, 148f\n"
"ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
"b 148f\n"
"147:" // Height 5: Partial accumulate: partial_1_0
"ldr s8, [x13, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
"148:" // Height 5: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 151f\n"
@@ -2099,22 +2097,22 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n"
"ldr q11, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
"b 151f\n"
"150:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
@@ -2141,8 +2139,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"152:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 153f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2181,10 +2179,10 @@ void a64_ffhybrid_fp32_mla_6x16 (
"fmla v8.4s, v6.4s, v0.s[0]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
"sub x27, x27, #0x4\n"
- "add x26, x26, #0x10\n"
+ "cmp x27, #0x8\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
"ldr q29, [x10, #0x0]\n"
@@ -2406,19 +2404,19 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr s31, [x23], #0x4\n"
"ldr s30, [x22], #0x4\n"
"ldr q29, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
- "ldr q28, [x11, #0x0]\n"
- "add x11, x11, #0x10\n"
"fmla v8.4s, v29.4s, v2.s[0]\n"
"fmla v12.4s, v29.4s, v1.s[0]\n"
+ "ldr q28, [x11, #0x0]\n"
"fmla v16.4s, v29.4s, v0.s[0]\n"
"fmla v20.4s, v29.4s, v31.s[0]\n"
+ "add x12, x12, #0x10\n"
"fmla v24.4s, v29.4s, v30.s[0]\n"
"ldr q29, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
"fmla v9.4s, v28.4s, v2.s[0]\n"
+ "add x11, x11, #0x10\n"
"fmla v13.4s, v28.4s, v1.s[0]\n"
"fmla v17.4s, v28.4s, v0.s[0]\n"
+ "add x10, x10, #0x10\n"
"fmla v21.4s, v28.4s, v31.s[0]\n"
"fmla v25.4s, v28.4s, v30.s[0]\n"
"ldr q28, [x9, #0x0]\n"
@@ -2440,14 +2438,14 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 152b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 160f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v29.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v29.4s }, [x21]\n"
"ld1r { v28.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v29.4s\n"
"fmin v9.4s, v9.4s, v29.4s\n"
@@ -2495,111 +2493,111 @@ void a64_ffhybrid_fp32_mla_6x16 (
"tbz x14, #3, 164f\n"
"st1 { v8.4s }, [x13], #0x10\n"
"st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
"tbz x14, #2, 162f\n"
"st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
"tbz x14, #1, 161f\n"
"str d11, [x13], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x14, #0, 168f\n"
"st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
"b 168f\n"
"161:" // Height 5: Partial direct writeback: partial_1_12
"tbz x14, #0, 168f\n"
"str s11, [x13, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
"b 168f\n"
"162:" // Height 5: Partial direct writeback: partial_2_8
"tbz x14, #1, 163f\n"
"str d10, [x13], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x14, #0, 168f\n"
"st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
"b 168f\n"
"163:" // Height 5: Partial direct writeback: partial_1_8
"tbz x14, #0, 168f\n"
"str s10, [x13, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
"b 168f\n"
"164:" // Height 5: Partial direct writeback: partial_4_0
"tbz x14, #2, 166f\n"
"st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x14, #1, 165f\n"
"str d9, [x13], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x14, #0, 168f\n"
"st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 168f\n"
"165:" // Height 5: Partial direct writeback: partial_1_4
"tbz x14, #0, 168f\n"
"str s9, [x13, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 168f\n"
"166:" // Height 5: Partial direct writeback: partial_2_0
"tbz x14, #1, 167f\n"
"str d8, [x13], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x14, #0, 168f\n"
"st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 168f\n"
"167:" // Height 5: Partial direct writeback: partial_1_0
"str s8, [x13, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"168:" // Height 5: Partial direct writeback: Done
"b 170f\n"
"169:" // Height 5: Full writeback
@@ -2608,45 +2606,44 @@ void a64_ffhybrid_fp32_mla_6x16 (
"str q10, [x13, #0x20]\n"
"str q11, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"170:" // Height 5: Writeback done
"subs x14, x14, #0x10\n"
"bgt 138b\n"
"b 206f\n"
"171:" // Height 6
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
- "mov x21, #0x18\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x18\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "madd x21, x20, x21, x13\n"
- "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
"172:" // Height 6: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0xc\n"
"add x11, x12, x20, LSL #2\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
"bgt 173f\n"
"cmp x14, #0x8\n"
"mov x9, x12\n"
@@ -2659,18 +2656,18 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cbz x15, 174f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -2686,145 +2683,145 @@ void a64_ffhybrid_fp32_mla_6x16 (
"174:" // Height 6: no bias
"tbz %x[flags], #0, 184f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x14, #0x10\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 183f\n"
"tbz x14, #3, 178f\n"
"ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x14, #2, 176f\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x14, #1, 175f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x14, #0, 182f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 182f\n"
"175:" // Height 6: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 182f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 182f\n"
"176:" // Height 6: Partial accumulate: partial_2_8
"tbz x14, #1, 177f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x14, #0, 182f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 182f\n"
"177:" // Height 6: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 182f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 182f\n"
"178:" // Height 6: Partial accumulate: partial_4_0
"tbz x14, #2, 180f\n"
"ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x14, #1, 179f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x14, #0, 182f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 182f\n"
"179:" // Height 6: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 182f\n"
"ldr s9, [x13, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 182f\n"
"180:" // Height 6: Partial accumulate: partial_2_0
"tbz x14, #1, 181f\n"
"ldr d8, [x13], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x14, #0, 182f\n"
"ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 182f\n"
"181:" // Height 6: Partial accumulate: partial_1_0
"ldr s8, [x13, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"182:" // Height 6: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 185f\n"
@@ -2833,26 +2830,26 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n"
"ldr q11, [x13, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x22, #0x0]\n"
- "ldr q29, [x22, #0x10]\n"
- "ldr q30, [x22, #0x20]\n"
- "ldr q31, [x22, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"b 185f\n"
"184:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
@@ -2883,8 +2880,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"186:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 187f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2927,10 +2924,10 @@ void a64_ffhybrid_fp32_mla_6x16 (
"fmla v8.4s, v6.4s, v0.s[0]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
"sub x27, x27, #0x4\n"
- "add x26, x26, #0x10\n"
+ "cmp x27, #0x8\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
"fmla v28.4s, v6.4s, v5.s[0]\n"
@@ -3189,12 +3186,12 @@ void a64_ffhybrid_fp32_mla_6x16 (
"ldr s2, [x21], #0x4\n"
"ldr q1, [x12, #0x0]\n"
"ldr q0, [x11, #0x0]\n"
- "add x12, x12, #0x10\n"
- "add x11, x11, #0x10\n"
"fmla v8.4s, v1.4s, v7.s[0]\n"
"fmla v12.4s, v1.4s, v6.s[0]\n"
"fmla v16.4s, v1.4s, v5.s[0]\n"
"fmla v20.4s, v1.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v24.4s, v1.4s, v3.s[0]\n"
"fmla v28.4s, v1.4s, v2.s[0]\n"
"ldr q1, [x10, #0x0]\n"
@@ -3226,15 +3223,15 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 186b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 194f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v1.4s\n"
"fmin v9.4s, v9.4s, v1.4s\n"
@@ -3290,127 +3287,127 @@ void a64_ffhybrid_fp32_mla_6x16 (
"tbz x14, #3, 198f\n"
"st1 { v8.4s }, [x13], #0x10\n"
"st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
- "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
"tbz x14, #2, 196f\n"
"st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v30.4s }, [x22], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
"tbz x14, #1, 195f\n"
"str d11, [x13], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x14, #0, 202f\n"
"st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x22]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
"b 202f\n"
"195:" // Height 6: Partial direct writeback: partial_1_12
"tbz x14, #0, 202f\n"
"str s11, [x13, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "str s31, [x22, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
"b 202f\n"
"196:" // Height 6: Partial direct writeback: partial_2_8
"tbz x14, #1, 197f\n"
"str d10, [x13], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x14, #0, 202f\n"
"st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x22]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
"b 202f\n"
"197:" // Height 6: Partial direct writeback: partial_1_8
"tbz x14, #0, 202f\n"
"str s10, [x13, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "str s30, [x22, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
"b 202f\n"
"198:" // Height 6: Partial direct writeback: partial_4_0
"tbz x14, #2, 200f\n"
"st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
"tbz x14, #1, 199f\n"
"str d9, [x13], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x14, #0, 202f\n"
"st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x22]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
"b 202f\n"
"199:" // Height 6: Partial direct writeback: partial_1_4
"tbz x14, #0, 202f\n"
"str s9, [x13, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s29, [x22, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
"b 202f\n"
"200:" // Height 6: Partial direct writeback: partial_2_0
"tbz x14, #1, 201f\n"
"str d8, [x13], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x14, #0, 202f\n"
"st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x22]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
"b 202f\n"
"201:" // Height 6: Partial direct writeback: partial_1_0
"str s8, [x13, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "str s28, [x22, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
"202:" // Height 6: Partial direct writeback: Done
"b 204f\n"
"203:" // Height 6: Full writeback
@@ -3419,26 +3416,26 @@ void a64_ffhybrid_fp32_mla_6x16 (
"str q10, [x13, #0x20]\n"
"str q11, [x13, #0x30]\n"
"add x13, x13, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x22, #0x0]\n"
- "str q29, [x22, #0x10]\n"
- "str q30, [x22, #0x20]\n"
- "str q31, [x22, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
"204:" // Height 6: Writeback done
"subs x14, x14, #0x10\n"
"bgt 172b\n"
@@ -3454,8 +3451,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
index 73c096ca00..ac3cbf943f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -82,14 +82,16 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 24, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 24, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
if (std::is_same<T, float>::value) {
switch (ci->get_cpu_model()) {
+ case CPUModel::V1:
+ return { 23.64 };
default:
- return { 28.48 };
+ return { 16.89 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
index 1f7804453c..88547ef3b3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
@@ -50,19 +50,18 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -83,7 +82,6 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
ka.B_stride = B_stride;
switch(act.type) {
default:
@@ -105,14 +103,13 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"bgt 89f\n"
"beq 45f\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0x14\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
@@ -120,6 +117,7 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"add x27, x28, x20, LSL #1\n"
"add x20, x27, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x14\n"
"bgt 3f\n"
"cmp x14, #0x10\n"
"mov x27, x12\n"
@@ -138,19 +136,19 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"cbz x15, 4f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x15, #0x40]\n"
- "ldr q13, [x15, #0x50]\n"
- "add x15, x15, #0x60\n"
"zip2 v14.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"zip2 v15.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x15, #0x40]\n"
+ "ldr q13, [x15, #0x50]\n"
"zip2 v16.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
"zip2 v17.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x15, x15, #0x60\n"
"zip2 v18.2d, v12.2d, v12.2d\n"
"zip1 v12.2d, v12.2d, v12.2d\n"
"zip2 v19.2d, v13.2d, v13.2d\n"
@@ -284,8 +282,8 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"mov x26, #0x0\n"
"21:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 22f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -308,32 +306,28 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"blt 25f\n"
"24:" // Height 1: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x25, x25, #0x4\n"
- "add x12, x12, #0x20\n"
- "cmp x25, #0x8\n"
- "add x11, x11, #0x20\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
"ldr q24, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
"ldr q23, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
"ldr q22, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
"ldr q21, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
"ldr q24, [x28, #0x0]\n"
".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
"ldr q23, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
"ldr q22, [x27, #0x0]\n"
".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
"ldr q21, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x8\n"
".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ "add x12, x12, #0x20\n"
"ldr q4, [x12, #0x0]\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
"ldr q5, [x12, #0x10]\n"
".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
@@ -341,36 +335,40 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"ld1 { v0.4s }, [x24], #0x10\n"
"ldr q7, [x11, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
"bge 24b\n"
"25:" // Height 1: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x25, x25, #0x4\n"
- "add x12, x12, #0x20\n"
- "add x11, x11, #0x20\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q23, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q22, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
+ "ldr q25, [x10, #0x10]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
"ldr q21, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
"ldr q24, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
- ".inst 0x6e57ec0a // bfmmla v10.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec0a // bfmmla v10.4s, v0.8h, v22.8h\n"
"ldr q23, [x28, #0x0]\n"
- ".inst 0x6e56ec10 // bfmmla v16.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e59ec10 // bfmmla v16.4s, v0.8h, v25.8h\n"
"ldr q22, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
"ldr q21, [x27, #0x0]\n"
".inst 0x6e58ec11 // bfmmla v17.4s, v0.8h, v24.8h\n"
"ldr q3, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
+ "sub x25, x25, #0x4\n"
".inst 0x6e57ec0c // bfmmla v12.4s, v0.8h, v23.8h\n"
".inst 0x6e56ec12 // bfmmla v18.4s, v0.8h, v22.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e55ec0d // bfmmla v13.4s, v0.8h, v21.8h\n"
".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
"26:" // Height 1: Multiply loop: Main loop skip
"cbz x25, 29f\n"
"cbz x25, 29f\n"
@@ -382,37 +380,37 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"27:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr s0, [x24, #0x0]\n"
"28:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q23, [x12, #0x0]\n"
- "ldr q29, [x12, #0x10]\n"
+ "ldr q21, [x12, #0x0]\n"
+ "ldr q30, [x12, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "add x12, x12, #0x20\n"
- "ldr q22, [x11, #0x0]\n"
- "ldr q21, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
- ".inst 0x6e57ec08 // bfmmla v8.4s, v0.8h, v23.8h\n"
- "ldr q24, [x10, #0x0]\n"
- ".inst 0x6e5dec0e // bfmmla v14.4s, v0.8h, v29.8h\n"
+ ".inst 0x6e55ec08 // bfmmla v8.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x11, #0x0]\n"
+ "ldr q22, [x11, #0x10]\n"
+ ".inst 0x6e5eec0e // bfmmla v14.4s, v0.8h, v30.8h\n"
+ ".inst 0x6e55ec09 // bfmmla v9.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x10, #0x0]\n"
"ldr q23, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
- ".inst 0x6e56ec09 // bfmmla v9.4s, v0.8h, v22.8h\n"
- "ldr q22, [x9, #0x0]\n"
- ".inst 0x6e55ec0f // bfmmla v15.4s, v0.8h, v21.8h\n"
- "ldr q21, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
- ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
- "ldr q24, [x28, #0x0]\n"
+ ".inst 0x6e56ec0f // bfmmla v15.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0a // bfmmla v10.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x9, #0x0]\n"
+ "ldr q22, [x9, #0x10]\n"
".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x0]\n"
"ldr q23, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e56ec11 // bfmmla v17.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0c // bfmmla v12.4s, v0.8h, v21.8h\n"
"ldr q22, [x27, #0x0]\n"
- ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
"ldr q21, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
"29:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -425,9 +423,9 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"uzp1 v12.2d, v12.2d, v18.2d\n"
"uzp1 v13.2d, v13.2d, v19.2d\n"
"tbz %x[flags], #1, 30f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v22.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v22.4s }, [x21]\n"
"ld1r { v21.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v22.4s\n"
"fmin v9.4s, v9.4s, v22.4s\n"
@@ -531,14 +529,13 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"b 178f\n"
"45:" // Height 2
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"46:" // Height 2: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0x14\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
@@ -546,6 +543,7 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"add x27, x28, x20, LSL #1\n"
"add x20, x27, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x14\n"
"bgt 47f\n"
"cmp x14, #0x10\n"
"mov x27, x12\n"
@@ -564,19 +562,19 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"cbz x15, 48f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x15, #0x40]\n"
- "ldr q13, [x15, #0x50]\n"
- "add x15, x15, #0x60\n"
"zip2 v14.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"zip2 v15.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x15, #0x40]\n"
+ "ldr q13, [x15, #0x50]\n"
"zip2 v16.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
"zip2 v17.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x15, x15, #0x60\n"
"zip2 v18.2d, v12.2d, v12.2d\n"
"zip1 v12.2d, v12.2d, v12.2d\n"
"zip2 v19.2d, v13.2d, v13.2d\n"
@@ -586,117 +584,117 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"tbz %x[flags], #0, 63f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x14, #0x18\n"
- "add x24, x13, x20, LSL #2\n"
+ "add x23, x13, x20, LSL #2\n"
"bge 61f\n"
"tbz x14, #4, 52f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v12.4s }, [x13], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
"tbz x14, #2, 50f\n"
"ld1 { v13.4s }, [x13], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
"tbz x14, #1, 49f\n"
"ldr d20, [x13], #0x8\n"
- "ldr d19, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"mov x20, #0x58\n"
"tbz x14, #0, 60f\n"
"ld1 { v20.s }[2], [x13]\n"
- "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
"b 60f\n"
"49:" // Height 2: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x14, #0, 60f\n"
"ldr s20, [x13, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
"b 60f\n"
"50:" // Height 2: Partial accumulate: partial_2_16
"tbz x14, #1, 51f\n"
"ldr d13, [x13], #0x8\n"
- "ldr d18, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"mov x20, #0x48\n"
"tbz x14, #0, 60f\n"
"ld1 { v13.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
"b 60f\n"
"51:" // Height 2: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x14, #0, 60f\n"
"ldr s13, [x13, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
"b 60f\n"
"52:" // Height 2: Partial accumulate: partial_8_0
"tbz x14, #3, 56f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
"tbz x14, #2, 54f\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
"tbz x14, #1, 53f\n"
"ldr d12, [x13], #0x8\n"
- "ldr d17, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
"mov x20, #0x38\n"
"tbz x14, #0, 60f\n"
"ld1 { v12.s }[2], [x13]\n"
- "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
"b 60f\n"
"53:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 60f\n"
"ldr s12, [x13, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
"b 60f\n"
"54:" // Height 2: Partial accumulate: partial_2_8
"tbz x14, #1, 55f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d16, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"mov x20, #0x28\n"
"tbz x14, #0, 60f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
"b 60f\n"
"55:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 60f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
"b 60f\n"
"56:" // Height 2: Partial accumulate: partial_4_0
"tbz x14, #2, 58f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"tbz x14, #1, 57f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
"mov x20, #0x18\n"
"tbz x14, #0, 60f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x23]\n"
"b 60f\n"
"57:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 60f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
"b 60f\n"
"58:" // Height 2: Partial accumulate: partial_2_0
"tbz x14, #1, 59f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
"mov x20, #0x8\n"
"tbz x14, #0, 60f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x23]\n"
"b 60f\n"
"59:" // Height 2: Partial accumulate: partial_1_0
"ldr s9, [x13, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
"mov x20, #0x0\n"
"60:" // Height 2: Partial accumulate: Done
"sub x13, x13, x20\n"
@@ -708,12 +706,12 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"ldr q12, [x13, #0x30]\n"
"ldr q13, [x13, #0x40]\n"
"ldr q20, [x13, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
"62:" // Height 2: MMLA fixup
"zip1 v8.2d, v9.2d, v14.2d\n"
"zip2 v14.2d, v9.2d, v14.2d\n"
@@ -745,8 +743,8 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"mov x26, #0x0\n"
"65:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 66f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -773,72 +771,72 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"blt 69f\n"
"68:" // Height 2: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x25, x25, #0x4\n"
- "add x12, x12, #0x20\n"
- "cmp x25, #0x8\n"
- "add x11, x11, #0x20\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
"ld1 { v1.4s }, [x23], #0x10\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q29, [x10, #0x0]\n"
+ "ldr q30, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
"ldr q23, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
"ldr q22, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
"ldr q21, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
- ".inst 0x6e5dec0a // bfmmla v10.4s, v0.8h, v29.8h\n"
- "ldr q30, [x28, #0x0]\n"
+ ".inst 0x6e5eec0a // bfmmla v10.4s, v0.8h, v30.8h\n"
+ "ldr q2, [x28, #0x0]\n"
".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
"ldr q23, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
"ldr q22, [x27, #0x0]\n"
".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
"ldr q21, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- ".inst 0x6e5eec0c // bfmmla v12.4s, v0.8h, v30.8h\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x8\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e42ec0c // bfmmla v12.4s, v0.8h, v2.8h\n"
"ldr q4, [x12, #0x0]\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
"ldr q5, [x12, #0x10]\n"
".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
"ldr q6, [x11, #0x0]\n"
".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"ld1 { v0.4s }, [x24], #0x10\n"
+ "add x10, x10, #0x20\n"
"ldr q7, [x11, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
"bge 68b\n"
"69:" // Height 2: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x25, x25, #0x4\n"
- "add x12, x12, #0x20\n"
- "add x11, x11, #0x20\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
"ldr q24, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
"ldr q23, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
"ldr q22, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
"ldr q21, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
"ldr q24, [x28, #0x0]\n"
".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
"ldr q23, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
"ldr q22, [x27, #0x0]\n"
".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
"ldr q21, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
+ "sub x25, x25, #0x4\n"
".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
"70:" // Height 2: Multiply loop: Main loop skip
"cbz x25, 73f\n"
"cbz x25, 73f\n"
@@ -856,35 +854,35 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"ldr q24, [x12, #0x0]\n"
"ldr q23, [x12, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "add x12, x12, #0x20\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
"ldr q22, [x11, #0x0]\n"
"ldr q21, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
- ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x6e58ec08 // bfmmla v8.4s, v0.8h, v24.8h\n"
- "ldr q24, [x10, #0x0]\n"
".inst 0x6e57ec0e // bfmmla v14.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x10, #0x0]\n"
"ldr q23, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
".inst 0x6e56ec09 // bfmmla v9.4s, v0.8h, v22.8h\n"
- "ldr q22, [x9, #0x0]\n"
".inst 0x6e55ec0f // bfmmla v15.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x9, #0x0]\n"
"ldr q21, [x9, #0x10]\n"
- "add x9, x9, #0x20\n"
".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
- "ldr q24, [x28, #0x0]\n"
".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x28, #0x0]\n"
"ldr q23, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
- "ldr q22, [x27, #0x0]\n"
".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x27, #0x0]\n"
"ldr q21, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
"73:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -893,21 +891,21 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v4.2d, v8.2d, v14.2d\n"
"uzp2 v8.2d, v8.2d, v14.2d\n"
+ "add x23, x13, x20, LSL #2\n"
"uzp1 v14.2d, v9.2d, v15.2d\n"
"uzp2 v9.2d, v9.2d, v15.2d\n"
"uzp1 v15.2d, v10.2d, v16.2d\n"
"uzp2 v10.2d, v10.2d, v16.2d\n"
"uzp1 v16.2d, v11.2d, v17.2d\n"
"uzp2 v11.2d, v11.2d, v17.2d\n"
- "add x24, x13, x20, LSL #2\n"
"uzp1 v17.2d, v12.2d, v18.2d\n"
"uzp2 v12.2d, v12.2d, v18.2d\n"
"uzp1 v18.2d, v13.2d, v19.2d\n"
"uzp2 v13.2d, v13.2d, v19.2d\n"
"tbz %x[flags], #1, 74f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v22.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v22.4s }, [x21]\n"
"ld1r { v21.4s }, [x20]\n"
"fmin v4.4s, v4.4s, v22.4s\n"
"fmin v14.4s, v14.4s, v22.4s\n"
@@ -941,99 +939,99 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"st1 { v14.4s }, [x13], #0x10\n"
"st1 { v15.4s }, [x13], #0x10\n"
"st1 { v16.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v11.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
"tbz x14, #2, 76f\n"
"st1 { v17.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
"tbz x14, #1, 75f\n"
"str d18, [x13], #0x8\n"
- "str d13, [x24], #0x8\n"
+ "str d13, [x23], #0x8\n"
"tbz x14, #0, 86f\n"
"st1 { v18.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x24]\n"
+ "st1 { v13.s }[2], [x23]\n"
"b 86f\n"
"75:" // Height 2: Partial direct writeback: partial_1_20
"tbz x14, #0, 86f\n"
"str s18, [x13, #0x0]\n"
- "str s13, [x24, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
"b 86f\n"
"76:" // Height 2: Partial direct writeback: partial_2_16
"tbz x14, #1, 77f\n"
"str d17, [x13], #0x8\n"
- "str d12, [x24], #0x8\n"
+ "str d12, [x23], #0x8\n"
"tbz x14, #0, 86f\n"
"st1 { v17.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x24]\n"
+ "st1 { v12.s }[2], [x23]\n"
"b 86f\n"
"77:" // Height 2: Partial direct writeback: partial_1_16
"tbz x14, #0, 86f\n"
"str s17, [x13, #0x0]\n"
- "str s12, [x24, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
"b 86f\n"
"78:" // Height 2: Partial direct writeback: partial_8_0
"tbz x14, #3, 82f\n"
"st1 { v4.4s }, [x13], #0x10\n"
"st1 { v14.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
"tbz x14, #2, 80f\n"
"st1 { v15.4s }, [x13], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
"tbz x14, #1, 79f\n"
"str d16, [x13], #0x8\n"
- "str d11, [x24], #0x8\n"
+ "str d11, [x23], #0x8\n"
"tbz x14, #0, 86f\n"
"st1 { v16.s }[2], [x13]\n"
- "st1 { v11.s }[2], [x24]\n"
+ "st1 { v11.s }[2], [x23]\n"
"b 86f\n"
"79:" // Height 2: Partial direct writeback: partial_1_12
"tbz x14, #0, 86f\n"
"str s16, [x13, #0x0]\n"
- "str s11, [x24, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
"b 86f\n"
"80:" // Height 2: Partial direct writeback: partial_2_8
"tbz x14, #1, 81f\n"
"str d15, [x13], #0x8\n"
- "str d10, [x24], #0x8\n"
+ "str d10, [x23], #0x8\n"
"tbz x14, #0, 86f\n"
"st1 { v15.s }[2], [x13]\n"
- "st1 { v10.s }[2], [x24]\n"
+ "st1 { v10.s }[2], [x23]\n"
"b 86f\n"
"81:" // Height 2: Partial direct writeback: partial_1_8
"tbz x14, #0, 86f\n"
"str s15, [x13, #0x0]\n"
- "str s10, [x24, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
"b 86f\n"
"82:" // Height 2: Partial direct writeback: partial_4_0
"tbz x14, #2, 84f\n"
"st1 { v4.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
"tbz x14, #1, 83f\n"
"str d14, [x13], #0x8\n"
- "str d9, [x24], #0x8\n"
+ "str d9, [x23], #0x8\n"
"tbz x14, #0, 86f\n"
"st1 { v14.s }[2], [x13]\n"
- "st1 { v9.s }[2], [x24]\n"
+ "st1 { v9.s }[2], [x23]\n"
"b 86f\n"
"83:" // Height 2: Partial direct writeback: partial_1_4
"tbz x14, #0, 86f\n"
"str s14, [x13, #0x0]\n"
- "str s9, [x24, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
"b 86f\n"
"84:" // Height 2: Partial direct writeback: partial_2_0
"tbz x14, #1, 85f\n"
"str d4, [x13], #0x8\n"
- "str d8, [x24], #0x8\n"
+ "str d8, [x23], #0x8\n"
"tbz x14, #0, 86f\n"
"st1 { v4.s }[2], [x13]\n"
- "st1 { v8.s }[2], [x24]\n"
+ "st1 { v8.s }[2], [x23]\n"
"b 86f\n"
"85:" // Height 2: Partial direct writeback: partial_1_0
"str s4, [x13, #0x0]\n"
- "str s8, [x24, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
"86:" // Height 2: Partial direct writeback: Done
"b 88f\n"
"87:" // Height 2: Full writeback
@@ -1044,26 +1042,25 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"str q17, [x13, #0x40]\n"
"str q18, [x13, #0x50]\n"
"add x13, x13, #0x60\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "str q12, [x24, #0x40]\n"
- "str q13, [x24, #0x50]\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
"88:" // Height 2: Writeback done
"subs x14, x14, #0x18\n"
"bgt 46b\n"
"b 178f\n"
"89:" // Height 3
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"90:" // Height 3: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0x14\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
@@ -1071,6 +1068,7 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"add x27, x28, x20, LSL #1\n"
"add x20, x27, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x14\n"
"bgt 91f\n"
"cmp x14, #0x10\n"
"mov x27, x12\n"
@@ -1089,19 +1087,19 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"cbz x15, 92f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x15, #0x40]\n"
- "ldr q13, [x15, #0x50]\n"
- "add x15, x15, #0x60\n"
"zip2 v14.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"zip2 v15.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x15, #0x40]\n"
+ "ldr q13, [x15, #0x50]\n"
"zip2 v16.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
"zip2 v17.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x15, x15, #0x60\n"
"zip2 v18.2d, v12.2d, v12.2d\n"
"zip1 v12.2d, v12.2d, v12.2d\n"
"zip2 v19.2d, v13.2d, v13.2d\n"
@@ -1122,147 +1120,147 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"92:" // Height 3: no bias
"tbz %x[flags], #0, 107f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
"cmp x14, #0x18\n"
- "add x24, x13, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 105f\n"
"tbz x14, #4, 96f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
"ld1 { v12.4s }, [x13], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"tbz x14, #2, 94f\n"
"ld1 { v13.4s }, [x13], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x14, #1, 93f\n"
"ldr d20, [x13], #0x8\n"
- "ldr d19, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"mov x20, #0x58\n"
- "ldr d4, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
"tbz x14, #0, 104f\n"
"ld1 { v20.s }[2], [x13]\n"
- "ld1 { v19.s }[2], [x24]\n"
- "ld1 { v4.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
"b 104f\n"
"93:" // Height 3: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x14, #0, 104f\n"
"ldr s20, [x13, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
- "ldr s4, [x23, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
"b 104f\n"
"94:" // Height 3: Partial accumulate: partial_2_16
"tbz x14, #1, 95f\n"
"ldr d13, [x13], #0x8\n"
- "ldr d18, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"mov x20, #0x48\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x14, #0, 104f\n"
"ld1 { v13.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 104f\n"
"95:" // Height 3: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x14, #0, 104f\n"
"ldr s13, [x13, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"b 104f\n"
"96:" // Height 3: Partial accumulate: partial_8_0
"tbz x14, #3, 100f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
"tbz x14, #2, 98f\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
"tbz x14, #1, 97f\n"
"ldr d12, [x13], #0x8\n"
- "ldr d17, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x14, #0, 104f\n"
"ld1 { v12.s }[2], [x13]\n"
- "ld1 { v17.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
"b 104f\n"
"97:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 104f\n"
"ldr s12, [x13, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
"b 104f\n"
"98:" // Height 3: Partial accumulate: partial_2_8
"tbz x14, #1, 99f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d16, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"mov x20, #0x28\n"
- "ldr d23, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
"tbz x14, #0, 104f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v16.s }[2], [x24]\n"
- "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
"b 104f\n"
"99:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 104f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
- "ldr s23, [x23, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
"b 104f\n"
"100:" // Height 3: Partial accumulate: partial_4_0
"tbz x14, #2, 102f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
"tbz x14, #1, 101f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
"mov x20, #0x18\n"
- "ldr d22, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
"tbz x14, #0, 104f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
"b 104f\n"
"101:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 104f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
"b 104f\n"
"102:" // Height 3: Partial accumulate: partial_2_0
"tbz x14, #1, 103f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
"mov x20, #0x8\n"
- "ldr d21, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
"tbz x14, #0, 104f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
"b 104f\n"
"103:" // Height 3: Partial accumulate: partial_1_0
"ldr s9, [x13, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s21, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
"104:" // Height 3: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 106f\n"
@@ -1273,18 +1271,18 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"ldr q12, [x13, #0x30]\n"
"ldr q13, [x13, #0x40]\n"
"ldr q20, [x13, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
- "ldr q21, [x23, #0x0]\n"
- "ldr q22, [x23, #0x10]\n"
- "ldr q23, [x23, #0x20]\n"
- "ldr q24, [x23, #0x30]\n"
- "ldr q25, [x23, #0x40]\n"
- "ldr q4, [x23, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q21, [x22, #0x0]\n"
+ "ldr q22, [x22, #0x10]\n"
+ "ldr q23, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q25, [x22, #0x40]\n"
+ "ldr q4, [x22, #0x50]\n"
"106:" // Height 3: MMLA fixup
"zip1 v8.2d, v9.2d, v14.2d\n"
"zip2 v14.2d, v9.2d, v14.2d\n"
@@ -1340,8 +1338,8 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"mov x26, #0x0\n"
"109:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 110f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1372,42 +1370,42 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"blt 113f\n"
"112:" // Height 3: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "sub x25, x25, #0x4\n"
- "add x12, x12, #0x20\n"
- "cmp x25, #0x8\n"
- "add x11, x11, #0x20\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
"ld1 { v1.4s }, [x23], #0x10\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"ldr q4, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
"ldr q5, [x10, #0x10]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
"ldr q6, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
"ldr q3, [x9, #0x10]\n"
- "add x10, x10, #0x20\n"
- "add x9, x9, #0x20\n"
".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "add x12, x12, #0x20\n"
".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
"ldr q4, [x28, #0x0]\n"
".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
"ldr q5, [x28, #0x10]\n"
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "add x10, x10, #0x20\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
"ldr q6, [x27, #0x0]\n"
- "add x28, x28, #0x20\n"
".inst 0x6e43ec11 // bfmmla v17.4s, v0.8h, v3.8h\n"
+ "add x9, x9, #0x20\n"
".inst 0x6e43ec5d // bfmmla v29.4s, v2.8h, v3.8h\n"
"ldr q3, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
+ "add x28, x28, #0x20\n"
".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ "add x27, x27, #0x20\n"
".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
"ldr q4, [x12, #0x0]\n"
".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
@@ -1424,35 +1422,35 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"bge 112b\n"
"113:" // Height 3: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "sub x25, x25, #0x4\n"
- "add x12, x12, #0x20\n"
- "add x11, x11, #0x20\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"ldr q3, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
"ldr q4, [x10, #0x10]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
"ldr q6, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
"ldr q1, [x9, #0x10]\n"
- "add x10, x10, #0x20\n"
- "add x9, x9, #0x20\n"
".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ "add x10, x10, #0x20\n"
".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
"ldr q5, [x28, #0x0]\n"
".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ "add x9, x9, #0x20\n"
".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
"ldr q4, [x28, #0x10]\n"
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "add x28, x28, #0x20\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
"ldr q3, [x27, #0x0]\n"
- "add x28, x28, #0x20\n"
".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
"ldr q1, [x27, #0x10]\n"
@@ -1485,41 +1483,41 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"ldr q5, [x12, #0x0]\n"
"ldr q4, [x12, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "ldr q3, [x11, #0x0]\n"
- "ldr q6, [x11, #0x10]\n"
- "add x12, x12, #0x20\n"
- "add x11, x11, #0x20\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
- ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n"
- ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ "ldr q3, [x11, #0x0]\n"
+ "ldr q1, [x11, #0x10]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
"ldr q5, [x10, #0x0]\n"
".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
"ldr q4, [x10, #0x10]\n"
".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n"
"ldr q3, [x9, #0x0]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- "ldr q1, [x9, #0x10]\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
"add x10, x10, #0x20\n"
- "add x9, x9, #0x20\n"
+ ".inst 0x6e41ec5b // bfmmla v27.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x9, #0x10]\n"
".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
+ "add x9, x9, #0x20\n"
".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
"ldr q5, [x28, #0x0]\n"
".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
"ldr q4, [x28, #0x10]\n"
".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ "add x28, x28, #0x20\n"
".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
"ldr q3, [x27, #0x0]\n"
- "add x28, x28, #0x20\n"
".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
"ldr q1, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ "add x27, x27, #0x20\n"
".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
@@ -1533,16 +1531,16 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"cmp x26, x20\n"
"bne 109b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
"uzp1 v4.2d, v8.2d, v14.2d\n"
"uzp2 v8.2d, v8.2d, v14.2d\n"
"uzp1 v14.2d, v9.2d, v15.2d\n"
"uzp2 v9.2d, v9.2d, v15.2d\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp1 v15.2d, v10.2d, v16.2d\n"
"uzp2 v10.2d, v10.2d, v16.2d\n"
- "add x24, x13, x20, LSL #2\n"
"uzp1 v16.2d, v11.2d, v17.2d\n"
"uzp2 v11.2d, v11.2d, v17.2d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v17.2d, v12.2d, v18.2d\n"
"uzp2 v12.2d, v12.2d, v18.2d\n"
"uzp1 v18.2d, v13.2d, v19.2d\n"
@@ -1554,9 +1552,9 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"uzp1 v24.2d, v24.2d, v30.2d\n"
"uzp1 v25.2d, v25.2d, v31.2d\n"
"tbz %x[flags], #1, 118f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v4.4s, v4.4s, v1.4s\n"
"fmin v14.4s, v14.4s, v1.4s\n"
@@ -1602,126 +1600,126 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"st1 { v14.4s }, [x13], #0x10\n"
"st1 { v15.4s }, [x13], #0x10\n"
"st1 { v16.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v11.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
"tbz x14, #2, 120f\n"
"st1 { v17.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x14, #1, 119f\n"
"str d18, [x13], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x14, #0, 130f\n"
"st1 { v18.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 130f\n"
"119:" // Height 3: Partial direct writeback: partial_1_20
"tbz x14, #0, 130f\n"
"str s18, [x13, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 130f\n"
"120:" // Height 3: Partial direct writeback: partial_2_16
"tbz x14, #1, 121f\n"
"str d17, [x13], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x14, #0, 130f\n"
"st1 { v17.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 130f\n"
"121:" // Height 3: Partial direct writeback: partial_1_16
"tbz x14, #0, 130f\n"
"str s17, [x13, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"b 130f\n"
"122:" // Height 3: Partial direct writeback: partial_8_0
"tbz x14, #3, 126f\n"
"st1 { v4.4s }, [x13], #0x10\n"
"st1 { v14.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
"tbz x14, #2, 124f\n"
"st1 { v15.4s }, [x13], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
"tbz x14, #1, 123f\n"
"str d16, [x13], #0x8\n"
- "str d11, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
"tbz x14, #0, 130f\n"
"st1 { v16.s }[2], [x13]\n"
- "st1 { v11.s }[2], [x24]\n"
- "st1 { v23.s }[2], [x23]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
"b 130f\n"
"123:" // Height 3: Partial direct writeback: partial_1_12
"tbz x14, #0, 130f\n"
"str s16, [x13, #0x0]\n"
- "str s11, [x24, #0x0]\n"
- "str s23, [x23, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
"b 130f\n"
"124:" // Height 3: Partial direct writeback: partial_2_8
"tbz x14, #1, 125f\n"
"str d15, [x13], #0x8\n"
- "str d10, [x24], #0x8\n"
- "str d22, [x23], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
"tbz x14, #0, 130f\n"
"st1 { v15.s }[2], [x13]\n"
- "st1 { v10.s }[2], [x24]\n"
- "st1 { v22.s }[2], [x23]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
"b 130f\n"
"125:" // Height 3: Partial direct writeback: partial_1_8
"tbz x14, #0, 130f\n"
"str s15, [x13, #0x0]\n"
- "str s10, [x24, #0x0]\n"
- "str s22, [x23, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
"b 130f\n"
"126:" // Height 3: Partial direct writeback: partial_4_0
"tbz x14, #2, 128f\n"
"st1 { v4.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
"tbz x14, #1, 127f\n"
"str d14, [x13], #0x8\n"
- "str d9, [x24], #0x8\n"
- "str d21, [x23], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
"tbz x14, #0, 130f\n"
"st1 { v14.s }[2], [x13]\n"
- "st1 { v9.s }[2], [x24]\n"
- "st1 { v21.s }[2], [x23]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
"b 130f\n"
"127:" // Height 3: Partial direct writeback: partial_1_4
"tbz x14, #0, 130f\n"
"str s14, [x13, #0x0]\n"
- "str s9, [x24, #0x0]\n"
- "str s21, [x23, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
"b 130f\n"
"128:" // Height 3: Partial direct writeback: partial_2_0
"tbz x14, #1, 129f\n"
"str d4, [x13], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d20, [x23], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
"tbz x14, #0, 130f\n"
"st1 { v4.s }[2], [x13]\n"
- "st1 { v8.s }[2], [x24]\n"
- "st1 { v20.s }[2], [x23]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
"b 130f\n"
"129:" // Height 3: Partial direct writeback: partial_1_0
"str s4, [x13, #0x0]\n"
- "str s8, [x24, #0x0]\n"
- "str s20, [x23, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
"130:" // Height 3: Partial direct writeback: Done
"b 132f\n"
"131:" // Height 3: Full writeback
@@ -1732,36 +1730,34 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"str q17, [x13, #0x40]\n"
"str q18, [x13, #0x50]\n"
"add x13, x13, #0x60\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "str q12, [x24, #0x40]\n"
- "str q13, [x24, #0x50]\n"
- "str q20, [x23, #0x0]\n"
- "str q21, [x23, #0x10]\n"
- "str q22, [x23, #0x20]\n"
- "str q23, [x23, #0x30]\n"
- "str q24, [x23, #0x40]\n"
- "str q25, [x23, #0x50]\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
"132:" // Height 3: Writeback done
"subs x14, x14, #0x18\n"
"bgt 90b\n"
"b 178f\n"
"133:" // Height 4
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
- "mov x21, #0x10\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x10\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "madd x21, x20, x21, x13\n"
- "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
"134:" // Height 4: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x14, #0x14\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
@@ -1769,6 +1765,7 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"add x27, x28, x20, LSL #1\n"
"add x20, x27, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x14\n"
"bgt 135f\n"
"cmp x14, #0x10\n"
"mov x27, x12\n"
@@ -1787,19 +1784,19 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"cbz x15, 136f\n"
"ldr q8, [x15, #0x0]\n"
"ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x15, #0x40]\n"
- "ldr q13, [x15, #0x50]\n"
- "add x15, x15, #0x60\n"
"zip2 v14.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
"zip2 v15.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x15, #0x40]\n"
+ "ldr q13, [x15, #0x50]\n"
"zip2 v16.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
"zip2 v17.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x15, x15, #0x60\n"
"zip2 v18.2d, v12.2d, v12.2d\n"
"zip1 v12.2d, v12.2d, v12.2d\n"
"zip2 v19.2d, v13.2d, v13.2d\n"
@@ -1820,175 +1817,175 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"136:" // Height 4: no bias
"tbz %x[flags], #0, 151f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x14, #0x18\n"
- "add x24, x13, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x13, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x14, #0x18\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 149f\n"
"tbz x14, #4, 140f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
- "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v12.4s }, [x13], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x14, #2, 138f\n"
"ld1 { v13.4s }, [x13], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x14, #1, 137f\n"
"ldr d20, [x13], #0x8\n"
- "ldr d19, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"mov x20, #0x58\n"
- "ldr d4, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x14, #0, 148f\n"
"ld1 { v20.s }[2], [x13]\n"
- "ld1 { v19.s }[2], [x24]\n"
- "ld1 { v4.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 148f\n"
"137:" // Height 4: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x14, #0, 148f\n"
"ldr s20, [x13, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
- "ldr s4, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 148f\n"
"138:" // Height 4: Partial accumulate: partial_2_16
"tbz x14, #1, 139f\n"
"ldr d13, [x13], #0x8\n"
- "ldr d18, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"mov x20, #0x48\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x14, #0, 148f\n"
"ld1 { v13.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 148f\n"
"139:" // Height 4: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x14, #0, 148f\n"
"ldr s13, [x13, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 148f\n"
"140:" // Height 4: Partial accumulate: partial_8_0
"tbz x14, #3, 144f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
- "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
"tbz x14, #2, 142f\n"
"ld1 { v11.4s }, [x13], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x14, #1, 141f\n"
"ldr d12, [x13], #0x8\n"
- "ldr d17, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x14, #0, 148f\n"
"ld1 { v12.s }[2], [x13]\n"
- "ld1 { v17.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 148f\n"
"141:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x14, #0, 148f\n"
"ldr s12, [x13, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 148f\n"
"142:" // Height 4: Partial accumulate: partial_2_8
"tbz x14, #1, 143f\n"
"ldr d11, [x13], #0x8\n"
- "ldr d16, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"mov x20, #0x28\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x14, #0, 148f\n"
"ld1 { v11.s }[2], [x13]\n"
- "ld1 { v16.s }[2], [x24]\n"
- "ld1 { v23.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 148f\n"
"143:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x14, #0, 148f\n"
"ldr s11, [x13, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
- "ldr s23, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"b 148f\n"
"144:" // Height 4: Partial accumulate: partial_4_0
"tbz x14, #2, 146f\n"
"ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"tbz x14, #1, 145f\n"
"ldr d10, [x13], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
"mov x20, #0x18\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d27, [x22], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
"tbz x14, #0, 148f\n"
"ld1 { v10.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
- "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
"b 148f\n"
"145:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x14, #0, 148f\n"
"ldr s10, [x13, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
- "ldr s27, [x22, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
"b 148f\n"
"146:" // Height 4: Partial accumulate: partial_2_0
"tbz x14, #1, 147f\n"
"ldr d9, [x13], #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
"mov x20, #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d26, [x22], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
"tbz x14, #0, 148f\n"
"ld1 { v9.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v21.s }[2], [x23]\n"
- "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
"b 148f\n"
"147:" // Height 4: Partial accumulate: partial_1_0
"ldr s9, [x13, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s21, [x23, #0x0]\n"
- "ldr s26, [x22, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
"148:" // Height 4: Partial accumulate: Done
"sub x13, x13, x20\n"
"b 150f\n"
@@ -1999,24 +1996,24 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"ldr q12, [x13, #0x30]\n"
"ldr q13, [x13, #0x40]\n"
"ldr q20, [x13, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
- "ldr q21, [x23, #0x0]\n"
- "ldr q22, [x23, #0x10]\n"
- "ldr q23, [x23, #0x20]\n"
- "ldr q24, [x23, #0x30]\n"
- "ldr q25, [x23, #0x40]\n"
- "ldr q4, [x23, #0x50]\n"
- "ldr q26, [x22, #0x0]\n"
- "ldr q27, [x22, #0x10]\n"
- "ldr q28, [x22, #0x20]\n"
- "ldr q29, [x22, #0x30]\n"
- "ldr q30, [x22, #0x40]\n"
- "ldr q31, [x22, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q21, [x22, #0x0]\n"
+ "ldr q22, [x22, #0x10]\n"
+ "ldr q23, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q25, [x22, #0x40]\n"
+ "ldr q4, [x22, #0x50]\n"
+ "ldr q26, [x21, #0x0]\n"
+ "ldr q27, [x21, #0x10]\n"
+ "ldr q28, [x21, #0x20]\n"
+ "ldr q29, [x21, #0x30]\n"
+ "ldr q30, [x21, #0x40]\n"
+ "ldr q31, [x21, #0x50]\n"
"150:" // Height 4: MMLA fixup
"zip1 v8.2d, v9.2d, v14.2d\n"
"zip2 v14.2d, v9.2d, v14.2d\n"
@@ -2072,8 +2069,8 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"mov x26, #0x0\n"
"153:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 154f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2110,26 +2107,26 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
"sub x25, x25, #0x4\n"
- "add x12, x12, #0x20\n"
"cmp x25, #0x8\n"
- "add x11, x11, #0x20\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
"ld1 { v1.4s }, [x23], #0x10\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
"ld1 { v3.4s }, [x21], #0x10\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"ldr q4, [x10, #0x0]\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
"ldr q5, [x10, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
"ldr q6, [x9, #0x0]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
"ldr q7, [x9, #0x10]\n"
- "add x10, x10, #0x20\n"
"add x9, x9, #0x20\n"
".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
@@ -2137,10 +2134,10 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
"ldr q5, [x28, #0x10]\n"
+ "add x28, x28, #0x20\n"
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
"ldr q6, [x27, #0x0]\n"
- "add x28, x28, #0x20\n"
".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
"ldr q7, [x27, #0x10]\n"
@@ -2165,31 +2162,31 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
"sub x25, x25, #0x4\n"
"add x12, x12, #0x20\n"
- "add x11, x11, #0x20\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"ldr q3, [x10, #0x0]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
"ldr q4, [x10, #0x10]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "add x10, x10, #0x20\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
"ldr q6, [x9, #0x0]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
"ldr q1, [x9, #0x10]\n"
- "add x10, x10, #0x20\n"
- "add x9, x9, #0x20\n"
".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ "add x9, x9, #0x20\n"
".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
"ldr q5, [x28, #0x0]\n"
".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
"ldr q4, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "add x28, x28, #0x20\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
"ldr q3, [x27, #0x0]\n"
".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
@@ -2230,39 +2227,39 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
"ldr q7, [x11, #0x0]\n"
"ldr q6, [x11, #0x10]\n"
- "add x12, x12, #0x20\n"
- "add x11, x11, #0x20\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
"ldr q5, [x10, #0x0]\n"
+ "add x12, x12, #0x20\n"
".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
"ldr q4, [x10, #0x10]\n"
- "add x10, x10, #0x20\n"
+ "add x11, x11, #0x20\n"
".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
"ldr q3, [x9, #0x0]\n"
+ "add x10, x10, #0x20\n"
".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
"ldr q1, [x9, #0x10]\n"
- ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
"add x9, x9, #0x20\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
"ldr q5, [x28, #0x0]\n"
".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
"ldr q4, [x28, #0x10]\n"
- ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
"add x28, x28, #0x20\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
"ldr q3, [x27, #0x0]\n"
".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
"ldr q1, [x27, #0x10]\n"
- ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
"add x27, x27, #0x20\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
@@ -2276,17 +2273,17 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"cmp x26, x20\n"
"bne 153b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp1 v4.2d, v8.2d, v14.2d\n"
"uzp2 v8.2d, v8.2d, v14.2d\n"
"uzp1 v14.2d, v9.2d, v15.2d\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 v9.2d, v9.2d, v15.2d\n"
"uzp1 v15.2d, v10.2d, v16.2d\n"
"uzp2 v10.2d, v10.2d, v16.2d\n"
- "add x24, x13, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v16.2d, v11.2d, v17.2d\n"
"uzp2 v11.2d, v11.2d, v17.2d\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v17.2d, v12.2d, v18.2d\n"
"uzp2 v12.2d, v12.2d, v18.2d\n"
"uzp1 v18.2d, v13.2d, v19.2d\n"
@@ -2304,9 +2301,9 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"uzp1 v30.2d, v25.2d, v31.2d\n"
"uzp2 v25.2d, v25.2d, v31.2d\n"
"tbz %x[flags], #1, 162f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v4.4s, v4.4s, v1.4s\n"
"fmin v14.4s, v14.4s, v1.4s\n"
@@ -2364,153 +2361,153 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"st1 { v14.4s }, [x13], #0x10\n"
"st1 { v15.4s }, [x13], #0x10\n"
"st1 { v16.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v11.4s }, [x24], #0x10\n"
- "st1 { v19.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v27.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
- "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v27.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
"tbz x14, #2, 164f\n"
"st1 { v17.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v29.4s }, [x23], #0x10\n"
- "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
"tbz x14, #1, 163f\n"
"str d18, [x13], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d30, [x23], #0x8\n"
- "str d25, [x22], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
"tbz x14, #0, 174f\n"
"st1 { v18.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v30.s }[2], [x23]\n"
- "st1 { v25.s }[2], [x22]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
"b 174f\n"
"163:" // Height 4: Partial direct writeback: partial_1_20
"tbz x14, #0, 174f\n"
"str s18, [x13, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s30, [x23, #0x0]\n"
- "str s25, [x22, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
"b 174f\n"
"164:" // Height 4: Partial direct writeback: partial_2_16
"tbz x14, #1, 165f\n"
"str d17, [x13], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "str d24, [x22], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
"tbz x14, #0, 174f\n"
"st1 { v17.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
- "st1 { v24.s }[2], [x22]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
"b 174f\n"
"165:" // Height 4: Partial direct writeback: partial_1_16
"tbz x14, #0, 174f\n"
"str s17, [x13, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
- "str s24, [x22, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
"b 174f\n"
"166:" // Height 4: Partial direct writeback: partial_8_0
"tbz x14, #3, 170f\n"
"st1 { v4.4s }, [x13], #0x10\n"
"st1 { v14.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v19.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
"tbz x14, #2, 168f\n"
"st1 { v15.4s }, [x13], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v27.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v27.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
"tbz x14, #1, 167f\n"
"str d16, [x13], #0x8\n"
- "str d11, [x24], #0x8\n"
- "str d28, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d28, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
"tbz x14, #0, 174f\n"
"st1 { v16.s }[2], [x13]\n"
- "st1 { v11.s }[2], [x24]\n"
- "st1 { v28.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
"b 174f\n"
"167:" // Height 4: Partial direct writeback: partial_1_12
"tbz x14, #0, 174f\n"
"str s16, [x13, #0x0]\n"
- "str s11, [x24, #0x0]\n"
- "str s28, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s28, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
"b 174f\n"
"168:" // Height 4: Partial direct writeback: partial_2_8
"tbz x14, #1, 169f\n"
"str d15, [x13], #0x8\n"
- "str d10, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
"tbz x14, #0, 174f\n"
"st1 { v15.s }[2], [x13]\n"
- "st1 { v10.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
"b 174f\n"
"169:" // Height 4: Partial direct writeback: partial_1_8
"tbz x14, #0, 174f\n"
"str s15, [x13, #0x0]\n"
- "str s10, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
"b 174f\n"
"170:" // Height 4: Partial direct writeback: partial_4_0
"tbz x14, #2, 172f\n"
"st1 { v4.4s }, [x13], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v19.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
"tbz x14, #1, 171f\n"
"str d14, [x13], #0x8\n"
- "str d9, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
"tbz x14, #0, 174f\n"
"st1 { v14.s }[2], [x13]\n"
- "st1 { v9.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
"b 174f\n"
"171:" // Height 4: Partial direct writeback: partial_1_4
"tbz x14, #0, 174f\n"
"str s14, [x13, #0x0]\n"
- "str s9, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
"b 174f\n"
"172:" // Height 4: Partial direct writeback: partial_2_0
"tbz x14, #1, 173f\n"
"str d4, [x13], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
"tbz x14, #0, 174f\n"
"st1 { v4.s }[2], [x13]\n"
- "st1 { v8.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
"b 174f\n"
"173:" // Height 4: Partial direct writeback: partial_1_0
"str s4, [x13, #0x0]\n"
- "str s8, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
"174:" // Height 4: Partial direct writeback: Done
"b 176f\n"
"175:" // Height 4: Full writeback
@@ -2521,24 +2518,24 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"str q17, [x13, #0x40]\n"
"str q18, [x13, #0x50]\n"
"add x13, x13, #0x60\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "str q12, [x24, #0x40]\n"
- "str q13, [x24, #0x50]\n"
- "str q19, [x23, #0x0]\n"
- "str q26, [x23, #0x10]\n"
- "str q27, [x23, #0x20]\n"
- "str q28, [x23, #0x30]\n"
- "str q29, [x23, #0x40]\n"
- "str q30, [x23, #0x50]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "str q24, [x22, #0x40]\n"
- "str q25, [x22, #0x50]\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
+ "str q19, [x22, #0x0]\n"
+ "str q26, [x22, #0x10]\n"
+ "str q27, [x22, #0x20]\n"
+ "str q28, [x22, #0x30]\n"
+ "str q29, [x22, #0x40]\n"
+ "str q30, [x22, #0x50]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x21, #0x40]\n"
+ "str q25, [x21, #0x50]\n"
"176:" // Height 4: Writeback done
"subs x14, x14, #0x18\n"
"bgt 134b\n"
@@ -2554,8 +2551,8 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"178:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp
index 376920a17b..98f7fc9403 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -82,7 +82,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp
index 9497508289..9ab4aa98f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp
@@ -50,8 +50,8 @@ void a64_ffhybrid_fp32bf16fp32_mmla_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
+ void *output_ptr = nullptr;
+ const float *bias = nullptr;
} ka;
unsigned long flags=0;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
index eb08de0ade..49973ddb92 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
@@ -41,8 +41,7 @@ void a64_ffinterleaved_bf16fp32_dot_8x12( ARGLIST );
class cls_a64_ffinterleaved_bf16fp32_dot_8x12
{
public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
+ typedef bfloat16 operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -73,8 +72,8 @@ public:
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 2> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 2, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 2> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 2, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
index 349ad1c985..6fd5fd4b6c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
@@ -54,17 +54,17 @@ void a64_ffinterleaved_bf16fp32_dot_8x12(
"1:" // Height loop
"ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
"ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x24, %x[Apanel]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
"2:" // Width loop
"ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x25, #0x8\n"
- "mov %x[Apanel], x24\n"
"add x22, x23, x20, LSL #1\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x25, #0x8\n"
+ "mov %x[Apanel], x24\n"
"bgt 3f\n"
"cmp x25, #0x4\n"
"mov x21, x23\n"
@@ -79,12 +79,12 @@ void a64_ffinterleaved_bf16fp32_dot_8x12(
"movi v9.16b, #0x0\n"
"ldr q6, [x21, #0x0]\n"
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
"movi v12.16b, #0x0\n"
"movi v13.16b, #0x0\n"
"movi v14.16b, #0x0\n"
- "cmp x20, #0x2\n"
"movi v15.16b, #0x0\n"
"movi v16.16b, #0x0\n"
"movi v17.16b, #0x0\n"
@@ -170,18 +170,18 @@ void a64_ffinterleaved_bf16fp32_dot_8x12(
"ldr q6, [x21, #0x0]\n"
"bge 4b\n"
"5:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "add x23, x23, #0x10\n"
".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
".inst 0x4f60f891 // bfdot v17.4s, v4.8h, v0.h[3]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
".inst 0x4f61f097 // bfdot v23.4s, v4.8h, v1.h[1]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f41f89a // bfdot v26.4s, v4.8h, v1.h[2]\n"
".inst 0x4f61f89d // bfdot v29.4s, v4.8h, v1.h[3]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f40f0a9 // bfdot v9.4s, v5.8h, v0.h[0]\n"
".inst 0x4f60f0ac // bfdot v12.4s, v5.8h, v0.h[1]\n"
".inst 0x4f40f8af // bfdot v15.4s, v5.8h, v0.h[2]\n"
@@ -204,8 +204,8 @@ void a64_ffinterleaved_bf16fp32_dot_8x12(
"add %x[Apanel], %x[Apanel], #0x20\n"
"ldr q2, [x23, #0x0]\n"
"ldr q1, [x22, #0x0]\n"
- "ldr q0, [x21, #0x0]\n"
".inst 0x4f44f048 // bfdot v8.4s, v2.8h, v4.h[0]\n"
+ "ldr q0, [x21, #0x0]\n"
".inst 0x4f64f04b // bfdot v11.4s, v2.8h, v4.h[1]\n"
".inst 0x4f44f84e // bfdot v14.4s, v2.8h, v4.h[2]\n"
".inst 0x4f64f851 // bfdot v17.4s, v2.8h, v4.h[3]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
index eb382952fa..1a8b0fd630 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
@@ -41,8 +41,7 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12( ARGLIST );
class cls_a64_ffinterleaved_bf16fp32_mmla_8x12
{
public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
+ typedef bfloat16 operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -73,8 +72,8 @@ public:
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 4> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
@@ -89,8 +88,10 @@ public:
if (std::is_same<T, float>::value) {
switch (ci->get_cpu_model()) {
+ case CPUModel::V1:
+ return { 45.25, 4.29, 4.80 };
default:
- return { 38.10, 5.23, 3.15 };
+ return { 29.85, 2.60, 5.49 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
index 5331f9e652..658491571d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -54,17 +54,17 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12(
"1:" // Height loop
"ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
"ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x24, %x[Apanel]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
"2:" // Width loop
"ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x25, #0x8\n"
- "mov %x[Apanel], x24\n"
"add x22, x23, x20, LSL #1\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x25, #0x8\n"
+ "mov %x[Apanel], x24\n"
"bgt 3f\n"
"cmp x25, #0x4\n"
"mov x21, x23\n"
@@ -79,14 +79,14 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12(
"movi v9.16b, #0x0\n"
"ldr q2, [%x[Apanel], #0x20]\n"
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
- "movi v12.16b, #0x0\n"
"add x23, x23, #0x20\n"
+ "movi v12.16b, #0x0\n"
"movi v13.16b, #0x0\n"
- "movi v14.16b, #0x0\n"
"add %x[Apanel], %x[Apanel], #0x30\n"
- "cmp x20, #0x2\n"
+ "movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
"movi v16.16b, #0x0\n"
"movi v17.16b, #0x0\n"
@@ -217,19 +217,19 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12(
"cbz x20, 6f\n"
"ldr q1, [x23, #0x0]\n"
"ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
"ldr q6, [%x[Apanel], #0x10]\n"
"ldr q0, [x23, #0x10]\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
"ldr q5, [%x[Apanel], #0x20]\n"
"ldr q4, [%x[Apanel], #0x30]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
"ldr q3, [x22, #0x0]\n"
"ldr q2, [x22, #0x10]\n"
- ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
- ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
- ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n"
".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n"
".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n"
"ldr q1, [x21, #0x0]\n"
".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
@@ -252,41 +252,41 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12(
".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
"6:" // multiply loop done
"subs x25, x25, #0xc\n"
- "uzp1 v2.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
"uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
"uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q2, [%x[Cpanel], #0x0]\n"
- "uzp1 v3.2d, v14.2d, v17.2d\n"
- "uzp2 v14.2d, v14.2d, v17.2d\n"
"str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
"uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
- "str q0, [%x[Cpanel], #0x20]\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
"uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
- "str q8, [%x[Cpanel], #0x30]\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
"uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q9, [%x[Cpanel], #0x40]\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
"uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q10, [%x[Cpanel], #0x50]\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
"uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q3, [%x[Cpanel], #0x60]\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
"uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
- "str q2, [%x[Cpanel], #0x70]\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
"uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
- "str q17, [%x[Cpanel], #0x80]\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
"uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
- "str q14, [%x[Cpanel], #0x90]\n"
- "str q15, [%x[Cpanel], #0xa0]\n"
"str q16, [%x[Cpanel], #0xb0]\n"
"str q1, [%x[Cpanel], #0xc0]\n"
"str q0, [%x[Cpanel], #0xd0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
index 42136ce085..29e524a89b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
@@ -40,8 +40,7 @@ void a64_ffinterleaved_fp16_mla_8x24( ARGLIST );
class cls_a64_ffinterleaved_fp16_mla_8x24
{
public:
- typedef __fp16 lhs_operand_type;
- typedef __fp16 rhs_operand_type;
+ typedef __fp16 operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)( ARGLIST );
@@ -72,8 +71,8 @@ public:
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 24, 1> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 24, 1, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 24, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 24, 1, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
index 2ad85ad424..744d05dbdf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
@@ -53,17 +53,17 @@ void a64_ffinterleaved_fp16_mla_8x24(
"1:" // Height loop
"ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
"ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x24, %x[Apanel]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
"2:" // Width loop
"ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x25, #0x10\n"
- "mov %x[Apanel], x24\n"
"add x22, x23, x20, LSL #1\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x25, #0x10\n"
+ "mov %x[Apanel], x24\n"
"bgt 3f\n"
"cmp x25, #0x8\n"
"mov x21, x23\n"
@@ -77,13 +77,13 @@ void a64_ffinterleaved_fp16_mla_8x24(
"ldr q4, [x21, #0x0]\n"
"movi v9.16b, #0x0\n"
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
"movi v12.16b, #0x0\n"
"movi v13.16b, #0x0\n"
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
- "cmp x20, #0x2\n"
"movi v16.16b, #0x0\n"
"movi v17.16b, #0x0\n"
"movi v18.16b, #0x0\n"
@@ -166,18 +166,18 @@ void a64_ffinterleaved_fp16_mla_8x24(
"fmla v31.8h, v1.8h, v7.h[7]\n"
"bge 4b\n"
"5:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x10\n"
- "add x23, x23, #0x10\n"
"fmla v8.8h, v2.8h, v0.h[0]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
"fmla v11.8h, v2.8h, v0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
"fmla v14.8h, v2.8h, v0.h[2]\n"
"fmla v17.8h, v2.8h, v0.h[3]\n"
+ "add x23, x23, #0x10\n"
"fmla v20.8h, v2.8h, v0.h[4]\n"
"fmla v23.8h, v2.8h, v0.h[5]\n"
+ "add x22, x22, #0x10\n"
"fmla v26.8h, v2.8h, v0.h[6]\n"
"fmla v29.8h, v2.8h, v0.h[7]\n"
+ "add x21, x21, #0x10\n"
"fmla v9.8h, v3.8h, v0.h[0]\n"
"fmla v12.8h, v3.8h, v0.h[1]\n"
"fmla v15.8h, v3.8h, v0.h[2]\n"
@@ -197,13 +197,13 @@ void a64_ffinterleaved_fp16_mla_8x24(
"cbz x20, 6f\n"
"ldr q3, [%x[Apanel], #0x0]\n"
"ldr q2, [x23, #0x0]\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla v8.8h, v2.8h, v3.h[0]\n"
"ldr q1, [x22, #0x0]\n"
"ldr q0, [x21, #0x0]\n"
- "fmla v8.8h, v2.8h, v3.h[0]\n"
"fmla v11.8h, v2.8h, v3.h[1]\n"
"fmla v14.8h, v2.8h, v3.h[2]\n"
"fmla v17.8h, v2.8h, v3.h[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
"fmla v20.8h, v2.8h, v3.h[4]\n"
"fmla v23.8h, v2.8h, v3.h[5]\n"
"fmla v26.8h, v2.8h, v3.h[6]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
index bb6deaf68f..6b01ffe63b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
@@ -40,8 +40,7 @@ void a64_ffinterleaved_fp32_mla_8x12( ARGLIST );
class cls_a64_ffinterleaved_fp32_mla_8x12
{
public:
- typedef float lhs_operand_type;
- typedef float rhs_operand_type;
+ typedef float operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -72,8 +71,8 @@ public:
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 1> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
index 45970fdc0b..f93bc6c719 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
@@ -53,17 +53,17 @@ void a64_ffinterleaved_fp32_mla_8x12(
"1:" // Height loop
"ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
"ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x24, %x[Apanel]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
"2:" // Width loop
"ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cmp x25, #0x8\n"
- "mov %x[Apanel], x24\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x25, #0x8\n"
+ "mov %x[Apanel], x24\n"
"bgt 3f\n"
"cmp x25, #0x4\n"
"mov x21, x23\n"
@@ -78,12 +78,12 @@ void a64_ffinterleaved_fp32_mla_8x12(
"movi v9.16b, #0x0\n"
"ldr q6, [x21, #0x0]\n"
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x4\n"
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
"movi v12.16b, #0x0\n"
"movi v13.16b, #0x0\n"
"movi v14.16b, #0x0\n"
- "cmp x20, #0x4\n"
"movi v15.16b, #0x0\n"
"movi v16.16b, #0x0\n"
"movi v17.16b, #0x0\n"
@@ -227,18 +227,18 @@ void a64_ffinterleaved_fp32_mla_8x12(
"fmla v31.4s, v2.4s, v7.s[3]\n"
"bge 4b\n"
"5:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "add x23, x23, #0x10\n"
"fmla v8.4s, v4.4s, v0.s[0]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
"fmla v11.4s, v4.4s, v0.s[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
"fmla v14.4s, v4.4s, v0.s[2]\n"
"fmla v17.4s, v4.4s, v0.s[3]\n"
+ "add x23, x23, #0x10\n"
"fmla v20.4s, v4.4s, v1.s[0]\n"
"fmla v23.4s, v4.4s, v1.s[1]\n"
+ "add x22, x22, #0x10\n"
"fmla v26.4s, v4.4s, v1.s[2]\n"
"fmla v29.4s, v4.4s, v1.s[3]\n"
+ "add x21, x21, #0x10\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
"fmla v12.4s, v5.4s, v0.s[1]\n"
"fmla v15.4s, v5.4s, v0.s[2]\n"
@@ -262,22 +262,22 @@ void a64_ffinterleaved_fp32_mla_8x12(
"subs x20, x20, #0x1\n"
"ldr q2, [x23, #0x0]\n"
"ldr q1, [x22, #0x0]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "ldr q0, [x21, #0x0]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
"fmla v8.4s, v2.4s, v4.s[0]\n"
+ "ldr q0, [x21, #0x0]\n"
"fmla v11.4s, v2.4s, v4.s[1]\n"
"fmla v14.4s, v2.4s, v4.s[2]\n"
"fmla v17.4s, v2.4s, v4.s[3]\n"
"fmla v20.4s, v2.4s, v3.s[0]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
"fmla v23.4s, v2.4s, v3.s[1]\n"
"fmla v26.4s, v2.4s, v3.s[2]\n"
+ "add x23, x23, #0x10\n"
"fmla v29.4s, v2.4s, v3.s[3]\n"
"fmla v9.4s, v1.4s, v4.s[0]\n"
+ "add x22, x22, #0x10\n"
"fmla v12.4s, v1.4s, v4.s[1]\n"
"fmla v15.4s, v1.4s, v4.s[2]\n"
+ "add x21, x21, #0x10\n"
"fmla v18.4s, v1.4s, v4.s[3]\n"
"fmla v21.4s, v1.4s, v3.s[0]\n"
"fmla v24.4s, v1.4s, v3.s[1]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
index 8dc9112ebe..059c1f6d4d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
@@ -42,8 +42,7 @@ void a64_gemm_s16_asimd_8x12(const int16_t *, const int16_t *, int32_t *, int, i
// structure.
class cls_a64_gemm_s16_8x12 {
public:
- typedef int16_t lhs_operand_type;
- typedef int16_t rhs_operand_type;
+ typedef int16_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);
@@ -62,8 +61,8 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
kern_type kernel = a64_gemm_s16_asimd_8x12;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index d0edcfcb5e..226c13b400 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -37,8 +37,7 @@ void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int);
class cls_a64_gemm_s8_4x4 {
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
@@ -57,8 +56,8 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 4, 16> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
index b55ac61403..1396c6ae8c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
@@ -55,37 +55,37 @@ void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel
register int8x16_t b3a asm("v11");
__asm __volatile (
- "movi v16.4s, #0x0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v17.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v18.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v19.4s, #0x0\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "movi v20.4s, #0x0\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
- "movi v21.4s, #0x0\n"
- "ldr q1, [%[a_ptr], #16]\n"
- "movi v22.4s, #0x0\n"
- "ldr q2, [%[a_ptr], #32]\n"
- "movi v23.4s, #0x0\n"
- "ldr q3, [%[a_ptr], #48]\n"
- "movi v24.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+ "movi v24.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v25.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v26.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v27.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v28.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v29.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v30.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v31.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #256]")
// Loop structure optimized for A57 (after r0).
@@ -107,351 +107,351 @@ void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel
// of multiplies that need to be pulled out.
// Start of unroll 0 (first iteration)
- "smull v12.8h, v0.8b, %[b0].8b\n"
- "smull v13.8h, v0.8b, %[b1].8b\n"
+ "smull v12.8h, v0.8b, %[b0].8b\n"
+ "smull v13.8h, v0.8b, %[b1].8b\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
// Unroll 0 continuation (branch target)
"1:\n"
- "smull v14.8h, v0.8b, %[b2].8b\n"
- "subs %w[k], %w[k], #1\n"
- "smull v15.8h, v0.8b, %[b3].8b\n"
- "ldr %q[b0a], [%[b_ptr], #64]\n"
- "smlal2 v12.8h, v0.16b, %[b0].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1].16b\n"
- "ldr %q[b1a], [%[b_ptr], #80]\n"
- "smlal2 v14.8h, v0.16b, %[b2].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3].16b\n"
- "ldr q0, [%[a_ptr], #64]\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2].8b\n"
- "ldr %q[b2a], [%[b_ptr], #96]\n"
- "smull v15.8h, v1.8b, %[b3].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0].16b\n"
- "ldr %q[b3a], [%[b_ptr], #112]\n"
- "smlal2 v13.8h, v1.16b, %[b1].16b\n"
- "add %[b_ptr], %[b_ptr], #128\n"
- "smlal2 v14.8h, v1.16b, %[b2].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3].16b\n"
- "ldr q1, [%[a_ptr], #80]\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "smull v14.8h, v2.8b, %[b2].8b\n"
- "smull v15.8h, v2.8b, %[b3].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "subs %w[k], %w[k], #1\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "ldr %q[b0a], [%[b_ptr], #64]\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "ldr %q[b1a], [%[b_ptr], #80]\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr], #64]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "ldr %q[b2a], [%[b_ptr], #96]\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "ldr %q[b3a], [%[b_ptr], #112]\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "add %[b_ptr], %[b_ptr], #128\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #80]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "smlal2 v13.8h, v2.16b, %[b1].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n"
ASM_PREFETCH("[%[a_ptr], #320]")
- "smlal2 v15.8h, v2.16b, %[b3].16b\n"
- "ldr q2, [%[a_ptr], #96]\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "smull v14.8h, v3.8b, %[b2].8b\n"
- "smull v15.8h, v3.8b, %[b3].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0].16b\n"
- "ldr %q[b0], [%[b_ptr], #0]\n"
- "smlal2 v13.8h, v3.16b, %[b1].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2].16b\n"
- "smlal2 v15.8h, v3.16b, %[b3].16b\n"
- "ldr q3, [%[a_ptr], #112]\n"
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #96]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "ldr %q[b0], [%[b_ptr], #0]\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr q3, [%[a_ptr], #112]\n"
// Unroll 1
- "sadalp v28.4s, v12.8h\n"
- "smull v12.8h, v0.8b, %[b0a].8b\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "smull v13.8h, v0.8b, %[b1a].8b\n"
- "sadalp v31.4s, v15.8h\n"
- "smull v14.8h, v0.8b, %[b2a].8b\n"
- "smull v15.8h, v0.8b, %[b3a].8b\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
- "ldr q0, [%[a_ptr], #128]\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0a].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1a].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "add %[a_ptr], %[a_ptr], #128\n"
- "smull v14.8h, v1.8b, %[b2a].8b\n"
- "smull v15.8h, v1.8b, %[b3a].8b\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
- "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
- "ldr q1, [%[a_ptr], #16]\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0a].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1a].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "smull v14.8h, v2.8b, %[b2a].8b\n"
- "smull v15.8h, v2.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0a].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1a].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "smull v14.8h, v0.8b, %[b2a].8b\n"
+ "smull v15.8h, v0.8b, %[b3a].8b\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
+ "ldr q0, [%[a_ptr], #128]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0a].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1a].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "smull v14.8h, v1.8b, %[b2a].8b\n"
+ "smull v15.8h, v1.8b, %[b3a].8b\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0a].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1a].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2a].8b\n"
+ "smull v15.8h, v2.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
+ "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
- "ldr q2, [%[a_ptr], #32]\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0a].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1a].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "smull v14.8h, v3.8b, %[b2a].8b\n"
- "smull v15.8h, v3.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
- "ldr q3, [%[a_ptr], #48]\n"
+ "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0a].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1a].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2a].8b\n"
+ "smull v15.8h, v3.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
+ "ldr q3, [%[a_ptr], #48]\n"
// Start of unroll 0 for next iteration.
- "sadalp v28.4s, v12.8h\n"
- "smull v12.8h, v0.8b, %[b0].8b\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "smull v13.8h, v0.8b, %[b1].8b\n"
- "sadalp v31.4s, v15.8h\n"
- "bne 1b\n"
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "bne 1b\n"
// Target to use when K=1 or 2 (i.e. zero iterations of main loop)
"4:\n"
// Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration (even K)
- "smull v14.8h, v0.8b, %[b2].8b\n"
- "smull v15.8h, v0.8b, %[b3].8b\n"
- "ldr %q[b0a], [%[b_ptr], #64]\n"
- "smlal2 v12.8h, v0.16b, %[b0].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1].16b\n"
- "ldr %q[b1a], [%[b_ptr], #80]\n"
- "smlal2 v14.8h, v0.16b, %[b2].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3].16b\n"
- "ldr q0, [%[a_ptr], #64]\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2].8b\n"
- "ldr %q[b2a], [%[b_ptr], #96]\n"
- "smull v15.8h, v1.8b, %[b3].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0].16b\n"
- "ldr %q[b3a], [%[b_ptr], #112]\n"
- "smlal2 v13.8h, v1.16b, %[b1].16b\n"
- "add %[b_ptr], %[b_ptr], #128\n"
- "smlal2 v14.8h, v1.16b, %[b2].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3].16b\n"
- "ldr q1, [%[a_ptr], #80]\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "smull v14.8h, v2.8b, %[b2].8b\n"
- "smull v15.8h, v2.8b, %[b3].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0].16b\n"
- "smlal2 v13.8h, v2.16b, %[b1].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2].16b\n"
- "smlal2 v15.8h, v2.16b, %[b3].16b\n"
- "ldr q2, [%[a_ptr], #96]\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "smull v14.8h, v3.8b, %[b2].8b\n"
- "smull v15.8h, v3.8b, %[b3].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0].16b\n"
- "smlal2 v13.8h, v3.16b, %[b1].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2].16b\n"
- "smlal2 v15.8h, v3.16b, %[b3].16b\n"
- "ldr q3, [%[a_ptr], #112]\n"
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "ldr %q[b0a], [%[b_ptr], #64]\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "ldr %q[b1a], [%[b_ptr], #80]\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr], #64]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "ldr %q[b2a], [%[b_ptr], #96]\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "ldr %q[b3a], [%[b_ptr], #112]\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "add %[b_ptr], %[b_ptr], #128\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #80]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #96]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr q3, [%[a_ptr], #112]\n"
// Unroll 1
- "sadalp v28.4s, v12.8h\n"
- "smull v12.8h, v0.8b, %[b0a].8b\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "smull v13.8h, v0.8b, %[b1a].8b\n"
- "sadalp v31.4s, v15.8h\n"
- "smull v14.8h, v0.8b, %[b2a].8b\n"
- "add %[a_ptr], %[a_ptr], #128\n"
- "smull v15.8h, v0.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0a].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1a].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2a].8b\n"
- "smull v15.8h, v1.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0a].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1a].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smull v14.8h, v2.8b, %[b2a].8b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
- "smull v15.8h, v2.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
- "str q16, [%[c_ptr]]\n"
- "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0a].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1a].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smull v14.8h, v3.8b, %[b2a].8b\n"
- "addp v20.4s, v24.4s, v25.4s\n"
- "addp v21.4s, v26.4s, v27.4s\n"
- "smull v15.8h, v3.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
- "str q17, [%[c_ptr], #16]\n"
- "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
- "b 3f\n"
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0a].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1a].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "smull v14.8h, v0.8b, %[b2a].8b\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "smull v15.8h, v0.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0a].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1a].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2a].8b\n"
+ "smull v15.8h, v1.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0a].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1a].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smull v14.8h, v2.8b, %[b2a].8b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "smull v15.8h, v2.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
+ "str q16, [%[c_ptr]]\n"
+ "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0a].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1a].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smull v14.8h, v3.8b, %[b2a].8b\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "smull v15.8h, v3.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
+ "b 3f\n"
// Detached final iteration (odd K)
"2:\n"
- "smull v14.8h, v0.8b, %[b2].8b\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "smull v15.8h, v0.8b, %[b3].8b\n"
- "add %[b_ptr], %[b_ptr], #64\n"
- "smlal2 v12.8h, v0.16b, %[b0].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1].16b\n"
- "smlal2 v14.8h, v0.16b, %[b2].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3].16b\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2].8b\n"
- "smull v15.8h, v1.8b, %[b3].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0].16b\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smlal2 v13.8h, v1.16b, %[b1].16b\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smlal2 v14.8h, v1.16b, %[b2].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3].16b\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smull v14.8h, v2.8b, %[b2].8b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
- "smull v15.8h, v2.8b, %[b3].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0].16b\n"
- "str q16, [%[c_ptr]]\n"
- "smlal2 v13.8h, v2.16b, %[b1].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2].16b\n"
- "smlal2 v15.8h, v2.16b, %[b3].16b\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smull v14.8h, v3.8b, %[b2].8b\n"
- "addp v20.4s, v24.4s, v25.4s\n"
- "addp v21.4s, v26.4s, v27.4s\n"
- "smull v15.8h, v3.8b, %[b3].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0].16b\n"
- "str q17, [%[c_ptr], #16]\n"
- "smlal2 v13.8h, v3.16b, %[b1].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2].16b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "str q16, [%[c_ptr]]\n"
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
"3:\n"
// Final additions
- "sadalp v28.4s, v12.8h\n"
- "str q18, [%[c_ptr], #32]\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "sadalp v31.4s, v15.8h\n"
+ "sadalp v28.4s, v12.8h\n"
+ "str q18, [%[c_ptr], #32]\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "sadalp v31.4s, v15.8h\n"
// Horizontal reduction, phase 1
- "addp v22.4s, v28.4s, v29.4s\n"
- "addp v23.4s, v30.4s, v31.4s\n"
+ "addp v22.4s, v28.4s, v29.4s\n"
+ "addp v23.4s, v30.4s, v31.4s\n"
// Horizontal reduction, phase 2
- "addp v19.4s, v22.4s, v23.4s\n"
- "str q19, [%[c_ptr], #48]\n"
- "add %[c_ptr], %[c_ptr], #64\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "str q19, [%[c_ptr], #48]\n"
+ "add %[c_ptr], %[c_ptr], #64\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index d0c64b2f6c..cc4b81ecb5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -39,8 +39,7 @@ void a64_gemm_s8_8x12_x1(const int8_t *, const int8_t *, int32_t *, int, int, in
class cls_a64_gemm_s8_8x12 {
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
@@ -59,8 +58,8 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 4> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
index fc46781100..5ba14d2409 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
@@ -111,11 +111,11 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
"1:\n"
".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "subs %w[k], %w[k], #1\n"
+ "subs %w[k], %w[k], #1\n"
".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
"ins %[b2].d[1], x20\n"
@@ -123,7 +123,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
"ldr x20, [%[a_ptr], #40]\n"
".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
"ins %[a0a].d[1], x20\n"
@@ -131,7 +131,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
"ldr x20, [%[a_ptr], #56]\n"
".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
"ins %[a1a].d[1], x20\n"
@@ -139,7 +139,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
"ldr x20, [%[b_ptr], #56]\n"
".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
"ins %[b0].d[1], x20\n"
@@ -155,8 +155,8 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- // Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
".word 0x4f85e048 // sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
"ins %[b1].d[1], x20\n"
@@ -164,7 +164,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
"ldr x20, [%[b_ptr], #88]\n"
".word 0x4f85e84a // sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
".word 0x4fa5e84b // sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
".word 0x4f86e04c // sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
"ins %[b2].d[1], x20\n"
@@ -172,7 +172,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
"ldr x20, [%[a_ptr], #72]\n"
".word 0x4f86e84e // sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
".word 0x4fa6e84f // sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
".word 0x4f85e070 // sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
"ins %[a0].d[1], x20\n"
@@ -180,7 +180,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
"ldr x20, [%[a_ptr], #88]\n"
".word 0x4f85e872 // sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
".word 0x4fa5e873 // sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
".word 0x4f86e074 // sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
"ins %[a1].d[1], x20\n"
@@ -188,7 +188,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
"ldr x20, [%[b_ptr], #104]\n"
".word 0x4f86e876 // sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
".word 0x4fa6e877 // sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
".word 0x4f85e098 // sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
"ins %[b0].d[1], x20\n"
@@ -196,19 +196,19 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
"ldr x20, [%[b_ptr], #120]\n"
".word 0x4f85e89a // sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
".word 0x4fa5e89b // sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
".word 0x4f86e09c // sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
ASM_PREFETCH("[%[b_ptr], #640]")
".word 0x4fa6e09d // sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
".word 0x4f86e89e // sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
"ins %[b1].d[1], x20\n"
".word 0x4fa6e89f // sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
"ldr %d[b2], [%[b_ptr], #32]\n"
".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "b.ne 1b\n"
+ "b.ne 1b\n"
// Branch here if K=1 or 2. Do the right thing for odd/even at the end.
"4:\n"
@@ -221,7 +221,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
// Even K continuation
".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
"ins %[b2].d[1], x20\n"
@@ -230,7 +230,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
ASM_PREFETCHW("[%[c_ptr]]")
".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
"ins %[a0a].d[1], x20\n"
@@ -238,7 +238,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
"ldr x20, [%[a_ptr], #56]\n"
".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
"ins %[a1a].d[1], x20\n"
@@ -253,7 +253,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
ASM_PREFETCHW("[%[c_ptr], #128]")
".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
"ins %[b0].d[1], x20\n"
@@ -262,7 +262,7 @@ void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t
".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
ASM_PREFETCHW("[%[c_ptr], #192]")
".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
".word 0x4f85e048 // sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
"ins %[b1].d[1], x20\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
index f25947da26..98bff14104 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
@@ -53,63 +53,63 @@ void a64_gemm_s8_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpane
register int32x4_t a1a asm("v6");
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
// Loop proper
"1:\n"
".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
@@ -120,7 +120,7 @@ void a64_gemm_s8_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpane
".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
@@ -131,19 +131,19 @@ void a64_gemm_s8_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpane
".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
".word 0x4f85e048 // sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
".word 0x4fa5e049 // sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr %q[a0], [%[a_ptr], #64]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
".word 0x4f85e84a // sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
".word 0x4fa5e84b // sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
".word 0x4f86e04c // sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "ldr %q[a1], [%[a_ptr], #80]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
".word 0x4fa6e04d // sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
".word 0x4f86e84e // sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
".word 0x4fa6e84f // sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
".word 0x4f85e070 // sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
".word 0x4fa5e071 // sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
@@ -154,40 +154,40 @@ void a64_gemm_s8_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpane
".word 0x4fa6e075 // sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
".word 0x4f86e876 // sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
".word 0x4fa6e877 // sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
".word 0x4f85e098 // sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
".word 0x4fa5e099 // sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
".word 0x4f85e89a // sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
".word 0x4fa5e89b // sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
".word 0x4f86e09c // sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
".word 0x4fa6e09d // sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "subs %w[k], %w[k], #1\n"
+ "subs %w[k], %w[k], #1\n"
".word 0x4f86e89e // sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
".word 0x4fa6e89f // sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "bne 1b\n"
+ "bne 1b\n"
// Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
"4:\n"
// Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration (even K)
".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
@@ -197,140 +197,140 @@ void a64_gemm_s8_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpane
".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
".word 0x4f85e048 // sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
".word 0x4f85e070 // sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
".word 0x4fa5e049 // sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
+ "str q8, [%[c_ptr], #0]\n"
".word 0x4fa5e071 // sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
+ "str q16, [%[c_ptr], #16]\n"
".word 0x4f85e098 // sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "str q24, [%[c_ptr], #32]\n"
+ "str q24, [%[c_ptr], #32]\n"
".word 0x4fa5e099 // sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
+ "str q9, [%[c_ptr], #48]\n"
".word 0x4f85e84a // sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
+ "str q17, [%[c_ptr], #64]\n"
".word 0x4f85e872 // sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
+ "str q25, [%[c_ptr], #80]\n"
".word 0x4f85e89a // sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
+ "str q10, [%[c_ptr], #96]\n"
".word 0x4fa5e84b // sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
+ "str q18, [%[c_ptr], #112]\n"
".word 0x4fa5e873 // sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
+ "str q26, [%[c_ptr], #128]\n"
".word 0x4fa5e89b // sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
+ "str q11, [%[c_ptr], #144]\n"
".word 0x4f86e04c // sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
+ "str q19, [%[c_ptr], #160]\n"
".word 0x4f86e074 // sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
+ "str q27, [%[c_ptr], #176]\n"
".word 0x4f86e09c // sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q12, [%[c_ptr], #192]\n"
".word 0x4fa6e04d // sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
+ "str q20, [%[c_ptr], #208]\n"
".word 0x4fa6e075 // sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
+ "str q28, [%[c_ptr], #224]\n"
".word 0x4fa6e09d // sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
+ "str q13, [%[c_ptr], #240]\n"
".word 0x4f86e84e // sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
+ "str q21, [%[c_ptr], #256]\n"
".word 0x4f86e876 // sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
+ "str q29, [%[c_ptr], #272]\n"
".word 0x4f86e89e // sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
+ "str q14, [%[c_ptr], #288]\n"
".word 0x4fa6e84f // sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
+ "str q22, [%[c_ptr], #304]\n"
".word 0x4fa6e877 // sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
+ "str q30, [%[c_ptr], #320]\n"
".word 0x4fa6e89f // sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
+ "str q15, [%[c_ptr], #336]\n"
- "b 3f\n"
+ "b 3f\n"
// Detached final iteration (odd K)
"2:\n"
".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
+ "str q8, [%[c_ptr], #0]\n"
".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
+ "str q16, [%[c_ptr], #16]\n"
".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
+ "str q9, [%[c_ptr], #48]\n"
".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
+ "str q17, [%[c_ptr], #64]\n"
".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
+ "str q25, [%[c_ptr], #80]\n"
".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
+ "str q10, [%[c_ptr], #96]\n"
".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
+ "str q18, [%[c_ptr], #112]\n"
".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
+ "str q26, [%[c_ptr], #128]\n"
".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
+ "str q11, [%[c_ptr], #144]\n"
".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
+ "str q19, [%[c_ptr], #160]\n"
".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
+ "str q27, [%[c_ptr], #176]\n"
".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q12, [%[c_ptr], #192]\n"
".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
+ "str q20, [%[c_ptr], #208]\n"
".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
+ "str q28, [%[c_ptr], #224]\n"
".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
+ "str q13, [%[c_ptr], #240]\n"
".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
+ "str q21, [%[c_ptr], #256]\n"
".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
+ "str q29, [%[c_ptr], #272]\n"
".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
+ "str q14, [%[c_ptr], #288]\n"
".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
+ "str q22, [%[c_ptr], #304]\n"
".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
+ "str q30, [%[c_ptr], #320]\n"
".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
+ "str q15, [%[c_ptr], #336]\n"
// Common tail
"3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
index 30f819d45e..1f9d001553 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
@@ -52,60 +52,60 @@ void a64_gemm_s8_8x12_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cp
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
// Loop proper
"1:\n"
".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
@@ -116,74 +116,74 @@ void a64_gemm_s8_8x12_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cp
".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
ASM_PREFETCH("[%[b_ptr], #448]")
".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %q[a0], [%[a_ptr], #32]\n"
+ "ldr %q[a0], [%[a_ptr], #32]\n"
".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[a1], [%[a_ptr], #48]\n"
+ "ldr %q[a1], [%[a_ptr], #48]\n"
".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
ASM_PREFETCH("[%[b_ptr], #512]")
".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "subs %w[k], %w[k], #1\n"
+ "subs %w[k], %w[k], #1\n"
".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %q[a0], [%[a_ptr]]\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "bne 1b\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "bne 1b\n"
// Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
"4:\n"
// Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration (even K)
".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
@@ -193,142 +193,142 @@ void a64_gemm_s8_8x12_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cp
".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %q[a0], [%[a_ptr], #-32]\n"
+ "ldr %q[a0], [%[a_ptr], #-32]\n"
".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[a1], [%[a_ptr], #-16]\n"
+ "ldr %q[a1], [%[a_ptr], #-16]\n"
".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
+ "str q8, [%[c_ptr], #0]\n"
".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
+ "str q16, [%[c_ptr], #16]\n"
".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "str q24, [%[c_ptr], #32]\n"
+ "str q24, [%[c_ptr], #32]\n"
".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
+ "str q9, [%[c_ptr], #48]\n"
".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
+ "str q17, [%[c_ptr], #64]\n"
".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
+ "str q25, [%[c_ptr], #80]\n"
".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
+ "str q10, [%[c_ptr], #96]\n"
".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
+ "str q18, [%[c_ptr], #112]\n"
".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
+ "str q26, [%[c_ptr], #128]\n"
".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
+ "str q11, [%[c_ptr], #144]\n"
".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
+ "str q19, [%[c_ptr], #160]\n"
".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
+ "str q27, [%[c_ptr], #176]\n"
".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q12, [%[c_ptr], #192]\n"
".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
+ "str q20, [%[c_ptr], #208]\n"
".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
+ "str q28, [%[c_ptr], #224]\n"
".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
+ "str q13, [%[c_ptr], #240]\n"
".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
+ "str q21, [%[c_ptr], #256]\n"
".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
+ "str q29, [%[c_ptr], #272]\n"
".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
+ "str q14, [%[c_ptr], #288]\n"
".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
+ "str q22, [%[c_ptr], #304]\n"
".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
+ "str q30, [%[c_ptr], #320]\n"
".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
+ "str q15, [%[c_ptr], #336]\n"
- "b 3f\n"
+ "b 3f\n"
// Detached final iteration (odd K)
"2:\n"
".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
+ "str q8, [%[c_ptr], #0]\n"
".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
+ "str q16, [%[c_ptr], #16]\n"
".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
+ "str q9, [%[c_ptr], #48]\n"
".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
+ "str q17, [%[c_ptr], #64]\n"
".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
+ "str q25, [%[c_ptr], #80]\n"
".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
+ "str q10, [%[c_ptr], #96]\n"
".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
+ "str q18, [%[c_ptr], #112]\n"
".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
+ "str q26, [%[c_ptr], #128]\n"
".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
+ "str q11, [%[c_ptr], #144]\n"
".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
+ "str q19, [%[c_ptr], #160]\n"
".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
+ "str q27, [%[c_ptr], #176]\n"
".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q12, [%[c_ptr], #192]\n"
".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
+ "str q20, [%[c_ptr], #208]\n"
".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
+ "str q28, [%[c_ptr], #224]\n"
".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
+ "str q13, [%[c_ptr], #240]\n"
".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
+ "str q21, [%[c_ptr], #256]\n"
".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
+ "str q29, [%[c_ptr], #272]\n"
".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
+ "str q14, [%[c_ptr], #288]\n"
".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
+ "str q22, [%[c_ptr], #304]\n"
".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
+ "str q30, [%[c_ptr], #320]\n"
".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
+ "str q15, [%[c_ptr], #336]\n"
// Common tail
"3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
[a0] "+w" (a0), [a1] "+w" (a1),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
index af13fbd4e9..c7295275e3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
@@ -34,8 +34,7 @@ void a64_gemm_u16_asimd_8x12(const uint16_t *, const uint16_t *, uint32_t *, int
class cls_a64_gemm_u16_8x12 {
public:
- typedef uint16_t lhs_operand_type;
- typedef uint16_t rhs_operand_type;
+ typedef uint16_t operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
@@ -54,8 +53,8 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
kern_type kernel = a64_gemm_u16_asimd_8x12;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index cdc902f2cc..6d27dd73f1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -35,8 +35,7 @@ void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpa
class cls_a64_gemm_u8_4x4 {
public:
- typedef uint8_t lhs_operand_type;
- typedef uint8_t rhs_operand_type;
+ typedef uint8_t operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
@@ -65,8 +64,8 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 4, 16> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
index c227f21702..495a81692d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
@@ -49,225 +49,225 @@ void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpa
register uint8x16_t b3 asm("v7");
__asm __volatile (
- "movi v16.4s, #0x0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v17.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v18.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v19.4s, #0x0\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "movi v20.4s, #0x0\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
- "movi v21.4s, #0x0\n"
- "ldr q1, [%[a_ptr], #16]\n"
- "movi v22.4s, #0x0\n"
- "ldr q2, [%[a_ptr], #32]\n"
- "movi v23.4s, #0x0\n"
- "ldr q3, [%[a_ptr], #48]\n"
- "movi v24.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+ "movi v24.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v25.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v26.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v27.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v28.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v29.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v30.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v31.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "umull v12.8h, v0.8b, %[b0].8b\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "umull v13.8h, v0.8b, %[b1].8b\n"
- "umull v14.8h, v0.8b, %[b2].8b\n"
- "add %[b_ptr], %[b_ptr], #64\n"
- "umull v15.8h, v0.8b, %[b3].8b\n"
+ "umull v12.8h, v0.8b, %[b0].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "umull v13.8h, v0.8b, %[b1].8b\n"
+ "umull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "umull v15.8h, v0.8b, %[b3].8b\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 2f\n"
+ "cbz %w[k], 2f\n"
"1:\n"
- "uadalp v16.4s, v12.8h\n"
- "umull2 v12.8h, v0.16b, %[b0].16b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull2 v13.8h, v0.16b, %[b1].16b\n"
- "uadalp v18.4s, v14.8h\n"
- "umull2 v14.8h, v0.16b, %[b2].16b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull2 v15.8h, v0.16b, %[b3].16b\n"
- "ldr q0, [%[a_ptr]]\n"
-
- "uadalp v16.4s, v12.8h\n"
- "umull v12.8h, v1.8b, %[b0].8b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull v13.8h, v1.8b, %[b1].8b\n"
- "subs %w[k], %w[k], #1\n"
- "uadalp v18.4s, v14.8h\n"
- "umull v14.8h, v1.8b, %[b2].8b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull v15.8h, v1.8b, %[b3].8b\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull2 v12.8h, v1.16b, %[b0].16b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull2 v13.8h, v1.16b, %[b1].16b\n"
+ "uadalp v16.4s, v12.8h\n"
+ "umull2 v12.8h, v0.16b, %[b0].16b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull2 v13.8h, v0.16b, %[b1].16b\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull2 v14.8h, v0.16b, %[b2].16b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr]]\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull v13.8h, v1.8b, %[b1].8b\n"
+ "subs %w[k], %w[k], #1\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v14.8h, v1.8b, %[b2].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull v15.8h, v1.8b, %[b3].8b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull2 v12.8h, v1.16b, %[b0].16b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull2 v13.8h, v1.16b, %[b1].16b\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "uadalp v22.4s, v14.8h\n"
- "umull2 v14.8h, v1.16b, %[b2].16b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull2 v15.8h, v1.16b, %[b3].16b\n"
- "ldr q1, [%[a_ptr], #16]\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull v12.8h, v2.8b, %[b0].8b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull v13.8h, v2.8b, %[b1].8b\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull2 v14.8h, v1.16b, %[b2].16b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull v13.8h, v2.8b, %[b1].8b\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "uadalp v22.4s, v14.8h\n"
- "umull v14.8h, v2.8b, %[b2].8b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull v15.8h, v2.8b, %[b3].8b\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull2 v12.8h, v2.16b, %[b0].16b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull2 v13.8h, v2.16b, %[b1].16b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull2 v14.8h, v2.16b, %[b2].16b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull2 v15.8h, v2.16b, %[b3].16b\n"
- "ldr q2, [%[a_ptr], #32]\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull v12.8h, v3.8b, %[b0].8b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull v13.8h, v3.8b, %[b1].8b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull v14.8h, v3.8b, %[b2].8b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull v15.8h, v3.8b, %[b3].8b\n"
-
- "uadalp v28.4s, v12.8h\n"
- "umull2 v12.8h, v3.16b, %[b0].16b\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "uadalp v29.4s, v13.8h\n"
- "umull2 v13.8h, v3.16b, %[b1].16b\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "uadalp v30.4s, v14.8h\n"
- "umull2 v14.8h, v3.16b, %[b2].16b\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "uadalp v31.4s, v15.8h\n"
- "umull2 v15.8h, v3.16b, %[b3].16b\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
-
- "uadalp v28.4s, v12.8h\n"
- "umull v12.8h, v0.8b, %[b0].8b\n"
- "add %[b_ptr], %[b_ptr], #64\n"
- "uadalp v29.4s, v13.8h\n"
- "umull v13.8h, v0.8b, %[b1].8b\n"
- "ldr q3, [%[a_ptr], #48]\n"
- "uadalp v30.4s, v14.8h\n"
- "umull v14.8h, v0.8b, %[b2].8b\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "uadalp v31.4s, v15.8h\n"
- "umull v15.8h, v0.8b, %[b3].8b\n"
- "bne 1b\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull v14.8h, v2.8b, %[b2].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull v15.8h, v2.8b, %[b3].8b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull2 v12.8h, v2.16b, %[b0].16b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull2 v13.8h, v2.16b, %[b1].16b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull2 v14.8h, v2.16b, %[b2].16b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull v13.8h, v3.8b, %[b1].8b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v14.8h, v3.8b, %[b2].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull v15.8h, v3.8b, %[b3].8b\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "umull2 v12.8h, v3.16b, %[b0].16b\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "uadalp v29.4s, v13.8h\n"
+ "umull2 v13.8h, v3.16b, %[b1].16b\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull2 v14.8h, v3.16b, %[b2].16b\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "umull v12.8h, v0.8b, %[b0].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "uadalp v29.4s, v13.8h\n"
+ "umull v13.8h, v0.8b, %[b1].8b\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull v15.8h, v0.8b, %[b3].8b\n"
+ "bne 1b\n"
// Branch target
"2:\n"
- "uadalp v16.4s, v12.8h\n"
- "umull2 v12.8h, v0.16b, %[b0].16b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull2 v13.8h, v0.16b, %[b1].16b\n"
- "uadalp v18.4s, v14.8h\n"
- "umull2 v14.8h, v0.16b, %[b2].16b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull2 v15.8h, v0.16b, %[b3].16b\n"
-
- "uadalp v16.4s, v12.8h\n"
- "umull v12.8h, v1.8b, %[b0].8b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull v13.8h, v1.8b, %[b1].8b\n"
- "uadalp v18.4s, v14.8h\n"
- "umull v14.8h, v1.8b, %[b2].8b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull v15.8h, v1.8b, %[b3].8b\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull2 v12.8h, v1.16b, %[b0].16b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull2 v13.8h, v1.16b, %[b1].16b\n"
- "uadalp v22.4s, v14.8h\n"
- "umull2 v14.8h, v1.16b, %[b2].16b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull2 v15.8h, v1.16b, %[b3].16b\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull v12.8h, v2.8b, %[b0].8b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull v13.8h, v2.8b, %[b1].8b\n"
- "uadalp v22.4s, v14.8h\n"
- "umull v14.8h, v2.8b, %[b2].8b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull v15.8h, v2.8b, %[b3].8b\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull2 v12.8h, v2.16b, %[b0].16b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull2 v13.8h, v2.16b, %[b1].16b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull2 v14.8h, v2.16b, %[b2].16b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull2 v15.8h, v2.16b, %[b3].16b\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull v12.8h, v3.8b, %[b0].8b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull v13.8h, v3.8b, %[b1].8b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull v14.8h, v3.8b, %[b2].8b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull v15.8h, v3.8b, %[b3].8b\n"
-
- "uadalp v28.4s, v12.8h\n"
- "umull2 v12.8h, v3.16b, %[b0].16b\n"
- "uadalp v29.4s, v13.8h\n"
- "umull2 v13.8h, v3.16b, %[b1].16b\n"
- "uadalp v30.4s, v14.8h\n"
- "umull2 v14.8h, v3.16b, %[b2].16b\n"
- "uadalp v31.4s, v15.8h\n"
- "umull2 v15.8h, v3.16b, %[b3].16b\n"
-
- "uadalp v28.4s, v12.8h\n"
- "uadalp v29.4s, v13.8h\n"
- "uadalp v30.4s, v14.8h\n"
- "uadalp v31.4s, v15.8h\n"
-
- "addp v16.4s, v16.4s, v17.4s\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
- "addp v20.4s, v24.4s, v25.4s\n"
- "addp v21.4s, v26.4s, v27.4s\n"
- "addp v22.4s, v28.4s, v29.4s\n"
- "addp v23.4s, v30.4s, v31.4s\n"
-
- "addp v16.4s, v16.4s, v17.4s\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
-
- "str q16, [%[c_ptr]]\n"
- "str q17, [%[c_ptr], #16]\n"
- "str q18, [%[c_ptr], #32]\n"
- "str q19, [%[c_ptr], #48]\n"
- "add %[c_ptr], %[c_ptr], #64\n"
+ "uadalp v16.4s, v12.8h\n"
+ "umull2 v12.8h, v0.16b, %[b0].16b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull2 v13.8h, v0.16b, %[b1].16b\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull2 v14.8h, v0.16b, %[b2].16b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull2 v15.8h, v0.16b, %[b3].16b\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull v13.8h, v1.8b, %[b1].8b\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v14.8h, v1.8b, %[b2].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull v15.8h, v1.8b, %[b3].8b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull2 v12.8h, v1.16b, %[b0].16b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull2 v13.8h, v1.16b, %[b1].16b\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull2 v14.8h, v1.16b, %[b2].16b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull2 v15.8h, v1.16b, %[b3].16b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull v13.8h, v2.8b, %[b1].8b\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull v14.8h, v2.8b, %[b2].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull v15.8h, v2.8b, %[b3].8b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull2 v12.8h, v2.16b, %[b0].16b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull2 v13.8h, v2.16b, %[b1].16b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull2 v14.8h, v2.16b, %[b2].16b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull2 v15.8h, v2.16b, %[b3].16b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull v13.8h, v3.8b, %[b1].8b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v14.8h, v3.8b, %[b2].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull v15.8h, v3.8b, %[b3].8b\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "umull2 v12.8h, v3.16b, %[b0].16b\n"
+ "uadalp v29.4s, v13.8h\n"
+ "umull2 v13.8h, v3.16b, %[b1].16b\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull2 v14.8h, v3.16b, %[b2].16b\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull2 v15.8h, v3.16b, %[b3].16b\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "uadalp v29.4s, v13.8h\n"
+ "uadalp v30.4s, v14.8h\n"
+ "uadalp v31.4s, v15.8h\n"
+
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "addp v22.4s, v28.4s, v29.4s\n"
+ "addp v23.4s, v30.4s, v31.4s\n"
+
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+
+ "str q16, [%[c_ptr]]\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "str q18, [%[c_ptr], #32]\n"
+ "str q19, [%[c_ptr], #48]\n"
+ "add %[c_ptr], %[c_ptr], #64\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index f4e43407df..37f6dde3ab 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -37,8 +37,7 @@ void a64_gemm_u8_8x12_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int,
class cls_a64_gemm_u8_8x12 {
public:
- typedef uint8_t lhs_operand_type;
- typedef uint8_t rhs_operand_type;
+ typedef uint8_t operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
@@ -67,8 +66,8 @@ public:
}
// Use the standard fixed sized transforms.
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 4> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
index 8203db21d0..7d433d1a26 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
@@ -111,11 +111,11 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
"1:\n"
".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "subs %w[k], %w[k], #1\n"
+ "subs %w[k], %w[k], #1\n"
".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
"ins %[b2].d[1], x20\n"
@@ -123,7 +123,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
"ldr x20, [%[a_ptr], #40]\n"
".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
"ins %[a0a].d[1], x20\n"
@@ -131,7 +131,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
"ldr x20, [%[a_ptr], #56]\n"
".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
"ins %[a1a].d[1], x20\n"
@@ -139,7 +139,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
"ldr x20, [%[b_ptr], #56]\n"
".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
"ins %[b0].d[1], x20\n"
@@ -155,8 +155,8 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- // Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
".word 0x6f85e048 // udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
"ins %[b1].d[1], x20\n"
@@ -164,7 +164,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
"ldr x20, [%[b_ptr], #88]\n"
".word 0x6f85e84a // udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
".word 0x6fa5e84b // udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
".word 0x6f86e04c // udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
"ins %[b2].d[1], x20\n"
@@ -172,7 +172,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
"ldr x20, [%[a_ptr], #72]\n"
".word 0x6f86e84e // udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
".word 0x6fa6e84f // udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
".word 0x6f85e070 // udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
"ins %[a0].d[1], x20\n"
@@ -180,7 +180,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
"ldr x20, [%[a_ptr], #88]\n"
".word 0x6f85e872 // udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
".word 0x6fa5e873 // udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
".word 0x6f86e074 // udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
"ins %[a1].d[1], x20\n"
@@ -188,7 +188,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
"ldr x20, [%[b_ptr], #104]\n"
".word 0x6f86e876 // udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
".word 0x6fa6e877 // udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
".word 0x6f85e098 // udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
"ins %[b0].d[1], x20\n"
@@ -196,19 +196,19 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
"ldr x20, [%[b_ptr], #120]\n"
".word 0x6f85e89a // udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
".word 0x6fa5e89b // udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
".word 0x6f86e09c // udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
ASM_PREFETCH("[%[b_ptr], #640]")
".word 0x6fa6e09d // udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
".word 0x6f86e89e // udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
"ins %[b1].d[1], x20\n"
".word 0x6fa6e89f // udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
"ldr %d[b2], [%[b_ptr], #32]\n"
".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "b.ne 1b\n"
+ "b.ne 1b\n"
// Branch here if K=1 or 2. Do the right thing for odd/even at the end.
"4:\n"
@@ -221,7 +221,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
// Even K continuation
".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
"ins %[b2].d[1], x20\n"
@@ -230,7 +230,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
ASM_PREFETCHW("[%[c_ptr]]")
".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
"ins %[a0a].d[1], x20\n"
@@ -238,7 +238,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
"ldr x20, [%[a_ptr], #56]\n"
".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
"ins %[a1a].d[1], x20\n"
@@ -253,7 +253,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
ASM_PREFETCHW("[%[c_ptr], #128]")
".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
"ins %[b0].d[1], x20\n"
@@ -262,7 +262,7 @@ void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32
".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
ASM_PREFETCHW("[%[c_ptr], #192]")
".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
".word 0x6f85e048 // udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
"ins %[b1].d[1], x20\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
index 956ad9448e..1c5e8472e6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
@@ -53,63 +53,63 @@ void a64_gemm_u8_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cp
register uint8x16_t a1a asm("v6");
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
// Loop proper
"1:\n"
".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
@@ -120,7 +120,7 @@ void a64_gemm_u8_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cp
".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
@@ -131,19 +131,19 @@ void a64_gemm_u8_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cp
".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
".word 0x6f85e048 // udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
".word 0x6fa5e049 // udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr %q[a0], [%[a_ptr], #64]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
".word 0x6f85e84a // udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
".word 0x6fa5e84b // udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
".word 0x6f86e04c // udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "ldr %q[a1], [%[a_ptr], #80]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
".word 0x6fa6e04d // udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
".word 0x6f86e84e // udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
".word 0x6fa6e84f // udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
".word 0x6f85e070 // udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
".word 0x6fa5e071 // udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
@@ -154,40 +154,40 @@ void a64_gemm_u8_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cp
".word 0x6fa6e075 // udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
".word 0x6f86e876 // udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
".word 0x6fa6e877 // udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
".word 0x6f85e098 // udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
".word 0x6fa5e099 // udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
".word 0x6f85e89a // udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
".word 0x6fa5e89b // udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
".word 0x6f86e09c // udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
".word 0x6fa6e09d // udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "subs %w[k], %w[k], #1\n"
+ "subs %w[k], %w[k], #1\n"
".word 0x6f86e89e // udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
".word 0x6fa6e89f // udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "bne 1b\n"
+ "bne 1b\n"
// Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
"4:\n"
// Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration (even K)
".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
@@ -197,140 +197,140 @@ void a64_gemm_u8_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cp
".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
".word 0x6f85e048 // udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
".word 0x6f85e070 // udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
".word 0x6fa5e049 // udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
+ "str q8, [%[c_ptr], #0]\n"
".word 0x6fa5e071 // udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
+ "str q16, [%[c_ptr], #16]\n"
".word 0x6f85e098 // udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "str q24, [%[c_ptr], #32]\n"
+ "str q24, [%[c_ptr], #32]\n"
".word 0x6fa5e099 // udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
+ "str q9, [%[c_ptr], #48]\n"
".word 0x6f85e84a // udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
+ "str q17, [%[c_ptr], #64]\n"
".word 0x6f85e872 // udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
+ "str q25, [%[c_ptr], #80]\n"
".word 0x6f85e89a // udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
+ "str q10, [%[c_ptr], #96]\n"
".word 0x6fa5e84b // udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
+ "str q18, [%[c_ptr], #112]\n"
".word 0x6fa5e873 // udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
+ "str q26, [%[c_ptr], #128]\n"
".word 0x6fa5e89b // udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
+ "str q11, [%[c_ptr], #144]\n"
".word 0x6f86e04c // udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
+ "str q19, [%[c_ptr], #160]\n"
".word 0x6f86e074 // udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
+ "str q27, [%[c_ptr], #176]\n"
".word 0x6f86e09c // udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q12, [%[c_ptr], #192]\n"
".word 0x6fa6e04d // udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
+ "str q20, [%[c_ptr], #208]\n"
".word 0x6fa6e075 // udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
+ "str q28, [%[c_ptr], #224]\n"
".word 0x6fa6e09d // udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
+ "str q13, [%[c_ptr], #240]\n"
".word 0x6f86e84e // udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
+ "str q21, [%[c_ptr], #256]\n"
".word 0x6f86e876 // udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
+ "str q29, [%[c_ptr], #272]\n"
".word 0x6f86e89e // udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
+ "str q14, [%[c_ptr], #288]\n"
".word 0x6fa6e84f // udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
+ "str q22, [%[c_ptr], #304]\n"
".word 0x6fa6e877 // udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
+ "str q30, [%[c_ptr], #320]\n"
".word 0x6fa6e89f // udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
+ "str q15, [%[c_ptr], #336]\n"
- "b 3f\n"
+ "b 3f\n"
// Detached final iteration (odd K)
"2:\n"
".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
+ "str q8, [%[c_ptr], #0]\n"
".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
+ "str q16, [%[c_ptr], #16]\n"
".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
+ "str q9, [%[c_ptr], #48]\n"
".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
+ "str q17, [%[c_ptr], #64]\n"
".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
+ "str q25, [%[c_ptr], #80]\n"
".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
+ "str q10, [%[c_ptr], #96]\n"
".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
+ "str q18, [%[c_ptr], #112]\n"
".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
+ "str q26, [%[c_ptr], #128]\n"
".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
+ "str q11, [%[c_ptr], #144]\n"
".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
+ "str q19, [%[c_ptr], #160]\n"
".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
+ "str q27, [%[c_ptr], #176]\n"
".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q12, [%[c_ptr], #192]\n"
".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
+ "str q20, [%[c_ptr], #208]\n"
".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
+ "str q28, [%[c_ptr], #224]\n"
".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
+ "str q13, [%[c_ptr], #240]\n"
".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
+ "str q21, [%[c_ptr], #256]\n"
".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
+ "str q29, [%[c_ptr], #272]\n"
".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
+ "str q14, [%[c_ptr], #288]\n"
".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
+ "str q22, [%[c_ptr], #304]\n"
".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
+ "str q30, [%[c_ptr], #320]\n"
".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
+ "str q15, [%[c_ptr], #336]\n"
// Common tail
"3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
index 182d246d7a..63581a6008 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
@@ -52,60 +52,60 @@ void a64_gemm_u8_8x12_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
// Loop proper
"1:\n"
".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
@@ -116,74 +116,74 @@ void a64_gemm_u8_8x12_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t
".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
ASM_PREFETCH("[%[b_ptr], #448]")
".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %q[a0], [%[a_ptr], #32]\n"
+ "ldr %q[a0], [%[a_ptr], #32]\n"
".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[a1], [%[a_ptr], #48]\n"
+ "ldr %q[a1], [%[a_ptr], #48]\n"
".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
ASM_PREFETCH("[%[b_ptr], #512]")
".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "subs %w[k], %w[k], #1\n"
+ "subs %w[k], %w[k], #1\n"
".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %q[a0], [%[a_ptr]]\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "bne 1b\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "bne 1b\n"
// Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
"4:\n"
// Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration (even K)
".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
@@ -193,142 +193,142 @@ void a64_gemm_u8_8x12_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t
".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %q[a0], [%[a_ptr], #-32]\n"
+ "ldr %q[a0], [%[a_ptr], #-32]\n"
".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[a1], [%[a_ptr], #-16]\n"
+ "ldr %q[a1], [%[a_ptr], #-16]\n"
".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
+ "str q8, [%[c_ptr], #0]\n"
".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
+ "str q16, [%[c_ptr], #16]\n"
".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "str q24, [%[c_ptr], #32]\n"
+ "str q24, [%[c_ptr], #32]\n"
".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
+ "str q9, [%[c_ptr], #48]\n"
".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
+ "str q17, [%[c_ptr], #64]\n"
".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
+ "str q25, [%[c_ptr], #80]\n"
".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
+ "str q10, [%[c_ptr], #96]\n"
".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
+ "str q18, [%[c_ptr], #112]\n"
".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
+ "str q26, [%[c_ptr], #128]\n"
".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
+ "str q11, [%[c_ptr], #144]\n"
".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
+ "str q19, [%[c_ptr], #160]\n"
".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
+ "str q27, [%[c_ptr], #176]\n"
".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q12, [%[c_ptr], #192]\n"
".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
+ "str q20, [%[c_ptr], #208]\n"
".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
+ "str q28, [%[c_ptr], #224]\n"
".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
+ "str q13, [%[c_ptr], #240]\n"
".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
+ "str q21, [%[c_ptr], #256]\n"
".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
+ "str q29, [%[c_ptr], #272]\n"
".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
+ "str q14, [%[c_ptr], #288]\n"
".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
+ "str q22, [%[c_ptr], #304]\n"
".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
+ "str q30, [%[c_ptr], #320]\n"
".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
+ "str q15, [%[c_ptr], #336]\n"
- "b 3f\n"
+ "b 3f\n"
// Detached final iteration (odd K)
"2:\n"
".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
+ "str q8, [%[c_ptr], #0]\n"
".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
+ "str q16, [%[c_ptr], #16]\n"
".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
+ "str q9, [%[c_ptr], #48]\n"
".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
+ "str q17, [%[c_ptr], #64]\n"
".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
+ "str q25, [%[c_ptr], #80]\n"
".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
+ "str q10, [%[c_ptr], #96]\n"
".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
+ "str q18, [%[c_ptr], #112]\n"
".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
+ "str q26, [%[c_ptr], #128]\n"
".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
+ "str q11, [%[c_ptr], #144]\n"
".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
+ "str q19, [%[c_ptr], #160]\n"
".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
+ "str q27, [%[c_ptr], #176]\n"
".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q12, [%[c_ptr], #192]\n"
".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
+ "str q20, [%[c_ptr], #208]\n"
".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
+ "str q28, [%[c_ptr], #224]\n"
".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
+ "str q13, [%[c_ptr], #240]\n"
".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
+ "str q21, [%[c_ptr], #256]\n"
".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
+ "str q29, [%[c_ptr], #272]\n"
".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
+ "str q14, [%[c_ptr], #288]\n"
".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
+ "str q22, [%[c_ptr], #304]\n"
".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
+ "str q30, [%[c_ptr], #320]\n"
".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
+ "str q15, [%[c_ptr], #336]\n"
// Common tail
"3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
index 6bc40b4ac8..d9668aae02 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
@@ -41,8 +41,7 @@ void a64_hgemm_asimd_8x24_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int,
// the constructor to pick a kernel implementation).
class cls_a64_hgemm_8x24 {
public:
- typedef __fp16 lhs_operand_type;
- typedef __fp16 rhs_operand_type;
+ typedef __fp16 operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
@@ -61,7 +60,7 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 24> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 24> transforms = {};
template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
index 742b406438..0686589f5c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
@@ -22,8 +22,8 @@
* SOFTWARE.
*/
-// Build on AArch64 where either ENABLE_FP16_KERNELS is set or FP16 is explicitly supported.
-#if defined(__aarch64__) && (defined(ENABLE_FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
#include <arm_neon.h>
@@ -72,311 +72,311 @@ void a64_hgemm_asimd_8x24_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.8h, #0x0\n"
- "ldr %d[a0], [%[a_ptr]]\n"
- "movi v9.8h, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.8h, #0x0\n"
- "ldr %d[a1], [%[a_ptr], #8]\n"
- "movi v11.8h, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.8h, #0x0\n"
- "movi v13.8h, #0x0\n"
+ "movi v8.8h, #0x0\n"
+ "ldr %d[a0], [%[a_ptr]]\n"
+ "movi v9.8h, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.8h, #0x0\n"
+ "ldr %d[a1], [%[a_ptr], #8]\n"
+ "movi v11.8h, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.8h, #0x0\n"
+ "movi v13.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v14.8h, #0x0\n"
- "movi v15.8h, #0x0\n"
+ "movi v14.8h, #0x0\n"
+ "movi v15.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v16.8h, #0x0\n"
- "movi v17.8h, #0x0\n"
+ "movi v16.8h, #0x0\n"
+ "movi v17.8h, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v18.8h, #0x0\n"
- "movi v19.8h, #0x0\n"
+ "movi v18.8h, #0x0\n"
+ "movi v19.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v20.8h, #0x0\n"
- "movi v21.8h, #0x0\n"
+ "movi v20.8h, #0x0\n"
+ "movi v21.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v22.8h, #0x0\n"
- "movi v23.8h, #0x0\n"
+ "movi v22.8h, #0x0\n"
+ "movi v23.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v24.8h, #0x0\n"
- "movi v25.8h, #0x0\n"
- "movi v26.8h, #0x0\n"
- "movi v27.8h, #0x0\n"
- "movi v28.8h, #0x0\n"
- "movi v29.8h, #0x0\n"
- "movi v30.8h, #0x0\n"
- "movi v31.8h, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v25.8h, #0x0\n"
+ "movi v26.8h, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v28.8h, #0x0\n"
+ "movi v29.8h, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v31.8h, #0x0\n"
// The loop is offset by these two instructions which must
// always be executed.
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
"1:\n"
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %d[a0a], [%[a_ptr], #16]\n"
-
- "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
- "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
- "ldr %d[a1a], [%[a_ptr], #24]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #16]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
+ "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #24]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
- "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
+ "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
+ "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
ASM_PREFETCH("[%[b_ptr], #384]")
- "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
- "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
+ "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
+ "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
// Unroll 1
- "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
- "ldr %d[a0], [%[a_ptr], #32]\n"
-
- "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n"
- "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
- "ldr %d[a1], [%[a_ptr], #40]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
- "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
-
- "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
-
- "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n"
+ "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
+ "ldr %d[a0], [%[a_ptr], #32]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n"
+ "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
+ "ldr %d[a1], [%[a_ptr], #40]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
+ "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
+
+ "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n"
ASM_PREFETCH("[%[b_ptr], #448]")
- "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "bne 1b\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "bne 1b\n"
"4:\n"
// Start final iteration - branch off to "odd" code before we load a0a
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "cbnz %w[oddk], 2f\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "cbnz %w[oddk], 2f\n"
// Even K continuation
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %d[a0a], [%[a_ptr], #16]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #16]\n"
- "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
+ "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
ASM_PREFETCHW("[%[c_ptr]]")
- "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
- "ldr %d[a1a], [%[a_ptr], #24]\n"
+ "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #24]\n"
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
ASM_PREFETCHW("[%[c_ptr], #64]")
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
ASM_PREFETCHW("[%[c_ptr], #128]")
- "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
- "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
+ "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
+ "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
ASM_PREFETCHW("[%[c_ptr], #192]")
- "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
- "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
+ "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
+ "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
ASM_PREFETCHW("[%[c_ptr], #256]")
- "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n"
+ "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n"
ASM_PREFETCHW("[%[c_ptr], #320]")
- "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
- "ldr %d[a1], [%[a_ptr], #40]\n"
+ "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
+ "ldr %d[a1], [%[a_ptr], #40]\n"
- "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
+ "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
ASM_PREFETCHWL2("[%[c_ptr], #384]")
- "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
+ "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
ASM_PREFETCHWL2("[%[c_ptr], #448]")
- "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n"
+ "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n"
ASM_PREFETCHWL2("[%[c_ptr], #512]")
- "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n"
+ "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n"
ASM_PREFETCHWL2("[%[c_ptr], #576]")
- "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
+ "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
ASM_PREFETCHWL2("[%[c_ptr], #640]")
- "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
+ "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
ASM_PREFETCHWL2("[%[c_ptr], #704]")
- "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n"
- "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
- "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
- "b 3f\n"
+ "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n"
+ "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
+ "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
+ "b 3f\n"
"2:\n"
// Odd tail
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
ASM_PREFETCHW("[%[c_ptr]]")
- "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
+ "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
ASM_PREFETCHW("[%[c_ptr], #64]")
- "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
- "add %[a_ptr], %[a_ptr], #16\n"
- "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
+ "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
+ "add %[a_ptr], %[a_ptr], #16\n"
+ "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
ASM_PREFETCHW("[%[c_ptr], #128]")
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
ASM_PREFETCHW("[%[c_ptr], #192]")
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
ASM_PREFETCHW("[%[c_ptr], #256]")
- "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
+ "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
ASM_PREFETCHW("[%[c_ptr], #320]")
- "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
+ "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
ASM_PREFETCHWL2("[%[c_ptr], #384]")
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
ASM_PREFETCHWL2("[%[c_ptr], #384]")
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
ASM_PREFETCHWL2("[%[c_ptr], #448]")
- "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
+ "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
ASM_PREFETCHWL2("[%[c_ptr], #512]")
- "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
+ "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
ASM_PREFETCHWL2("[%[c_ptr], #576]")
- "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
+ "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
ASM_PREFETCHWL2("[%[c_ptr], #640]")
- "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
+ "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
ASM_PREFETCHWL2("[%[c_ptr], #704]")
// Common tail
// A55 won't dual issue these stores with anything else, so
// simplest to do them all in this common code.
"3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
"5:\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
[a0] "=w" (a0), [a0a] "=w" (a0a), [a1] "=w" (a1), [a1a] "=w" (a1a),
@@ -391,4 +391,4 @@ void a64_hgemm_asimd_8x24_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp
} // namespace arm_gemm
-#endif // __aarch64__ && (ENABLE_FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
index be97c0d2d7..be92554dce 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
@@ -22,8 +22,8 @@
* SOFTWARE.
*/
-// Build on AArch64 where either ENABLE_FP16_KERNELS is set or FP16 is explicitly supported.
-#if defined(__aarch64__) && (defined(ENABLE_FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
#include <arm_neon.h>
@@ -67,270 +67,270 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.8h, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.8h, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.8h, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v11.8h, #0x0\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "movi v12.8h, #0x0\n"
- "ldr %q[b0a], [%[b_ptr], #48]\n"
- "movi v13.8h, #0x0\n"
- "ldr %q[b1a], [%[b_ptr], #64]\n"
- "movi v14.8h, #0x0\n"
+ "movi v8.8h, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.8h, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.8h, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v11.8h, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v12.8h, #0x0\n"
+ "ldr %q[b0a], [%[b_ptr], #48]\n"
+ "movi v13.8h, #0x0\n"
+ "ldr %q[b1a], [%[b_ptr], #64]\n"
+ "movi v14.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v15.8h, #0x0\n"
+ "movi v15.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v16.8h, #0x0\n"
+ "movi v16.8h, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v17.8h, #0x0\n"
+ "movi v17.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v18.8h, #0x0\n"
+ "movi v18.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v19.8h, #0x0\n"
+ "movi v19.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.8h, #0x0\n"
- "movi v21.8h, #0x0\n"
- "movi v22.8h, #0x0\n"
- "movi v23.8h, #0x0\n"
- "movi v24.8h, #0x0\n"
- "movi v25.8h, #0x0\n"
- "movi v26.8h, #0x0\n"
- "movi v27.8h, #0x0\n"
- "movi v28.8h, #0x0\n"
- "movi v29.8h, #0x0\n"
- "movi v30.8h, #0x0\n"
- "movi v31.8h, #0x0\n"
+ "movi v20.8h, #0x0\n"
+ "movi v21.8h, #0x0\n"
+ "movi v22.8h, #0x0\n"
+ "movi v23.8h, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v25.8h, #0x0\n"
+ "movi v26.8h, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v28.8h, #0x0\n"
+ "movi v29.8h, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v31.8h, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
"1:\n"
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr %q[a0a], [%[a_ptr], #16]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %q[b2a], [%[b_ptr], #80]\n"
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
- "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr %q[a0a], [%[a_ptr], #16]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2a], [%[b_ptr], #80]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
ASM_PREFETCH("[%[b_ptr], #288]")
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
- "ldr %q[a0], [%[a_ptr], #32]\n"
-
- "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
- "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
- "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
- "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
- "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
- "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
- "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
- "ldr %q[b0a], [%[b_ptr], #48]\n"
-
- "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
- "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+ "ldr %q[a0], [%[a_ptr], #32]\n"
+
+ "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+ "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+ "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+ "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+ "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+ "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+ "ldr %q[b0a], [%[b_ptr], #48]\n"
+
+ "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+ "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
ASM_PREFETCH("[%[b_ptr], #352]")
- "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
- "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
- "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
- "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
- "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
- "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
- "ldr %q[b1a], [%[b_ptr], #64]\n"
-
- "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
- "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
- "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
- "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
- "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
- "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
-
- "bne 1b\n"
+ "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+ "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+ "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+ "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+ "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+ "ldr %q[b1a], [%[b_ptr], #64]\n"
+
+ "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+ "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+ "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+ "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+ "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+
+ "bne 1b\n"
"4:\n"
// Jump to odd tail if necessary.
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Even tail.
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
"fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr %q[a0a], [%[a_ptr], #16]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %q[b2a], [%[b_ptr], #80]\n"
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "ldr %q[a0a], [%[a_ptr], #16]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2a], [%[b_ptr], #80]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
"fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
-
- "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
- "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
- "str q8, [%[c_ptr]]\n"
- "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
- "str q16, [%[c_ptr], #16]\n"
-
- "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
- "str q17, [%[c_ptr], #64]\n"
-
- "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
- "str q18, [%[c_ptr], #112]\n"
-
- "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
- "str q19, [%[c_ptr], #160]\n"
-
- "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
- "str q12, [%[c_ptr], #192]\n"
- "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
- "str q20, [%[c_ptr], #208]\n"
-
- "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
- "str q13, [%[c_ptr], #240]\n"
- "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
- "str q21, [%[c_ptr], #256]\n"
-
- "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
- "str q14, [%[c_ptr], #288]\n"
- "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
- "str q22, [%[c_ptr], #304]\n"
-
- "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
- "str q15, [%[c_ptr], #336]\n"
- "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
- "b 3f\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+ "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+ "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+ "b 3f\n"
// Odd tail
"2:\n"
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "add %[a_ptr], %[a_ptr], #16\n"
- "str q8, [%[c_ptr]]\n"
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "str q16, [%[c_ptr], #16]\n"
-
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "str q17, [%[c_ptr], #64]\n"
-
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "str q18, [%[c_ptr], #112]\n"
-
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "str q19, [%[c_ptr], #160]\n"
-
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "str q12, [%[c_ptr], #192]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "str q20, [%[c_ptr], #208]\n"
-
- "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "str q13, [%[c_ptr], #240]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "str q21, [%[c_ptr], #256]\n"
-
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "str q14, [%[c_ptr], #288]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "str q22, [%[c_ptr], #304]\n"
-
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
- "str q15, [%[c_ptr], #336]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #16\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
"3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
[a0] "+w" (a0), [a0a] "+w" (a0a),
@@ -346,4 +346,4 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
} // namespace arm_gemm
-#endif // __aarch64__ && (ENABLE_FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
index 6e9349fac2..9aa5a2a9cc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
@@ -22,8 +22,8 @@
* SOFTWARE.
*/
-// Build on AArch64 where either ENABLE_FP16_KERNELS is set or FP16 is explicitly supported.
-#if defined(__aarch64__) && (defined(ENABLE_FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
#include <arm_neon.h>
@@ -64,271 +64,271 @@ void a64_hgemm_asimd_8x24_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.8h, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.8h, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.8h, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v11.8h, #0x0\n"
- "movi v12.8h, #0x0\n"
- "movi v13.8h, #0x0\n"
- "movi v14.8h, #0x0\n"
+ "movi v8.8h, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.8h, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.8h, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v11.8h, #0x0\n"
+ "movi v12.8h, #0x0\n"
+ "movi v13.8h, #0x0\n"
+ "movi v14.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v15.8h, #0x0\n"
+ "movi v15.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v16.8h, #0x0\n"
+ "movi v16.8h, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v17.8h, #0x0\n"
+ "movi v17.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v18.8h, #0x0\n"
+ "movi v18.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v19.8h, #0x0\n"
+ "movi v19.8h, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.8h, #0x0\n"
- "movi v21.8h, #0x0\n"
- "movi v22.8h, #0x0\n"
- "movi v23.8h, #0x0\n"
- "movi v24.8h, #0x0\n"
- "movi v25.8h, #0x0\n"
- "movi v26.8h, #0x0\n"
- "movi v27.8h, #0x0\n"
- "movi v28.8h, #0x0\n"
- "movi v29.8h, #0x0\n"
- "movi v30.8h, #0x0\n"
- "movi v31.8h, #0x0\n"
+ "movi v20.8h, #0x0\n"
+ "movi v21.8h, #0x0\n"
+ "movi v22.8h, #0x0\n"
+ "movi v23.8h, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v25.8h, #0x0\n"
+ "movi v26.8h, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v28.8h, #0x0\n"
+ "movi v29.8h, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v31.8h, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
"1:\n"
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
- "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
- "ldr %q[b1], [%[b_ptr], #-32]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "ldr %q[b1], [%[b_ptr], #-32]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
ASM_PREFETCH("[%[b_ptr], #288]")
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "ldr %q[a0a], [%[a_ptr], #16]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
- "ldr %q[b2], [%[b_ptr], #-16]\n"
-
- "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
- "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
- "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
- "fmla v12.8h, %[b0].8h, %[a0a].h[4]\n"
- "fmla v13.8h, %[b0].8h, %[a0a].h[5]\n"
- "fmla v14.8h, %[b0].8h, %[a0a].h[6]\n"
- "fmla v15.8h, %[b0].8h, %[a0a].h[7]\n"
- "ldr %q[b0], [%[b_ptr]]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #16]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+ "ldr %q[b2], [%[b_ptr], #-16]\n"
+
+ "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
+ "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
+ "fmla v12.8h, %[b0].8h, %[a0a].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0a].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0a].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0a].h[7]\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
ASM_PREFETCH("[%[b_ptr], #352]")
- "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
- "fmla v20.8h, %[b1].8h, %[a0a].h[4]\n"
- "fmla v21.8h, %[b1].8h, %[a0a].h[5]\n"
- "fmla v22.8h, %[b1].8h, %[a0a].h[6]\n"
- "fmla v23.8h, %[b1].8h, %[a0a].h[7]\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "fmla v28.8h, %[b2].8h, %[a0a].h[4]\n"
- "fmla v29.8h, %[b2].8h, %[a0a].h[5]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v30.8h, %[b2].8h, %[a0a].h[6]\n"
- "fmla v31.8h, %[b2].8h, %[a0a].h[7]\n"
-
- "bne 1b\n"
+ "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
+ "fmla v20.8h, %[b1].8h, %[a0a].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0a].h[5]\n"
+ "fmla v22.8h, %[b1].8h, %[a0a].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0a].h[7]\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "fmla v28.8h, %[b2].8h, %[a0a].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0a].h[5]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.8h, %[b2].8h, %[a0a].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0a].h[7]\n"
+
+ "bne 1b\n"
"4:\n"
// Jump to odd tail if necessary.
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Even tail.
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
"fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
"fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
- "ldr %q[b1], [%[b_ptr], #-32]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "ldr %q[a0a], [%[a_ptr], #-16]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
- "ldr %q[b2], [%[b_ptr], #-16]\n"
-
- "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
- "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
- "str q8, [%[c_ptr]]\n"
- "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
- "str q16, [%[c_ptr], #16]\n"
-
- "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
- "str q17, [%[c_ptr], #64]\n"
-
- "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
- "str q18, [%[c_ptr], #112]\n"
-
- "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
- "str q19, [%[c_ptr], #160]\n"
-
- "fmla v12.8h, %[b0].8h, %[a0a].h[4]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v20.8h, %[b1].8h, %[a0a].h[4]\n"
- "str q12, [%[c_ptr], #192]\n"
- "fmla v28.8h, %[b2].8h, %[a0a].h[4]\n"
- "str q20, [%[c_ptr], #208]\n"
-
- "fmla v13.8h, %[b0].8h, %[a0a].h[5]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v21.8h, %[b1].8h, %[a0a].h[5]\n"
- "str q13, [%[c_ptr], #240]\n"
- "fmla v29.8h, %[b2].8h, %[a0a].h[5]\n"
- "str q21, [%[c_ptr], #256]\n"
-
- "fmla v14.8h, %[b0].8h, %[a0a].h[6]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v22.8h, %[b1].8h, %[a0a].h[6]\n"
- "str q14, [%[c_ptr], #288]\n"
- "fmla v30.8h, %[b2].8h, %[a0a].h[6]\n"
- "str q22, [%[c_ptr], #304]\n"
-
- "fmla v15.8h, %[b0].8h, %[a0a].h[7]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v23.8h, %[b1].8h, %[a0a].h[7]\n"
- "str q15, [%[c_ptr], #336]\n"
- "fmla v31.8h, %[b2].8h, %[a0a].h[7]\n"
- "b 3f\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "ldr %q[b1], [%[b_ptr], #-32]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #-16]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+ "ldr %q[b2], [%[b_ptr], #-16]\n"
+
+ "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
+ "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a0a].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1].8h, %[a0a].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2].8h, %[a0a].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0].8h, %[a0a].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1].8h, %[a0a].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2].8h, %[a0a].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0].8h, %[a0a].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1].8h, %[a0a].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2].8h, %[a0a].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0].8h, %[a0a].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1].8h, %[a0a].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2].8h, %[a0a].h[7]\n"
+ "b 3f\n"
// Odd tail
"2:\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "add %[a_ptr], %[a_ptr], #16\n"
- "str q8, [%[c_ptr]]\n"
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "str q16, [%[c_ptr], #16]\n"
-
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "str q17, [%[c_ptr], #64]\n"
-
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "str q18, [%[c_ptr], #112]\n"
-
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "str q19, [%[c_ptr], #160]\n"
-
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "str q12, [%[c_ptr], #192]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "str q20, [%[c_ptr], #208]\n"
-
- "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "str q13, [%[c_ptr], #240]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "str q21, [%[c_ptr], #256]\n"
-
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "str q14, [%[c_ptr], #288]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "str q22, [%[c_ptr], #304]\n"
-
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
- "str q15, [%[c_ptr], #336]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #16\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
"3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
[a0] "+w" (a0), [a0a] "+w" (a0a),
@@ -343,4 +343,4 @@ void a64_hgemm_asimd_8x24_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16
} // namespace arm_gemm
-#endif // __aarch64__ && (ENABLE_FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
index 278d869afb..22a80885b3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 2> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 2> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
index 4494e2ac13..10f5dd4716 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
@@ -48,19 +48,18 @@ void a64_hybrid_bf16fp32_dot_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -81,7 +80,6 @@ void a64_hybrid_bf16fp32_dot_6x16 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -104,10 +102,10 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"cmp %x[M], #0x2\n"
"bgt 71f\n"
"beq 36f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x12, 3f\n"
"ldr q8, [x12, #0x0]\n"
@@ -190,8 +188,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"mov x28, #0x0\n"
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -215,10 +213,6 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr q17, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
"ldr q16, [x10, #0x30]\n"
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
"ldr q17, [x10, #0x40]\n"
".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
@@ -243,21 +237,22 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr q17, [x10, #0xe0]\n"
".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n"
"ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n"
- "ldr q6, [x10, #0x0]\n"
".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n"
"ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "add x10, x10, #0x100\n"
+ "ldr q6, [x10, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
"ldr q17, [x10, #0x40]\n"
".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
@@ -282,26 +277,29 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr q17, [x10, #0xe0]\n"
".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n"
"ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x8\n"
".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n"
".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
"20:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 24f\n"
"cmp x27, #0x2\n"
"blt 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
"ldr s18, [x26], #0x4\n"
- "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ ".inst 0x4f52f208 // bfdot v8.4s, v16.8h, v18.h[0]\n"
"sub x27, x27, #0x2\n"
"ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x2\n"
- ".inst 0x4f52f228 // bfdot v8.4s, v17.8h, v18.h[0]\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x4f52f209 // bfdot v9.4s, v16.8h, v18.h[0]\n"
+ "cmp x27, #0x2\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x4f52f22a // bfdot v10.4s, v17.8h, v18.h[0]\n"
".inst 0x4f52f20b // bfdot v11.4s, v16.8h, v18.h[0]\n"
+ "add x10, x10, #0x40\n"
"bge 21b\n"
"22:" // Height 1: Multiply loop: Skip odd blocks
"cbz x27, 24f\n"
@@ -310,12 +308,12 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr q17, [x10, #0x0]\n"
"ldr q16, [x10, #0x10]\n"
".inst 0x4f40f228 // bfdot v8.4s, v17.8h, v0.h[0]\n"
- "ldr q17, [x10, #0x20]\n"
".inst 0x4f40f209 // bfdot v9.4s, v16.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x20]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ "add x10, x10, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -323,9 +321,9 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"bne 15b\n"
"prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v17.4s\n"
"fmin v9.4s, v9.4s, v17.4s\n"
@@ -395,95 +393,95 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"bgt 2b\n"
"b 212f\n"
"36:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"37:" // Height 2: Column loop
"cbz x12, 38f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "add x12, x12, #0x40\n"
"b 49f\n"
"38:" // Height 2: no bias
"tbz %x[flags], #0, 48f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"bge 47f\n"
"tbz x11, #3, 42f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
"tbz x11, #2, 40f\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
"tbz x11, #1, 39f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
"tbz x11, #0, 46f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x25]\n"
"b 46f\n"
"39:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 46f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
"b 46f\n"
"40:" // Height 2: Partial accumulate: partial_2_8
"tbz x11, #1, 41f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
"tbz x11, #0, 46f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x25]\n"
"b 46f\n"
"41:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 46f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
"b 46f\n"
"42:" // Height 2: Partial accumulate: partial_4_0
"tbz x11, #2, 44f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"tbz x11, #1, 43f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
"tbz x11, #0, 46f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v13.s }[2], [x25]\n"
"b 46f\n"
"43:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 46f\n"
"ldr s9, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
"b 46f\n"
"44:" // Height 2: Partial accumulate: partial_2_0
"tbz x11, #1, 45f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
"tbz x11, #0, 46f\n"
"ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v12.s }[2], [x25]\n"
"b 46f\n"
"45:" // Height 2: Partial accumulate: partial_1_0
"ldr s8, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
"46:" // Height 2: Partial accumulate: Done
"sub x9, x9, x20\n"
@@ -493,10 +491,10 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"b 49f\n"
"48:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -511,8 +509,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"mov x28, #0x0\n"
"50:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 51f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -544,22 +542,22 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
"ldr q16, [x10, #0x30]\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n"
"ldr q17, [x10, #0x40]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n"
"ldr q16, [x10, #0x50]\n"
+ "cmp x27, #0x10\n"
".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n"
".inst 0x4f61f22c // bfdot v12.4s, v17.8h, v1.h[1]\n"
"ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n"
".inst 0x4f61f20d // bfdot v13.4s, v16.8h, v1.h[1]\n"
"ldr q16, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n"
".inst 0x4f61f22e // bfdot v14.4s, v17.8h, v1.h[1]\n"
"ldr q17, [x10, #0x80]\n"
@@ -603,18 +601,18 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
"ldr q16, [x10, #0x30]\n"
"add x25, x25, #0x10\n"
- "sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n"
"ldr q17, [x10, #0x40]\n"
+ "sub x27, x27, #0x8\n"
".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n"
"ldr q16, [x10, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n"
".inst 0x4f61f22c // bfdot v12.4s, v17.8h, v1.h[1]\n"
"ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n"
".inst 0x4f61f20d // bfdot v13.4s, v16.8h, v1.h[1]\n"
"ldr q16, [x10, #0x70]\n"
@@ -655,18 +653,18 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr s19, [x26], #0x4\n"
"ldr s18, [x25], #0x4\n"
"sub x27, x27, #0x2\n"
+ "cmp x27, #0x2\n"
"ldr q17, [x10, #0x0]\n"
"ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x2\n"
".inst 0x4f53f228 // bfdot v8.4s, v17.8h, v19.h[0]\n"
".inst 0x4f52f22c // bfdot v12.4s, v17.8h, v18.h[0]\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x4f53f209 // bfdot v9.4s, v16.8h, v19.h[0]\n"
".inst 0x4f52f20d // bfdot v13.4s, v16.8h, v18.h[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x4f53f22a // bfdot v10.4s, v17.8h, v19.h[0]\n"
".inst 0x4f52f22e // bfdot v14.4s, v17.8h, v18.h[0]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f53f20b // bfdot v11.4s, v16.8h, v19.h[0]\n"
".inst 0x4f52f20f // bfdot v15.4s, v16.8h, v18.h[0]\n"
"bge 56b\n"
@@ -683,9 +681,9 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f40f209 // bfdot v9.4s, v16.8h, v0.h[0]\n"
".inst 0x4f41f20d // bfdot v13.4s, v16.8h, v1.h[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n"
"59:" // Height 2: Multiply loop: No odd multiplies
@@ -694,13 +692,13 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"cmp x28, x20\n"
"bne 50b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 60f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v17.4s\n"
"fmin v9.4s, v9.4s, v17.4s\n"
@@ -724,63 +722,63 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"tbz x11, #3, 64f\n"
"st1 { v8.4s }, [x9], #0x10\n"
"st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
"tbz x11, #2, 62f\n"
"st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
"tbz x11, #1, 61f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
+ "str d15, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
"st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x26]\n"
+ "st1 { v15.s }[2], [x25]\n"
"b 68f\n"
"61:" // Height 2: Partial direct writeback: partial_1_12
"tbz x11, #0, 68f\n"
"str s11, [x9, #0x0]\n"
- "str s15, [x26, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
"b 68f\n"
"62:" // Height 2: Partial direct writeback: partial_2_8
"tbz x11, #1, 63f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
+ "str d14, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
"st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x26]\n"
+ "st1 { v14.s }[2], [x25]\n"
"b 68f\n"
"63:" // Height 2: Partial direct writeback: partial_1_8
"tbz x11, #0, 68f\n"
"str s10, [x9, #0x0]\n"
- "str s14, [x26, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
"b 68f\n"
"64:" // Height 2: Partial direct writeback: partial_4_0
"tbz x11, #2, 66f\n"
"st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
"tbz x11, #1, 65f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
+ "str d13, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
"st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x26]\n"
+ "st1 { v13.s }[2], [x25]\n"
"b 68f\n"
"65:" // Height 2: Partial direct writeback: partial_1_4
"tbz x11, #0, 68f\n"
"str s9, [x9, #0x0]\n"
- "str s13, [x26, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
"b 68f\n"
"66:" // Height 2: Partial direct writeback: partial_2_0
"tbz x11, #1, 67f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
"st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x26]\n"
+ "st1 { v12.s }[2], [x25]\n"
"b 68f\n"
"67:" // Height 2: Partial direct writeback: partial_1_0
"str s8, [x9, #0x0]\n"
- "str s12, [x26, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
"68:" // Height 2: Partial direct writeback: Done
"b 70f\n"
"69:" // Height 2: Full writeback
@@ -789,126 +787,126 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
"70:" // Height 2: Writeback done
"subs x11, x11, #0x10\n"
"bgt 37b\n"
"b 212f\n"
"71:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"72:" // Height 3: Column loop
"cbz x12, 73f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 84f\n"
"73:" // Height 3: no bias
"tbz %x[flags], #0, 83f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 82f\n"
"tbz x11, #3, 77f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #2, 75f\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #1, 74f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
"b 81f\n"
"74:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 81f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
"b 81f\n"
"75:" // Height 3: Partial accumulate: partial_2_8
"tbz x11, #1, 76f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
"b 81f\n"
"76:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 81f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
"b 81f\n"
"77:" // Height 3: Partial accumulate: partial_4_0
"tbz x11, #2, 79f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
"tbz x11, #1, 78f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 81f\n"
"78:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 81f\n"
"ldr s9, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
"b 81f\n"
"79:" // Height 3: Partial accumulate: partial_2_0
"tbz x11, #1, 80f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
"ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
"b 81f\n"
"80:" // Height 3: Partial accumulate: partial_1_0
"ldr s8, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
"81:" // Height 3: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 84f\n"
@@ -917,14 +915,14 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
"b 84f\n"
"83:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -943,8 +941,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"mov x28, #0x0\n"
"85:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 86f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -984,18 +982,18 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
"ldr q20, [x10, #0x30]\n"
"add x24, x24, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n"
".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n"
"ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n"
".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n"
"ldr q20, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f60f2a8 // bfdot v8.4s, v21.8h, v0.h[1]\n"
".inst 0x4f61f2ac // bfdot v12.4s, v21.8h, v1.h[1]\n"
".inst 0x4f62f2b0 // bfdot v16.4s, v21.8h, v2.h[1]\n"
@@ -1062,14 +1060,14 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
"ldr q20, [x10, #0x30]\n"
"sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n"
".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n"
"ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n"
".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n"
"ldr q20, [x10, #0x50]\n"
@@ -1128,12 +1126,12 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr s24, [x26], #0x4\n"
"ldr s23, [x25], #0x4\n"
"sub x27, x27, #0x2\n"
+ "cmp x27, #0x2\n"
"ldr s22, [x24], #0x4\n"
"ldr q21, [x10, #0x0]\n"
- "cmp x27, #0x2\n"
- "ldr q20, [x10, #0x10]\n"
".inst 0x4f58f2a8 // bfdot v8.4s, v21.8h, v24.h[0]\n"
".inst 0x4f57f2ac // bfdot v12.4s, v21.8h, v23.h[0]\n"
+ "ldr q20, [x10, #0x10]\n"
".inst 0x4f56f2b0 // bfdot v16.4s, v21.8h, v22.h[0]\n"
"ldr q21, [x10, #0x20]\n"
".inst 0x4f58f289 // bfdot v9.4s, v20.8h, v24.h[0]\n"
@@ -1177,15 +1175,15 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"cmp x28, x20\n"
"bne 85b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 95f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v21.4s }, [x21]\n"
"ld1r { v20.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v21.4s\n"
"fmin v9.4s, v9.4s, v21.4s\n"
@@ -1217,79 +1215,79 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"tbz x11, #3, 99f\n"
"st1 { v8.4s }, [x9], #0x10\n"
"st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #2, 97f\n"
"st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #1, 96f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
"st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
"b 103f\n"
"96:" // Height 3: Partial direct writeback: partial_1_12
"tbz x11, #0, 103f\n"
"str s11, [x9, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
"b 103f\n"
"97:" // Height 3: Partial direct writeback: partial_2_8
"tbz x11, #1, 98f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
"st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
"b 103f\n"
"98:" // Height 3: Partial direct writeback: partial_1_8
"tbz x11, #0, 103f\n"
"str s10, [x9, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
"b 103f\n"
"99:" // Height 3: Partial direct writeback: partial_4_0
"tbz x11, #2, 101f\n"
"st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
"tbz x11, #1, 100f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
"st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
"b 103f\n"
"100:" // Height 3: Partial direct writeback: partial_1_4
"tbz x11, #0, 103f\n"
"str s9, [x9, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
"b 103f\n"
"101:" // Height 3: Partial direct writeback: partial_2_0
"tbz x11, #1, 102f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
"st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
"b 103f\n"
"102:" // Height 3: Partial direct writeback: partial_1_0
"str s8, [x9, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
"103:" // Height 3: Partial direct writeback: Done
"b 105f\n"
"104:" // Height 3: Full writeback
@@ -1298,39 +1296,39 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"105:" // Height 3: Writeback done
"subs x11, x11, #0x10\n"
"bgt 72b\n"
"b 212f\n"
"106:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"107:" // Height 4: Column loop
"cbz x12, 108f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -1338,111 +1336,111 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"108:" // Height 4: no bias
"tbz %x[flags], #0, 118f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 117f\n"
"tbz x11, #3, 112f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
"tbz x11, #2, 110f\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
"tbz x11, #1, 109f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
"b 116f\n"
"109:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 116f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
"b 116f\n"
"110:" // Height 4: Partial accumulate: partial_2_8
"tbz x11, #1, 111f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
"b 116f\n"
"111:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 116f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
"b 116f\n"
"112:" // Height 4: Partial accumulate: partial_4_0
"tbz x11, #2, 114f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"tbz x11, #1, 113f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
"b 116f\n"
"113:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 116f\n"
"ldr s9, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
"b 116f\n"
"114:" // Height 4: Partial accumulate: partial_2_0
"tbz x11, #1, 115f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
"ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
"b 116f\n"
"115:" // Height 4: Partial accumulate: partial_1_0
"ldr s8, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
"116:" // Height 4: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 119f\n"
@@ -1451,18 +1449,18 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"b 119f\n"
"118:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -1485,8 +1483,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"mov x28, #0x0\n"
"120:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 121f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1626,14 +1624,14 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
"add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x8\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
"ldr q24, [x10, #0x30]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f40f32a // bfdot v10.4s, v25.8h, v0.h[0]\n"
".inst 0x4f41f32e // bfdot v14.4s, v25.8h, v1.h[0]\n"
- "sub x27, x27, #0x8\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f42f332 // bfdot v18.4s, v25.8h, v2.h[0]\n"
".inst 0x4f43f336 // bfdot v22.4s, v25.8h, v3.h[0]\n"
@@ -1711,9 +1709,9 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr s29, [x26], #0x4\n"
"ldr s28, [x25], #0x4\n"
"sub x27, x27, #0x2\n"
+ "cmp x27, #0x2\n"
"ldr s27, [x24], #0x4\n"
"ldr s26, [x23], #0x4\n"
- "cmp x27, #0x2\n"
"ldr q25, [x10, #0x0]\n"
"ldr q24, [x10, #0x10]\n"
".inst 0x4f5df328 // bfdot v8.4s, v25.8h, v29.h[0]\n"
@@ -1770,17 +1768,17 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"cmp x28, x20\n"
"bne 120b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 130f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v25.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v25.4s }, [x21]\n"
"ld1r { v24.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v25.4s\n"
"fmin v9.4s, v9.4s, v25.4s\n"
@@ -1820,95 +1818,95 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"tbz x11, #3, 134f\n"
"st1 { v8.4s }, [x9], #0x10\n"
"st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
"tbz x11, #2, 132f\n"
"st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
"tbz x11, #1, 131f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
"st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
"b 138f\n"
"131:" // Height 4: Partial direct writeback: partial_1_12
"tbz x11, #0, 138f\n"
"str s11, [x9, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
"b 138f\n"
"132:" // Height 4: Partial direct writeback: partial_2_8
"tbz x11, #1, 133f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
"st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
"b 138f\n"
"133:" // Height 4: Partial direct writeback: partial_1_8
"tbz x11, #0, 138f\n"
"str s10, [x9, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
"b 138f\n"
"134:" // Height 4: Partial direct writeback: partial_4_0
"tbz x11, #2, 136f\n"
"st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
"tbz x11, #1, 135f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
"st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
"b 138f\n"
"135:" // Height 4: Partial direct writeback: partial_1_4
"tbz x11, #0, 138f\n"
"str s9, [x9, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
"b 138f\n"
"136:" // Height 4: Partial direct writeback: partial_2_0
"tbz x11, #1, 137f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
"st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
"b 138f\n"
"137:" // Height 4: Partial direct writeback: partial_1_0
"str s8, [x9, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
"138:" // Height 4: Partial direct writeback: Done
"b 140f\n"
"139:" // Height 4: Full writeback
@@ -1917,43 +1915,43 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
"140:" // Height 4: Writeback done
"subs x11, x11, #0x10\n"
"bgt 107b\n"
"b 212f\n"
"141:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"142:" // Height 5: Column loop
"cbz x12, 143f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -1965,128 +1963,128 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"143:" // Height 5: no bias
"tbz %x[flags], #0, 153f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 152f\n"
"tbz x11, #3, 147f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #2, 145f\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #1, 144f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
"b 151f\n"
"144:" // Height 5: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 151f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
"b 151f\n"
"145:" // Height 5: Partial accumulate: partial_2_8
"tbz x11, #1, 146f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
"b 151f\n"
"146:" // Height 5: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 151f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
"b 151f\n"
"147:" // Height 5: Partial accumulate: partial_4_0
"tbz x11, #2, 149f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"tbz x11, #1, 148f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 151f\n"
"148:" // Height 5: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 151f\n"
"ldr s9, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"b 151f\n"
"149:" // Height 5: Partial accumulate: partial_2_0
"tbz x11, #1, 150f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
"ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
"b 151f\n"
"150:" // Height 5: Partial accumulate: partial_1_0
"ldr s8, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
"151:" // Height 5: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 154f\n"
@@ -2095,22 +2093,22 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
"b 154f\n"
"153:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
@@ -2137,8 +2135,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"mov x28, #0x0\n"
"155:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 156f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2304,12 +2302,12 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"add x22, x22, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "sub x27, x27, #0x8\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
"ldr q28, [x10, #0x30]\n"
- "sub x27, x27, #0x8\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f40f3aa // bfdot v10.4s, v29.8h, v0.h[0]\n"
".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
@@ -2404,14 +2402,14 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr s2, [x26], #0x4\n"
"ldr s1, [x25], #0x4\n"
"sub x27, x27, #0x2\n"
+ "cmp x27, #0x2\n"
"ldr s0, [x24], #0x4\n"
"ldr s31, [x23], #0x4\n"
- "cmp x27, #0x2\n"
"ldr s30, [x22], #0x4\n"
"ldr q29, [x10, #0x0]\n"
- "ldr q28, [x10, #0x10]\n"
".inst 0x4f42f3a8 // bfdot v8.4s, v29.8h, v2.h[0]\n"
".inst 0x4f41f3ac // bfdot v12.4s, v29.8h, v1.h[0]\n"
+ "ldr q28, [x10, #0x10]\n"
".inst 0x4f40f3b0 // bfdot v16.4s, v29.8h, v0.h[0]\n"
".inst 0x4f5ff3b4 // bfdot v20.4s, v29.8h, v31.h[0]\n"
".inst 0x4f5ef3b8 // bfdot v24.4s, v29.8h, v30.h[0]\n"
@@ -2473,19 +2471,19 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"cmp x28, x20\n"
"bne 155b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 165f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v29.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v29.4s }, [x21]\n"
"ld1r { v28.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v29.4s\n"
"fmin v9.4s, v9.4s, v29.4s\n"
@@ -2533,111 +2531,111 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"tbz x11, #3, 169f\n"
"st1 { v8.4s }, [x9], #0x10\n"
"st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #2, 167f\n"
"st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #1, 166f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
"st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
"b 173f\n"
"166:" // Height 5: Partial direct writeback: partial_1_12
"tbz x11, #0, 173f\n"
"str s11, [x9, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
"b 173f\n"
"167:" // Height 5: Partial direct writeback: partial_2_8
"tbz x11, #1, 168f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
"st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
"b 173f\n"
"168:" // Height 5: Partial direct writeback: partial_1_8
"tbz x11, #0, 173f\n"
"str s10, [x9, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
"b 173f\n"
"169:" // Height 5: Partial direct writeback: partial_4_0
"tbz x11, #2, 171f\n"
"st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x11, #1, 170f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
"st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 173f\n"
"170:" // Height 5: Partial direct writeback: partial_1_4
"tbz x11, #0, 173f\n"
"str s9, [x9, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 173f\n"
"171:" // Height 5: Partial direct writeback: partial_2_0
"tbz x11, #1, 172f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
"st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 173f\n"
"172:" // Height 5: Partial direct writeback: partial_1_0
"str s8, [x9, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"173:" // Height 5: Partial direct writeback: Done
"b 175f\n"
"174:" // Height 5: Full writeback
@@ -2646,51 +2644,50 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"175:" // Height 5: Writeback done
"subs x11, x11, #0x10\n"
"bgt 142b\n"
"b 212f\n"
"176:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"177:" // Height 6: Column loop
"cbz x12, 178f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -2706,145 +2703,145 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"178:" // Height 6: no bias
"tbz %x[flags], #0, 188f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 187f\n"
"tbz x11, #3, 182f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x11, #2, 180f\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x11, #1, 179f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 186f\n"
"179:" // Height 6: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 186f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 186f\n"
"180:" // Height 6: Partial accumulate: partial_2_8
"tbz x11, #1, 181f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 186f\n"
"181:" // Height 6: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 186f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 186f\n"
"182:" // Height 6: Partial accumulate: partial_4_0
"tbz x11, #2, 184f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x11, #1, 183f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 186f\n"
"183:" // Height 6: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 186f\n"
"ldr s9, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 186f\n"
"184:" // Height 6: Partial accumulate: partial_2_0
"tbz x11, #1, 185f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
"ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 186f\n"
"185:" // Height 6: Partial accumulate: partial_1_0
"ldr s8, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"186:" // Height 6: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 189f\n"
@@ -2853,26 +2850,26 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x22, #0x0]\n"
- "ldr q29, [x22, #0x10]\n"
- "ldr q30, [x22, #0x20]\n"
- "ldr q31, [x22, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"b 189f\n"
"188:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
@@ -2903,8 +2900,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"mov x28, #0x0\n"
"190:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 191f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -3094,18 +3091,18 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
"add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x8\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
"ldr q7, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- "sub x27, x27, #0x8\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
@@ -3211,9 +3208,9 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr s7, [x26], #0x4\n"
"ldr s6, [x25], #0x4\n"
"sub x27, x27, #0x2\n"
+ "cmp x27, #0x2\n"
"ldr s5, [x24], #0x4\n"
"ldr s4, [x23], #0x4\n"
- "cmp x27, #0x2\n"
"ldr s3, [x22], #0x4\n"
"ldr s2, [x21], #0x4\n"
"ldr q1, [x10, #0x0]\n"
@@ -3290,21 +3287,21 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"cmp x28, x20\n"
"bne 190b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x21, x22, x20, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 200f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v1.4s\n"
"fmin v9.4s, v9.4s, v1.4s\n"
@@ -3360,127 +3357,127 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"tbz x11, #3, 204f\n"
"st1 { v8.4s }, [x9], #0x10\n"
"st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
- "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
"tbz x11, #2, 202f\n"
"st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v30.4s }, [x22], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
"tbz x11, #1, 201f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
"st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x22]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
"b 208f\n"
"201:" // Height 6: Partial direct writeback: partial_1_12
"tbz x11, #0, 208f\n"
"str s11, [x9, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "str s31, [x22, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
"b 208f\n"
"202:" // Height 6: Partial direct writeback: partial_2_8
"tbz x11, #1, 203f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
"st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x22]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
"b 208f\n"
"203:" // Height 6: Partial direct writeback: partial_1_8
"tbz x11, #0, 208f\n"
"str s10, [x9, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "str s30, [x22, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
"b 208f\n"
"204:" // Height 6: Partial direct writeback: partial_4_0
"tbz x11, #2, 206f\n"
"st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
"tbz x11, #1, 205f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
"st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x22]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
"b 208f\n"
"205:" // Height 6: Partial direct writeback: partial_1_4
"tbz x11, #0, 208f\n"
"str s9, [x9, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s29, [x22, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
"b 208f\n"
"206:" // Height 6: Partial direct writeback: partial_2_0
"tbz x11, #1, 207f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
"st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x22]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
"b 208f\n"
"207:" // Height 6: Partial direct writeback: partial_1_0
"str s8, [x9, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "str s28, [x22, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
"208:" // Height 6: Partial direct writeback: Done
"b 210f\n"
"209:" // Height 6: Full writeback
@@ -3489,26 +3486,26 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x22, #0x0]\n"
- "str q29, [x22, #0x10]\n"
- "str q30, [x22, #0x20]\n"
- "str q31, [x22, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
"210:" // Height 6: Writeback done
"subs x11, x11, #0x10\n"
"bgt 177b\n"
@@ -3524,8 +3521,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"212:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
index ee57113f9b..e0a1ed62d9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
index 47a85803d0..a804d5c102 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
@@ -48,19 +48,18 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -81,7 +80,6 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -104,23 +102,23 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp %x[M], #0x2\n"
"bgt 75f\n"
"beq 38f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x12, 3f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"b 15f\n"
@@ -211,8 +209,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"16:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -232,12 +230,7 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 20f\n"
"19:" // Height 1: Multiply loop: Main loop head
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"trn1 v20.2d, v1.2d, v21.2d\n"
- "trn2 v1.2d, v1.2d, v21.2d\n"
".inst 0x6e47ee88 // bfmmla v8.4s, v20.8h, v7.8h\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x6e46ee8c // bfmmla v12.4s, v20.8h, v6.8h\n"
@@ -250,6 +243,7 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e51ee8e // bfmmla v14.4s, v20.8h, v17.8h\n"
"ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
".inst 0x6e52ee8b // bfmmla v11.4s, v20.8h, v18.8h\n"
"ldr q18, [x10, #0x80]\n"
".inst 0x6e51ee8f // bfmmla v15.4s, v20.8h, v17.8h\n"
@@ -266,19 +260,19 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "cmp x27, #0x10\n"
".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
- "ldr q7, [x10, #0x0]\n"
".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"ldr q1, [x26, #0x0]\n"
+ "add x10, x10, #0x100\n"
+ "ldr q7, [x10, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 19b\n"
"20:" // Height 1: Multiply loop: Single iteration only
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "trn1 v19.2d, v1.2d, v17.2d\n"
- "trn2 v1.2d, v1.2d, v17.2d\n"
+ "trn1 v19.2d, v1.2d, v20.2d\n"
".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
@@ -290,16 +284,17 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
"ldr q17, [x10, #0x60]\n"
".inst 0x6e52ee6e // bfmmla v14.4s, v19.8h, v18.8h\n"
- "ldr q25, [x10, #0x70]\n"
+ "ldr q24, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
".inst 0x6e51ee6b // bfmmla v11.4s, v19.8h, v17.8h\n"
"ldr q17, [x10, #0x80]\n"
- ".inst 0x6e59ee6f // bfmmla v15.4s, v19.8h, v25.8h\n"
- "ldr q3, [x10, #0x90]\n"
+ ".inst 0x6e58ee6f // bfmmla v15.4s, v19.8h, v24.8h\n"
+ "ldr q2, [x10, #0x90]\n"
".inst 0x6e51ec28 // bfmmla v8.4s, v1.8h, v17.8h\n"
- "ldr q19, [x10, #0xa0]\n"
- ".inst 0x6e43ec2c // bfmmla v12.4s, v1.8h, v3.8h\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
"ldr q17, [x10, #0xb0]\n"
- ".inst 0x6e53ec29 // bfmmla v9.4s, v1.8h, v19.8h\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
"ldr q18, [x10, #0xc0]\n"
".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
"ldr q17, [x10, #0xd0]\n"
@@ -307,21 +302,22 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
"21:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 26f\n"
"cmp x27, #0x4\n"
"blt 23f\n"
"22:" // Height 1: Multiply loop: Odd block loop
"ldr d19, [x26], #0x8\n"
- "ldr q20, [x10, #0x0]\n"
- "sub x27, x27, #0x4\n"
+ "ldr q18, [x10, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
"ldr q17, [x10, #0x10]\n"
- "cmp x27, #0x4\n"
- "trn1 v19.2d, v19.2d, v18.2d\n"
- ".inst 0x6e54ee68 // bfmmla v8.4s, v19.8h, v20.8h\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
"ldr q17, [x10, #0x30]\n"
@@ -333,9 +329,11 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
"bge 22b\n"
"23:" // Height 1: Multiply loop: Skip odd blocks
"cbz x27, 26f\n"
@@ -362,9 +360,9 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e42ee6e // bfmmla v14.4s, v19.8h, v2.8h\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
"26:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -376,9 +374,9 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"uzp1 v10.2d, v10.2d, v14.2d\n"
"uzp1 v11.2d, v11.2d, v15.2d\n"
"tbz %x[flags], #1, 27f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v18.4s }, [x21]\n"
"ld1r { v17.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v18.4s\n"
"fmin v9.4s, v9.4s, v18.4s\n"
@@ -448,23 +446,23 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"bgt 2b\n"
"b 224f\n"
"38:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"39:" // Height 2: Column loop
"cbz x12, 40f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"b 52f\n"
@@ -472,75 +470,75 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"tbz %x[flags], #0, 51f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"bge 49f\n"
"tbz x11, #3, 44f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
"tbz x11, #2, 42f\n"
"ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
"tbz x11, #1, 41f\n"
"ldr d16, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
"tbz x11, #0, 48f\n"
"ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x25]\n"
"b 48f\n"
"41:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 48f\n"
"ldr s16, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
"b 48f\n"
"42:" // Height 2: Partial accumulate: partial_2_8
"tbz x11, #1, 43f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
"tbz x11, #0, 48f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x25]\n"
"b 48f\n"
"43:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 48f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
"b 48f\n"
"44:" // Height 2: Partial accumulate: partial_4_0
"tbz x11, #2, 46f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"tbz x11, #1, 45f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
"tbz x11, #0, 48f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v13.s }[2], [x25]\n"
"b 48f\n"
"45:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 48f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
"b 48f\n"
"46:" // Height 2: Partial accumulate: partial_2_0
"tbz x11, #1, 47f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
"tbz x11, #0, 48f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v12.s }[2], [x25]\n"
"b 48f\n"
"47:" // Height 2: Partial accumulate: partial_1_0
"ldr s9, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
"48:" // Height 2: Partial accumulate: Done
"sub x9, x9, x20\n"
@@ -550,10 +548,10 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q10, [x9, #0x10]\n"
"ldr q11, [x9, #0x20]\n"
"ldr q16, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"50:" // Height 2: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -577,8 +575,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"53:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 54f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -603,14 +601,6 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"blt 57f\n"
"56:" // Height 2: Multiply loop: Main loop head
"trn1 v19.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q2, [x25, #0x0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
@@ -623,6 +613,7 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
"ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
"ldr q18, [x10, #0x80]\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
@@ -639,21 +630,22 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ "add x10, x10, #0x100\n"
"ldr q7, [x10, #0x0]\n"
".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"ldr q1, [x26, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"bge 56b\n"
"57:" // Height 2: Multiply loop: Single iteration only
"trn1 v19.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
@@ -666,6 +658,7 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
"ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
"ldr q18, [x10, #0x80]\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
@@ -682,36 +675,41 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "sub x27, x27, #0x8\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x10, x10, #0x100\n"
"58:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 63f\n"
"cmp x27, #0x4\n"
"blt 60f\n"
"59:" // Height 2: Multiply loop: Odd block loop
- "ldr d20, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
"sub x27, x27, #0x4\n"
"ldr q18, [x10, #0x0]\n"
"ldr q17, [x10, #0x10]\n"
- "cmp x27, #0x4\n"
- "trn1 v19.2d, v20.2d, v19.2d\n"
".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
- "ldr q26, [x10, #0x20]\n"
".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
- "ldr q6, [x10, #0x30]\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q5, [x10, #0x30]\n"
".inst 0x6e5aee69 // bfmmla v9.4s, v19.8h, v26.8h\n"
+ ".inst 0x6e45ee6d // bfmmla v13.4s, v19.8h, v5.8h\n"
"ldr q18, [x10, #0x40]\n"
- ".inst 0x6e46ee6d // bfmmla v13.4s, v19.8h, v6.8h\n"
"ldr q17, [x10, #0x50]\n"
".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
- "ldr q18, [x10, #0x60]\n"
".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x10, #0x60]\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "cmp x27, #0x4\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
"bge 59b\n"
"60:" // Height 2: Multiply loop: Skip odd blocks
"cbz x27, 63f\n"
@@ -736,35 +734,35 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
"ldr q30, [x10, #0x40]\n"
".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
- "ldr q31, [x10, #0x50]\n"
+ "ldr q26, [x10, #0x50]\n"
".inst 0x6e5eee6a // bfmmla v10.4s, v19.8h, v30.8h\n"
"ldr q18, [x10, #0x60]\n"
- ".inst 0x6e5fee6e // bfmmla v14.4s, v19.8h, v31.8h\n"
+ ".inst 0x6e5aee6e // bfmmla v14.4s, v19.8h, v26.8h\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
"63:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 53b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x9, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x26, #0x0]\n"
"tbz %x[flags], #1, 64f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v18.4s }, [x21]\n"
"ld1r { v17.4s }, [x20]\n"
"fmin v7.4s, v7.4s, v18.4s\n"
"fmin v12.4s, v12.4s, v18.4s\n"
@@ -788,63 +786,63 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"tbz x11, #3, 68f\n"
"st1 { v7.4s }, [x9], #0x10\n"
"st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
"tbz x11, #2, 66f\n"
"st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
"tbz x11, #1, 65f\n"
"str d14, [x9], #0x8\n"
- "str d11, [x26], #0x8\n"
+ "str d11, [x25], #0x8\n"
"tbz x11, #0, 72f\n"
"st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x26]\n"
+ "st1 { v11.s }[2], [x25]\n"
"b 72f\n"
"65:" // Height 2: Partial direct writeback: partial_1_12
"tbz x11, #0, 72f\n"
"str s14, [x9, #0x0]\n"
- "str s11, [x26, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
"b 72f\n"
"66:" // Height 2: Partial direct writeback: partial_2_8
"tbz x11, #1, 67f\n"
"str d13, [x9], #0x8\n"
- "str d10, [x26], #0x8\n"
+ "str d10, [x25], #0x8\n"
"tbz x11, #0, 72f\n"
"st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x26]\n"
+ "st1 { v10.s }[2], [x25]\n"
"b 72f\n"
"67:" // Height 2: Partial direct writeback: partial_1_8
"tbz x11, #0, 72f\n"
"str s13, [x9, #0x0]\n"
- "str s10, [x26, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
"b 72f\n"
"68:" // Height 2: Partial direct writeback: partial_4_0
"tbz x11, #2, 70f\n"
"st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
"tbz x11, #1, 69f\n"
"str d12, [x9], #0x8\n"
- "str d9, [x26], #0x8\n"
+ "str d9, [x25], #0x8\n"
"tbz x11, #0, 72f\n"
"st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x26]\n"
+ "st1 { v9.s }[2], [x25]\n"
"b 72f\n"
"69:" // Height 2: Partial direct writeback: partial_1_4
"tbz x11, #0, 72f\n"
"str s12, [x9, #0x0]\n"
- "str s9, [x26, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
"b 72f\n"
"70:" // Height 2: Partial direct writeback: partial_2_0
"tbz x11, #1, 71f\n"
"str d7, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
+ "str d8, [x25], #0x8\n"
"tbz x11, #0, 72f\n"
"st1 { v7.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x26]\n"
+ "st1 { v8.s }[2], [x25]\n"
"b 72f\n"
"71:" // Height 2: Partial direct writeback: partial_1_0
"str s7, [x9, #0x0]\n"
- "str s8, [x26, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
"72:" // Height 2: Partial direct writeback: Done
"b 74f\n"
"73:" // Height 2: Full writeback
@@ -853,32 +851,32 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"str q13, [x9, #0x20]\n"
"str q14, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
"74:" // Height 2: Writeback done
"subs x11, x11, #0x10\n"
"bgt 39b\n"
"b 224f\n"
"75:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"76:" // Height 3: Column loop
"cbz x12, 77f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -893,94 +891,94 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"77:" // Height 3: no bias
"tbz %x[flags], #0, 88f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 86f\n"
"tbz x11, #3, 81f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #2, 79f\n"
"ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
"tbz x11, #1, 78f\n"
"ldr d16, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
"tbz x11, #0, 85f\n"
"ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
"b 85f\n"
"78:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 85f\n"
"ldr s16, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
"b 85f\n"
"79:" // Height 3: Partial accumulate: partial_2_8
"tbz x11, #1, 80f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x11, #0, 85f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
"b 85f\n"
"80:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 85f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
"b 85f\n"
"81:" // Height 3: Partial accumulate: partial_4_0
"tbz x11, #2, 83f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #1, 82f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x11, #0, 85f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
"b 85f\n"
"82:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 85f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
"b 85f\n"
"83:" // Height 3: Partial accumulate: partial_2_0
"tbz x11, #1, 84f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x11, #0, 85f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 85f\n"
"84:" // Height 3: Partial accumulate: partial_1_0
"ldr s9, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
"85:" // Height 3: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 87f\n"
@@ -989,14 +987,14 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q10, [x9, #0x10]\n"
"ldr q11, [x9, #0x20]\n"
"ldr q16, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
"87:" // Height 3: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -1036,8 +1034,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"90:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 91f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1067,38 +1065,35 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"93:" // Height 3: Multiply loop: Main loop head
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
- ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "trn2 v3.2d, v3.2d, v25.2d\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "cmp x27, #0x10\n"
".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
@@ -1106,12 +1101,15 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0xc0]\n"
".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0xd0]\n"
".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
@@ -1133,43 +1131,43 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"94:" // Height 3: Multiply loop: Single iteration only
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
- ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
- "trn2 v3.2d, v3.2d, v25.2d\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x90]\n"
".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
@@ -1194,25 +1192,25 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 97f\n"
"96:" // Height 3: Multiply loop: Odd block loop
- "ldr d30, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "sub x27, x27, #0x4\n"
- "ldr d27, [x24], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
"ldr q26, [x10, #0x0]\n"
- "cmp x27, #0x4\n"
- "ldr q25, [x10, #0x10]\n"
- "trn1 v28.2d, v30.2d, v28.2d\n"
- "trn1 v27.2d, v27.2d, v29.2d\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ "ldr q25, [x10, #0x10]\n"
".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x20]\n"
".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "sub x27, x27, #0x4\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "cmp x27, #0x4\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
@@ -1221,8 +1219,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x10, x10, #0x80\n"
".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
@@ -1248,9 +1246,9 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
- ".inst 0x6e5def8c // bfmmla v12.4s, v28.8h, v29.8h\n"
".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e5def8c // bfmmla v12.4s, v28.8h, v29.8h\n"
".inst 0x6e5def74 // bfmmla v20.4s, v27.8h, v29.8h\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
@@ -1276,27 +1274,27 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 90b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"uzp1 v16.2d, v16.2d, v20.2d\n"
"uzp1 v17.2d, v17.2d, v21.2d\n"
"uzp1 v18.2d, v18.2d, v22.2d\n"
"uzp1 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 101f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v26.4s }, [x21]\n"
"ld1r { v25.4s }, [x20]\n"
"fmin v7.4s, v7.4s, v26.4s\n"
"fmin v12.4s, v12.4s, v26.4s\n"
@@ -1328,79 +1326,79 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"tbz x11, #3, 105f\n"
"st1 { v7.4s }, [x9], #0x10\n"
"st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #2, 103f\n"
"st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #1, 102f\n"
"str d14, [x9], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x11, #0, 109f\n"
"st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
"b 109f\n"
"102:" // Height 3: Partial direct writeback: partial_1_12
"tbz x11, #0, 109f\n"
"str s14, [x9, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
"b 109f\n"
"103:" // Height 3: Partial direct writeback: partial_2_8
"tbz x11, #1, 104f\n"
"str d13, [x9], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x11, #0, 109f\n"
"st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
"b 109f\n"
"104:" // Height 3: Partial direct writeback: partial_1_8
"tbz x11, #0, 109f\n"
"str s13, [x9, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
"b 109f\n"
"105:" // Height 3: Partial direct writeback: partial_4_0
"tbz x11, #2, 107f\n"
"st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
"tbz x11, #1, 106f\n"
"str d12, [x9], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x11, #0, 109f\n"
"st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
"b 109f\n"
"106:" // Height 3: Partial direct writeback: partial_1_4
"tbz x11, #0, 109f\n"
"str s12, [x9, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
"b 109f\n"
"107:" // Height 3: Partial direct writeback: partial_2_0
"tbz x11, #1, 108f\n"
"str d7, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x11, #0, 109f\n"
"st1 { v7.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
"b 109f\n"
"108:" // Height 3: Partial direct writeback: partial_1_0
"str s7, [x9, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
"109:" // Height 3: Partial direct writeback: Done
"b 111f\n"
"110:" // Height 3: Full writeback
@@ -1409,36 +1407,36 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"str q13, [x9, #0x20]\n"
"str q14, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"111:" // Height 3: Writeback done
"subs x11, x11, #0x10\n"
"bgt 76b\n"
"b 224f\n"
"112:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"113:" // Height 4: Column loop
"cbz x12, 114f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -1453,111 +1451,111 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"114:" // Height 4: no bias
"tbz %x[flags], #0, 125f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 123f\n"
"tbz x11, #3, 118f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
"tbz x11, #2, 116f\n"
"ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
"tbz x11, #1, 115f\n"
"ldr d16, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x11, #0, 122f\n"
"ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
"b 122f\n"
"115:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 122f\n"
"ldr s16, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
"b 122f\n"
"116:" // Height 4: Partial accumulate: partial_2_8
"tbz x11, #1, 117f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x11, #0, 122f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
"b 122f\n"
"117:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 122f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
"b 122f\n"
"118:" // Height 4: Partial accumulate: partial_4_0
"tbz x11, #2, 120f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"tbz x11, #1, 119f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x11, #0, 122f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
"b 122f\n"
"119:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 122f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
"b 122f\n"
"120:" // Height 4: Partial accumulate: partial_2_0
"tbz x11, #1, 121f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x11, #0, 122f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
"b 122f\n"
"121:" // Height 4: Partial accumulate: partial_1_0
"ldr s9, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
"122:" // Height 4: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 124f\n"
@@ -1566,18 +1564,18 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q10, [x9, #0x10]\n"
"ldr q11, [x9, #0x20]\n"
"ldr q16, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"124:" // Height 4: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -1617,8 +1615,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"127:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 128f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1652,38 +1650,33 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"130:" // Height 4: Multiply loop: Main loop head
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
"trn1 v27.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "ldr q4, [x23, #0x0]\n"
- ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
- ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
"ldr q25, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x23, x23, #0x10\n"
+ "ldr q4, [x23, #0x0]\n"
".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
@@ -1694,18 +1687,23 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ "cmp x27, #0x10\n"
".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0xc0]\n"
".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0xd0]\n"
".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0xe0]\n"
".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
@@ -1721,48 +1719,48 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"131:" // Height 4: Multiply loop: Single iteration only
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"trn1 v27.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x27, x27, #0x8\n"
- ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
- ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"ldr q25, [x10, #0x90]\n"
".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
"ldr q26, [x10, #0xc0]\n"
".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
@@ -1784,16 +1782,16 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 134f\n"
"133:" // Height 4: Multiply loop: Odd block loop
- "ldr d30, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
"cmp x27, #0x4\n"
"ldr q26, [x10, #0x0]\n"
"ldr q25, [x10, #0x10]\n"
- "trn1 v28.2d, v30.2d, v28.2d\n"
- "trn1 v27.2d, v29.2d, v27.2d\n"
".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
"ldr q26, [x10, #0x20]\n"
@@ -1870,23 +1868,23 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 127b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x26, #0x0]\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
@@ -1894,9 +1892,9 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 138f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v26.4s }, [x21]\n"
"ld1r { v25.4s }, [x20]\n"
"fmin v7.4s, v7.4s, v26.4s\n"
"fmin v12.4s, v12.4s, v26.4s\n"
@@ -1936,95 +1934,95 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"tbz x11, #3, 142f\n"
"st1 { v7.4s }, [x9], #0x10\n"
"st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
"tbz x11, #2, 140f\n"
"st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
"tbz x11, #1, 139f\n"
"str d14, [x9], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d19, [x24], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
"tbz x11, #0, 146f\n"
"st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v19.s }[2], [x24]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
"b 146f\n"
"139:" // Height 4: Partial direct writeback: partial_1_12
"tbz x11, #0, 146f\n"
"str s14, [x9, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s19, [x24, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
"b 146f\n"
"140:" // Height 4: Partial direct writeback: partial_2_8
"tbz x11, #1, 141f\n"
"str d13, [x9], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d18, [x24], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
"tbz x11, #0, 146f\n"
"st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v18.s }[2], [x24]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
"b 146f\n"
"141:" // Height 4: Partial direct writeback: partial_1_8
"tbz x11, #0, 146f\n"
"str s13, [x9, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s18, [x24, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
"b 146f\n"
"142:" // Height 4: Partial direct writeback: partial_4_0
"tbz x11, #2, 144f\n"
"st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
"tbz x11, #1, 143f\n"
"str d12, [x9], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d17, [x24], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
"tbz x11, #0, 146f\n"
"st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v17.s }[2], [x24]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
"b 146f\n"
"143:" // Height 4: Partial direct writeback: partial_1_4
"tbz x11, #0, 146f\n"
"str s12, [x9, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s17, [x24, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
"b 146f\n"
"144:" // Height 4: Partial direct writeback: partial_2_0
"tbz x11, #1, 145f\n"
"str d7, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
"tbz x11, #0, 146f\n"
"st1 { v7.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v15.s }[2], [x25]\n"
- "st1 { v16.s }[2], [x24]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
"b 146f\n"
"145:" // Height 4: Partial direct writeback: partial_1_0
"str s7, [x9, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s15, [x25, #0x0]\n"
- "str s16, [x24, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
"146:" // Height 4: Partial direct writeback: Done
"b 148f\n"
"147:" // Height 4: Full writeback
@@ -2033,40 +2031,40 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"str q13, [x9, #0x20]\n"
"str q14, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q15, [x25, #0x0]\n"
- "str q20, [x25, #0x10]\n"
- "str q21, [x25, #0x20]\n"
- "str q22, [x25, #0x30]\n"
- "str q16, [x24, #0x0]\n"
- "str q17, [x24, #0x10]\n"
- "str q18, [x24, #0x20]\n"
- "str q19, [x24, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
"148:" // Height 4: Writeback done
"subs x11, x11, #0x10\n"
"bgt 113b\n"
"b 224f\n"
"149:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"150:" // Height 5: Column loop
"cbz x12, 151f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -2089,128 +2087,128 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"151:" // Height 5: no bias
"tbz %x[flags], #0, 162f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 160f\n"
"tbz x11, #3, 155f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #2, 153f\n"
"ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v27.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
"tbz x11, #1, 152f\n"
"ldr d16, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d6, [x23], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
"tbz x11, #0, 159f\n"
"ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v6.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
"b 159f\n"
"152:" // Height 5: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 159f\n"
"ldr s16, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s6, [x23, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
"b 159f\n"
"153:" // Height 5: Partial accumulate: partial_2_8
"tbz x11, #1, 154f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x11, #0, 159f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
"b 159f\n"
"154:" // Height 5: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 159f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
"b 159f\n"
"155:" // Height 5: Partial accumulate: partial_4_0
"tbz x11, #2, 157f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #1, 156f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x11, #0, 159f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
"b 159f\n"
"156:" // Height 5: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 159f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
"b 159f\n"
"157:" // Height 5: Partial accumulate: partial_2_0
"tbz x11, #1, 158f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x11, #0, 159f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 159f\n"
"158:" // Height 5: Partial accumulate: partial_1_0
"ldr s9, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"159:" // Height 5: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 161f\n"
@@ -2219,22 +2217,22 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q10, [x9, #0x10]\n"
"ldr q11, [x9, #0x20]\n"
"ldr q16, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q25, [x23, #0x0]\n"
- "ldr q26, [x23, #0x10]\n"
- "ldr q27, [x23, #0x20]\n"
- "ldr q6, [x23, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
"161:" // Height 5: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -2290,8 +2288,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"164:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 165f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2328,51 +2326,51 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"167:" // Height 5: Multiply loop: Main loop head
"trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x8\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
"trn1 v4.2d, v5.2d, v0.2d\n"
"trn2 v5.2d, v5.2d, v0.2d\n"
"ldr q0, [x10, #0x10]\n"
- ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "add x22, x22, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
"ldr q0, [x10, #0x30]\n"
".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x40]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
"ldr q0, [x10, #0x50]\n"
".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
+ "cmp x27, #0x10\n"
".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n"
"ldr q0, [x10, #0x70]\n"
".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n"
"ldr q2, [x25, #0x0]\n"
@@ -2418,47 +2416,47 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"168:" // Height 5: Multiply loop: Single iteration only
"trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "add x26, x26, #0x10\n"
"trn1 v4.2d, v5.2d, v0.2d\n"
"trn2 v5.2d, v5.2d, v0.2d\n"
"ldr q0, [x10, #0x10]\n"
- ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
"ldr q0, [x10, #0x30]\n"
".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
+ "add x22, x22, #0x10\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
"ldr q0, [x10, #0x50]\n"
".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n"
".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n"
"ldr q0, [x10, #0x70]\n"
".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x80]\n"
@@ -2502,24 +2500,24 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 171f\n"
"170:" // Height 5: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "cmp x27, #0x4\n"
"ldr d0, [x22], #0x8\n"
"ldr q1, [x10, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v3.2d, v2.2d\n"
- "trn1 v2.2d, v0.2d, v5.2d\n"
- "ldr q0, [x10, #0x10]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
+ "ldr q0, [x10, #0x10]\n"
".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
"ldr q1, [x10, #0x20]\n"
".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ "cmp x27, #0x4\n"
".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
"ldr q0, [x10, #0x30]\n"
".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n"
@@ -2538,8 +2536,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n"
"ldr q0, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n"
+ "add x10, x10, #0x80\n"
".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n"
@@ -2610,27 +2608,27 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 164b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
@@ -2640,9 +2638,9 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"uzp1 v26.2d, v26.2d, v30.2d\n"
"uzp1 v27.2d, v27.2d, v31.2d\n"
"tbz %x[flags], #1, 175f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v7.4s, v7.4s, v1.4s\n"
"fmin v12.4s, v12.4s, v1.4s\n"
@@ -2690,111 +2688,111 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"tbz x11, #3, 179f\n"
"st1 { v7.4s }, [x9], #0x10\n"
"st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #2, 177f\n"
"st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #1, 176f\n"
"str d14, [x9], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d19, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x11, #0, 183f\n"
"st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v19.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
"b 183f\n"
"176:" // Height 5: Partial direct writeback: partial_1_12
"tbz x11, #0, 183f\n"
"str s14, [x9, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s19, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
"b 183f\n"
"177:" // Height 5: Partial direct writeback: partial_2_8
"tbz x11, #1, 178f\n"
"str d13, [x9], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d18, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x11, #0, 183f\n"
"st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v18.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
"b 183f\n"
"178:" // Height 5: Partial direct writeback: partial_1_8
"tbz x11, #0, 183f\n"
"str s13, [x9, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s18, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
"b 183f\n"
"179:" // Height 5: Partial direct writeback: partial_4_0
"tbz x11, #2, 181f\n"
"st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x11, #1, 180f\n"
"str d12, [x9], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d17, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x11, #0, 183f\n"
"st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v17.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 183f\n"
"180:" // Height 5: Partial direct writeback: partial_1_4
"tbz x11, #0, 183f\n"
"str s12, [x9, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s17, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 183f\n"
"181:" // Height 5: Partial direct writeback: partial_2_0
"tbz x11, #1, 182f\n"
"str d7, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x11, #0, 183f\n"
"st1 { v7.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v15.s }[2], [x25]\n"
- "st1 { v16.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 183f\n"
"182:" // Height 5: Partial direct writeback: partial_1_0
"str s7, [x9, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s15, [x25, #0x0]\n"
- "str s16, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"183:" // Height 5: Partial direct writeback: Done
"b 185f\n"
"184:" // Height 5: Full writeback
@@ -2803,48 +2801,47 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"str q13, [x9, #0x20]\n"
"str q14, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q15, [x25, #0x0]\n"
- "str q20, [x25, #0x10]\n"
- "str q21, [x25, #0x20]\n"
- "str q22, [x25, #0x30]\n"
- "str q16, [x24, #0x0]\n"
- "str q17, [x24, #0x10]\n"
- "str q18, [x24, #0x20]\n"
- "str q19, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"185:" // Height 5: Writeback done
"subs x11, x11, #0x10\n"
"bgt 150b\n"
"b 224f\n"
"186:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"187:" // Height 6: Column loop
"cbz x12, 188f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -2867,145 +2864,145 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"188:" // Height 6: no bias
"tbz %x[flags], #0, 199f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 197f\n"
"tbz x11, #3, 192f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x11, #2, 190f\n"
"ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v27.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x11, #1, 189f\n"
"ldr d16, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d6, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
"ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v6.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 196f\n"
"189:" // Height 6: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 196f\n"
"ldr s16, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s6, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 196f\n"
"190:" // Height 6: Partial accumulate: partial_2_8
"tbz x11, #1, 191f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 196f\n"
"191:" // Height 6: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 196f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 196f\n"
"192:" // Height 6: Partial accumulate: partial_4_0
"tbz x11, #2, 194f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x11, #1, 193f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 196f\n"
"193:" // Height 6: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 196f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 196f\n"
"194:" // Height 6: Partial accumulate: partial_2_0
"tbz x11, #1, 195f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 196f\n"
"195:" // Height 6: Partial accumulate: partial_1_0
"ldr s9, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"196:" // Height 6: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 198f\n"
@@ -3014,26 +3011,26 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q10, [x9, #0x10]\n"
"ldr q11, [x9, #0x20]\n"
"ldr q16, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q25, [x23, #0x0]\n"
- "ldr q26, [x23, #0x10]\n"
- "ldr q27, [x23, #0x20]\n"
- "ldr q6, [x23, #0x30]\n"
- "ldr q28, [x22, #0x0]\n"
- "ldr q29, [x22, #0x10]\n"
- "ldr q30, [x22, #0x20]\n"
- "ldr q31, [x22, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"198:" // Height 6: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -3089,8 +3086,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"201:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 202f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -3131,56 +3128,56 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"204:" // Height 6: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "add x26, x26, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
"ldr q6, [x10, #0x10]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x30]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x50]\n"
".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "cmp x27, #0x10\n"
".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x70]\n"
".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
"ldr q2, [x25, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
"ldr q0, [x10, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
@@ -3224,52 +3221,52 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"205:" // Height 6: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "add x25, x25, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
"ldr q6, [x10, #0x10]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x30]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x40]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x50]\n"
".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x70]\n"
".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
@@ -3310,18 +3307,18 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 208f\n"
"207:" // Height 6: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d5, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"cmp x27, #0x4\n"
- "ldr d2, [x22], #0x8\n"
+ "ldr d1, [x22], #0x8\n"
"ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
"ldr q1, [x10, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v5.2d, v3.2d\n"
- "trn1 v2.2d, v2.2d, v0.2d\n"
"ldr q0, [x10, #0x10]\n"
".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
@@ -3383,9 +3380,9 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q0, [x10, #0x0]\n"
"trn1 v7.2d, v1.2d, v2.2d\n"
"trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e40ece8 // bfmmla v8.4s, v7.8h, v0.8h\n"
"trn1 v2.2d, v5.2d, v6.2d\n"
"ldr q1, [x10, #0x10]\n"
- ".inst 0x6e40ece8 // bfmmla v8.4s, v7.8h, v0.8h\n"
".inst 0x6e40ec70 // bfmmla v16.4s, v3.8h, v0.8h\n"
".inst 0x6e40ec58 // bfmmla v24.4s, v2.8h, v0.8h\n"
"ldr q0, [x10, #0x20]\n"
@@ -3409,8 +3406,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n"
".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n"
"ldr q6, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ "add x10, x10, #0x80\n"
".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n"
".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n"
@@ -3422,31 +3419,31 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 201b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"uzp1 v23.2d, v24.2d, v28.2d\n"
@@ -3458,9 +3455,9 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"uzp1 v30.2d, v27.2d, v31.2d\n"
"uzp2 v27.2d, v27.2d, v31.2d\n"
"tbz %x[flags], #1, 212f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v7.4s, v7.4s, v1.4s\n"
"fmin v12.4s, v12.4s, v1.4s\n"
@@ -3516,127 +3513,127 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"tbz x11, #3, 216f\n"
"st1 { v7.4s }, [x9], #0x10\n"
"st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x23], #0x10\n"
- "st1 { v24.4s }, [x22], #0x10\n"
- "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
"tbz x11, #2, 214f\n"
"st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
- "st1 { v29.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
"tbz x11, #1, 213f\n"
"str d14, [x9], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d19, [x24], #0x8\n"
- "str d30, [x23], #0x8\n"
- "str d27, [x22], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
"tbz x11, #0, 220f\n"
"st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v19.s }[2], [x24]\n"
- "st1 { v30.s }[2], [x23]\n"
- "st1 { v27.s }[2], [x22]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
"b 220f\n"
"213:" // Height 6: Partial direct writeback: partial_1_12
"tbz x11, #0, 220f\n"
"str s14, [x9, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s19, [x24, #0x0]\n"
- "str s30, [x23, #0x0]\n"
- "str s27, [x22, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
"b 220f\n"
"214:" // Height 6: Partial direct writeback: partial_2_8
"tbz x11, #1, 215f\n"
"str d13, [x9], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d18, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "str d26, [x22], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
"tbz x11, #0, 220f\n"
"st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v18.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
- "st1 { v26.s }[2], [x22]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
"b 220f\n"
"215:" // Height 6: Partial direct writeback: partial_1_8
"tbz x11, #0, 220f\n"
"str s13, [x9, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s18, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
- "str s26, [x22, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
"b 220f\n"
"216:" // Height 6: Partial direct writeback: partial_4_0
"tbz x11, #2, 218f\n"
"st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
- "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
"tbz x11, #1, 217f\n"
"str d12, [x9], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d17, [x24], #0x8\n"
- "str d28, [x23], #0x8\n"
- "str d25, [x22], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d28, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
"tbz x11, #0, 220f\n"
"st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v17.s }[2], [x24]\n"
- "st1 { v28.s }[2], [x23]\n"
- "st1 { v25.s }[2], [x22]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
"b 220f\n"
"217:" // Height 6: Partial direct writeback: partial_1_4
"tbz x11, #0, 220f\n"
"str s12, [x9, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s17, [x24, #0x0]\n"
- "str s28, [x23, #0x0]\n"
- "str s25, [x22, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s28, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
"b 220f\n"
"218:" // Height 6: Partial direct writeback: partial_2_0
"tbz x11, #1, 219f\n"
"str d7, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
- "str d24, [x22], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
"tbz x11, #0, 220f\n"
"st1 { v7.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v15.s }[2], [x25]\n"
- "st1 { v16.s }[2], [x24]\n"
- "st1 { v23.s }[2], [x23]\n"
- "st1 { v24.s }[2], [x22]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
"b 220f\n"
"219:" // Height 6: Partial direct writeback: partial_1_0
"str s7, [x9, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s15, [x25, #0x0]\n"
- "str s16, [x24, #0x0]\n"
- "str s23, [x23, #0x0]\n"
- "str s24, [x22, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
"220:" // Height 6: Partial direct writeback: Done
"b 222f\n"
"221:" // Height 6: Full writeback
@@ -3645,26 +3642,26 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"str q13, [x9, #0x20]\n"
"str q14, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q15, [x25, #0x0]\n"
- "str q20, [x25, #0x10]\n"
- "str q21, [x25, #0x20]\n"
- "str q22, [x25, #0x30]\n"
- "str q16, [x24, #0x0]\n"
- "str q17, [x24, #0x10]\n"
- "str q18, [x24, #0x20]\n"
- "str q19, [x24, #0x30]\n"
- "str q23, [x23, #0x0]\n"
- "str q28, [x23, #0x10]\n"
- "str q29, [x23, #0x20]\n"
- "str q30, [x23, #0x30]\n"
- "str q24, [x22, #0x0]\n"
- "str q25, [x22, #0x10]\n"
- "str q26, [x22, #0x20]\n"
- "str q27, [x22, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q28, [x22, #0x10]\n"
+ "str q29, [x22, #0x20]\n"
+ "str q30, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
"222:" // Height 6: Writeback done
"subs x11, x11, #0x10\n"
"bgt 187b\n"
@@ -3680,8 +3677,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"224:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
index 12244a2e99..d012e992e6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 32, 1> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 32, 1> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
index aae6322b59..faf34f7fc0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
@@ -47,19 +47,18 @@ void a64_hybrid_fp16_mla_6x32_a55 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const __fp16 *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void a64_hybrid_fp16_mla_6x32_a55 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -103,10 +101,10 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"cmp %x[M], #0x2\n"
"bgt 99f\n"
"beq 50f\n"
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x7, 3f\n"
"ldr q8, [x7, #0x0]\n"
@@ -245,8 +243,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"mov x15, #0x0\n"
"23:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 24f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -268,153 +266,150 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"26:" // Height 1: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
"ldr d17, [x17, #0x20]\n"
- "ldr x21, [x17, #0x28]\n"
+ "ldr x20, [x17, #0x28]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"ldr d16, [x17, #0x30]\n"
- "add x13, x13, #0x10\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x38]\n"
- "sub x14, x14, #0x8\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x48]\n"
- "ldr x22, [x13, #0x8]\n"
- "cmp x14, #0x10\n"
"mov v16.d[1], x20\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
"ldr d17, [x17, #0x40]\n"
+ "ldr x20, [x17, #0x48]\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
"ldr d16, [x17, #0x50]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x58]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x68]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
"mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[1]\n"
"ldr d17, [x17, #0x60]\n"
+ "ldr x20, [x17, #0x68]\n"
"fmla v9.8h, v16.8h, v0.h[1]\n"
"ldr d16, [x17, #0x70]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x78]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x88]\n"
"mov v16.d[1], x20\n"
"fmla v10.8h, v17.8h, v0.h[1]\n"
"ldr d17, [x17, #0x80]\n"
+ "ldr x20, [x17, #0x88]\n"
"fmla v11.8h, v16.8h, v0.h[1]\n"
"ldr d16, [x17, #0x90]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x98]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xa8]\n"
"mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[2]\n"
"ldr d17, [x17, #0xa0]\n"
+ "ldr x20, [x17, #0xa8]\n"
"fmla v9.8h, v16.8h, v0.h[2]\n"
"ldr d16, [x17, #0xb0]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0xb8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xc8]\n"
"mov v16.d[1], x20\n"
"fmla v10.8h, v17.8h, v0.h[2]\n"
"ldr d17, [x17, #0xc0]\n"
+ "ldr x20, [x17, #0xc8]\n"
"fmla v11.8h, v16.8h, v0.h[2]\n"
"ldr d16, [x17, #0xd0]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0xd8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xe8]\n"
"mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[3]\n"
"ldr d17, [x17, #0xe0]\n"
+ "ldr x20, [x17, #0xe8]\n"
"fmla v9.8h, v16.8h, v0.h[3]\n"
"ldr d16, [x17, #0xf0]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0xf8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x108]\n"
"mov v16.d[1], x20\n"
"fmla v10.8h, v17.8h, v0.h[3]\n"
"ldr d17, [x17, #0x100]\n"
+ "ldr x20, [x17, #0x108]\n"
"fmla v11.8h, v16.8h, v0.h[3]\n"
"ldr d16, [x17, #0x110]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x118]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x128]\n"
"mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[4]\n"
"ldr d17, [x17, #0x120]\n"
+ "ldr x20, [x17, #0x128]\n"
"fmla v9.8h, v16.8h, v0.h[4]\n"
"ldr d16, [x17, #0x130]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x138]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x148]\n"
"mov v16.d[1], x20\n"
"fmla v10.8h, v17.8h, v0.h[4]\n"
"ldr d17, [x17, #0x140]\n"
+ "ldr x20, [x17, #0x148]\n"
"fmla v11.8h, v16.8h, v0.h[4]\n"
"ldr d16, [x17, #0x150]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x158]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x168]\n"
"mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[5]\n"
"ldr d17, [x17, #0x160]\n"
+ "ldr x20, [x17, #0x168]\n"
"fmla v9.8h, v16.8h, v0.h[5]\n"
"ldr d16, [x17, #0x170]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x178]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x188]\n"
"mov v16.d[1], x20\n"
"fmla v10.8h, v17.8h, v0.h[5]\n"
"ldr d17, [x17, #0x180]\n"
+ "ldr x20, [x17, #0x188]\n"
"fmla v11.8h, v16.8h, v0.h[5]\n"
"ldr d16, [x17, #0x190]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x198]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x1a8]\n"
"mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[6]\n"
"ldr d17, [x17, #0x1a0]\n"
+ "ldr x20, [x17, #0x1a8]\n"
"fmla v9.8h, v16.8h, v0.h[6]\n"
"ldr d16, [x17, #0x1b0]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x1b8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x1c8]\n"
"mov v16.d[1], x20\n"
"fmla v10.8h, v17.8h, v0.h[6]\n"
"ldr d17, [x17, #0x1c0]\n"
+ "ldr x20, [x17, #0x1c8]\n"
"fmla v11.8h, v16.8h, v0.h[6]\n"
"ldr d16, [x17, #0x1d0]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x1d8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x1e8]\n"
"mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[7]\n"
"ldr d17, [x17, #0x1e0]\n"
+ "ldr x20, [x17, #0x1e8]\n"
"fmla v9.8h, v16.8h, v0.h[7]\n"
"ldr d16, [x17, #0x1f0]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x1f8]\n"
- "add x17, x17, #0x200\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x8]\n"
"mov v16.d[1], x20\n"
+ "add x13, x13, #0x10\n"
+ "add x17, x17, #0x200\n"
"fmla v10.8h, v17.8h, v0.h[7]\n"
"ldr d6, [x17, #0x0]\n"
+ "ldr x20, [x17, #0x8]\n"
"fmla v11.8h, v16.8h, v0.h[7]\n"
"ldr d0, [x13, #0x0]\n"
+ "sub x14, x14, #0x8\n"
"ldr d7, [x17, #0x10]\n"
+ "cmp x14, #0x10\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
"ldr x20, [x17, #0x18]\n"
- "mov v6.d[1], x21\n"
- "mov v0.d[1], x22\n"
+ "mov v0.d[1], x21\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"bge 26b\n"
"27:" // Height 1: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
"ldr q17, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"ldr q16, [x17, #0x30]\n"
- "add x13, x13, #0x10\n"
- "sub x14, x14, #0x8\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
"ldr q17, [x17, #0x40]\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
"ldr q16, [x17, #0x50]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
"fmla v8.8h, v17.8h, v0.h[1]\n"
"ldr q17, [x17, #0x60]\n"
"fmla v9.8h, v16.8h, v0.h[1]\n"
@@ -467,23 +462,26 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr q17, [x17, #0x1e0]\n"
"fmla v9.8h, v16.8h, v0.h[7]\n"
"ldr q16, [x17, #0x1f0]\n"
- "add x17, x17, #0x200\n"
+ "add x13, x13, #0x10\n"
+ "sub x14, x14, #0x8\n"
"fmla v10.8h, v17.8h, v0.h[7]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v11.8h, v16.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n"
"28:" // Height 1: Multiply loop: Main loop skip
"cbz x14, 30f\n"
"29:" // Height 1: Multiply loop: Odd block loop
"ldr h0, [x13], #0x2\n"
"sub x14, x14, #0x1\n"
- "ldr q17, [x17, #0x0]\n"
+ "ldr q16, [x17, #0x0]\n"
+ "fmla v8.8h, v16.8h, v0.h[0]\n"
"ldr q16, [x17, #0x10]\n"
- "fmla v8.8h, v17.8h, v0.h[0]\n"
- "ldr q17, [x17, #0x20]\n"
"fmla v9.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x20]\n"
+ "fmla v10.8h, v16.8h, v0.h[0]\n"
"ldr q16, [x17, #0x30]\n"
- "add x17, x17, #0x40\n"
- "fmla v10.8h, v17.8h, v0.h[0]\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
+ "add x17, x17, #0x40\n"
"cbnz x14, 29b\n"
"30:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -492,14 +490,14 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"bne 23b\n"
"prfm pstl1keep, [x16, #0x0]\n"
"tbz %x[flags], #1, 31f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v16.8h\n"
+ "fmin v9.8h, v9.8h, v16.8h\n"
+ "fmin v10.8h, v10.8h, v16.8h\n"
+ "fmin v11.8h, v11.8h, v16.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.8h }, [x21]\n"
"ld1r { v16.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v17.8h\n"
- "fmin v9.8h, v9.8h, v17.8h\n"
- "fmin v10.8h, v10.8h, v17.8h\n"
- "fmin v11.8h, v11.8h, v17.8h\n"
"fmax v8.8h, v8.8h, v16.8h\n"
"fmax v9.8h, v9.8h, v16.8h\n"
"fmax v10.8h, v10.8h, v16.8h\n"
@@ -612,168 +610,168 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"bgt 2b\n"
"b 296f\n"
"50:" // Height 2
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
"51:" // Height 2: Column loop
"cbz x7, 52f\n"
"ldr q8, [x7, #0x0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
"mov v12.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x7, x7, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
+ "add x7, x7, #0x40\n"
"b 71f\n"
"52:" // Height 2: no bias
"tbz %x[flags], #0, 70f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x8, #0x20\n"
- "add x26, x16, x20, LSL #1\n"
+ "add x25, x16, x20, LSL #1\n"
"bge 69f\n"
"tbz x8, #4, 60f\n"
"ld1 { v8.8h }, [x16], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v9.8h }, [x16], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
"tbz x8, #3, 56f\n"
"ld1 { v10.8h }, [x16], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
"tbz x8, #2, 54f\n"
"ldr d11, [x16], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"tbz x8, #1, 53f\n"
"ld1 { v11.s }[2], [x16], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"tbz x8, #0, 68f\n"
"ld1 { v11.h }[6], [x16]\n"
- "ld1 { v15.h }[6], [x26]\n"
+ "ld1 { v15.h }[6], [x25]\n"
"b 68f\n"
"53:" // Height 2: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x8, #0, 68f\n"
"ld1 { v11.h }[4], [x16]\n"
- "ld1 { v15.h }[4], [x26]\n"
+ "ld1 { v15.h }[4], [x25]\n"
"b 68f\n"
"54:" // Height 2: Partial accumulate: partial_2_24
"tbz x8, #1, 55f\n"
"ldr s11, [x16], #0x4\n"
"mov x20, #0x34\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"tbz x8, #0, 68f\n"
"ld1 { v11.h }[2], [x16]\n"
- "ld1 { v15.h }[2], [x26]\n"
+ "ld1 { v15.h }[2], [x25]\n"
"b 68f\n"
"55:" // Height 2: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x8, #0, 68f\n"
"ldr h11, [x16, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
"b 68f\n"
"56:" // Height 2: Partial accumulate: partial_4_16
"tbz x8, #2, 58f\n"
"ldr d10, [x16], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"tbz x8, #1, 57f\n"
"ld1 { v10.s }[2], [x16], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"tbz x8, #0, 68f\n"
"ld1 { v10.h }[6], [x16]\n"
- "ld1 { v14.h }[6], [x26]\n"
+ "ld1 { v14.h }[6], [x25]\n"
"b 68f\n"
"57:" // Height 2: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x8, #0, 68f\n"
"ld1 { v10.h }[4], [x16]\n"
- "ld1 { v14.h }[4], [x26]\n"
+ "ld1 { v14.h }[4], [x25]\n"
"b 68f\n"
"58:" // Height 2: Partial accumulate: partial_2_16
"tbz x8, #1, 59f\n"
"ldr s10, [x16], #0x4\n"
"mov x20, #0x24\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"tbz x8, #0, 68f\n"
"ld1 { v10.h }[2], [x16]\n"
- "ld1 { v14.h }[2], [x26]\n"
+ "ld1 { v14.h }[2], [x25]\n"
"b 68f\n"
"59:" // Height 2: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x8, #0, 68f\n"
"ldr h10, [x16, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
"b 68f\n"
"60:" // Height 2: Partial accumulate: partial_8_0
"tbz x8, #3, 64f\n"
"ld1 { v8.8h }, [x16], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
"tbz x8, #2, 62f\n"
"ldr d9, [x16], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"tbz x8, #1, 61f\n"
"ld1 { v9.s }[2], [x16], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"tbz x8, #0, 68f\n"
"ld1 { v9.h }[6], [x16]\n"
- "ld1 { v13.h }[6], [x26]\n"
+ "ld1 { v13.h }[6], [x25]\n"
"b 68f\n"
"61:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x8, #0, 68f\n"
"ld1 { v9.h }[4], [x16]\n"
- "ld1 { v13.h }[4], [x26]\n"
+ "ld1 { v13.h }[4], [x25]\n"
"b 68f\n"
"62:" // Height 2: Partial accumulate: partial_2_8
"tbz x8, #1, 63f\n"
"ldr s9, [x16], #0x4\n"
"mov x20, #0x14\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"tbz x8, #0, 68f\n"
"ld1 { v9.h }[2], [x16]\n"
- "ld1 { v13.h }[2], [x26]\n"
+ "ld1 { v13.h }[2], [x25]\n"
"b 68f\n"
"63:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x8, #0, 68f\n"
"ldr h9, [x16, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
"b 68f\n"
"64:" // Height 2: Partial accumulate: partial_4_0
"tbz x8, #2, 66f\n"
"ldr d8, [x16], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"tbz x8, #1, 65f\n"
"ld1 { v8.s }[2], [x16], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"tbz x8, #0, 68f\n"
"ld1 { v8.h }[6], [x16]\n"
- "ld1 { v12.h }[6], [x26]\n"
+ "ld1 { v12.h }[6], [x25]\n"
"b 68f\n"
"65:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x8, #0, 68f\n"
"ld1 { v8.h }[4], [x16]\n"
- "ld1 { v12.h }[4], [x26]\n"
+ "ld1 { v12.h }[4], [x25]\n"
"b 68f\n"
"66:" // Height 2: Partial accumulate: partial_2_0
"tbz x8, #1, 67f\n"
"ldr s8, [x16], #0x4\n"
"mov x20, #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"tbz x8, #0, 68f\n"
"ld1 { v8.h }[2], [x16]\n"
- "ld1 { v12.h }[2], [x26]\n"
+ "ld1 { v12.h }[2], [x25]\n"
"b 68f\n"
"67:" // Height 2: Partial accumulate: partial_1_0
"ldr h8, [x16, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"68:" // Height 2: Partial accumulate: Done
"sub x16, x16, x20\n"
"b 71f\n"
@@ -782,10 +780,10 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"ldr q11, [x16, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"b 71f\n"
"70:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -800,8 +798,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"mov x15, #0x0\n"
"72:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -826,178 +824,178 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"blt 76f\n"
"75:" // Height 2: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr x20, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
"ldr d17, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr x21, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"ldr d16, [x17, #0x30]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x48]\n"
- "add x13, x13, #0x10\n"
- "add x12, x12, #0x10\n"
- "mov v16.d[1], x21\n"
+ "mov v17.d[1], x21\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
+ "mov v16.d[1], x20\n"
"fmla v14.8h, v17.8h, v1.h[0]\n"
"ldr d17, [x17, #0x40]\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
- "ldr x21, [x17, #0x58]\n"
+ "ldr x20, [x17, #0x48]\n"
"fmla v15.8h, v16.8h, v1.h[0]\n"
"ldr d16, [x17, #0x50]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x68]\n"
- "ldr x23, [x13, #0x8]\n"
- "sub x14, x14, #0x8\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x17, #0x58]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v12.8h, v17.8h, v1.h[1]\n"
"ldr d17, [x17, #0x60]\n"
"fmla v9.8h, v16.8h, v0.h[1]\n"
- "ldr x21, [x17, #0x78]\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v13.8h, v16.8h, v1.h[1]\n"
"ldr d16, [x17, #0x70]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x88]\n"
- "ldr x22, [x12, #0x8]\n"
- "cmp x14, #0x10\n"
- "mov v16.d[1], x21\n"
+ "mov v17.d[1], x21\n"
"fmla v10.8h, v17.8h, v0.h[1]\n"
+ "mov v16.d[1], x20\n"
"fmla v14.8h, v17.8h, v1.h[1]\n"
"ldr d17, [x17, #0x80]\n"
"fmla v11.8h, v16.8h, v0.h[1]\n"
- "ldr x21, [x17, #0x98]\n"
+ "ldr x20, [x17, #0x88]\n"
"fmla v15.8h, v16.8h, v1.h[1]\n"
"ldr d16, [x17, #0x90]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xa8]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x17, #0x98]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v12.8h, v17.8h, v1.h[2]\n"
"ldr d17, [x17, #0xa0]\n"
"fmla v9.8h, v16.8h, v0.h[2]\n"
- "ldr x21, [x17, #0xb8]\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v13.8h, v16.8h, v1.h[2]\n"
"ldr d16, [x17, #0xb0]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xc8]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v16.d[1], x21\n"
+ "mov v17.d[1], x21\n"
"fmla v10.8h, v17.8h, v0.h[2]\n"
+ "mov v16.d[1], x20\n"
"fmla v14.8h, v17.8h, v1.h[2]\n"
"ldr d17, [x17, #0xc0]\n"
"fmla v11.8h, v16.8h, v0.h[2]\n"
- "ldr x21, [x17, #0xd8]\n"
+ "ldr x20, [x17, #0xc8]\n"
"fmla v15.8h, v16.8h, v1.h[2]\n"
"ldr d16, [x17, #0xd0]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xe8]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v12.8h, v17.8h, v1.h[3]\n"
"ldr d17, [x17, #0xe0]\n"
"fmla v9.8h, v16.8h, v0.h[3]\n"
- "ldr x21, [x17, #0xf8]\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v13.8h, v16.8h, v1.h[3]\n"
"ldr d16, [x17, #0xf0]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x108]\n"
- "mov v16.d[1], x21\n"
+ "mov v17.d[1], x21\n"
"fmla v10.8h, v17.8h, v0.h[3]\n"
+ "mov v16.d[1], x20\n"
"fmla v14.8h, v17.8h, v1.h[3]\n"
"ldr d17, [x17, #0x100]\n"
"fmla v11.8h, v16.8h, v0.h[3]\n"
- "ldr x21, [x17, #0x118]\n"
+ "ldr x20, [x17, #0x108]\n"
"fmla v15.8h, v16.8h, v1.h[3]\n"
"ldr d16, [x17, #0x110]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x128]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x17, #0x118]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr x21, [x17, #0x128]\n"
"fmla v12.8h, v17.8h, v1.h[4]\n"
"ldr d17, [x17, #0x120]\n"
"fmla v9.8h, v16.8h, v0.h[4]\n"
- "ldr x21, [x17, #0x138]\n"
+ "ldr x20, [x17, #0x138]\n"
"fmla v13.8h, v16.8h, v1.h[4]\n"
"ldr d16, [x17, #0x130]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x148]\n"
- "mov v16.d[1], x21\n"
+ "mov v17.d[1], x21\n"
"fmla v10.8h, v17.8h, v0.h[4]\n"
+ "mov v16.d[1], x20\n"
"fmla v14.8h, v17.8h, v1.h[4]\n"
"ldr d17, [x17, #0x140]\n"
"fmla v11.8h, v16.8h, v0.h[4]\n"
- "ldr x21, [x17, #0x158]\n"
+ "ldr x20, [x17, #0x148]\n"
"fmla v15.8h, v16.8h, v1.h[4]\n"
"ldr d16, [x17, #0x150]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x168]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x17, #0x158]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr x21, [x17, #0x168]\n"
"fmla v12.8h, v17.8h, v1.h[5]\n"
"ldr d17, [x17, #0x160]\n"
"fmla v9.8h, v16.8h, v0.h[5]\n"
- "ldr x21, [x17, #0x178]\n"
+ "ldr x20, [x17, #0x178]\n"
"fmla v13.8h, v16.8h, v1.h[5]\n"
"ldr d16, [x17, #0x170]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x188]\n"
- "mov v16.d[1], x21\n"
+ "mov v17.d[1], x21\n"
"fmla v10.8h, v17.8h, v0.h[5]\n"
+ "mov v16.d[1], x20\n"
"fmla v14.8h, v17.8h, v1.h[5]\n"
"ldr d17, [x17, #0x180]\n"
"fmla v11.8h, v16.8h, v0.h[5]\n"
- "ldr x21, [x17, #0x198]\n"
+ "ldr x20, [x17, #0x188]\n"
"fmla v15.8h, v16.8h, v1.h[5]\n"
"ldr d16, [x17, #0x190]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x1a8]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x17, #0x198]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr x21, [x17, #0x1a8]\n"
"fmla v12.8h, v17.8h, v1.h[6]\n"
"ldr d17, [x17, #0x1a0]\n"
"fmla v9.8h, v16.8h, v0.h[6]\n"
- "ldr x21, [x17, #0x1b8]\n"
+ "ldr x20, [x17, #0x1b8]\n"
"fmla v13.8h, v16.8h, v1.h[6]\n"
"ldr d16, [x17, #0x1b0]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x1c8]\n"
- "mov v16.d[1], x21\n"
+ "mov v17.d[1], x21\n"
"fmla v10.8h, v17.8h, v0.h[6]\n"
+ "mov v16.d[1], x20\n"
"fmla v14.8h, v17.8h, v1.h[6]\n"
"ldr d17, [x17, #0x1c0]\n"
"fmla v11.8h, v16.8h, v0.h[6]\n"
- "ldr x21, [x17, #0x1d8]\n"
+ "ldr x20, [x17, #0x1c8]\n"
"fmla v15.8h, v16.8h, v1.h[6]\n"
"ldr d16, [x17, #0x1d0]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x1e8]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr x21, [x17, #0x1e8]\n"
"fmla v12.8h, v17.8h, v1.h[7]\n"
"ldr d17, [x17, #0x1e0]\n"
"fmla v9.8h, v16.8h, v0.h[7]\n"
- "ldr x21, [x17, #0x1f8]\n"
+ "ldr x20, [x17, #0x1f8]\n"
"fmla v13.8h, v16.8h, v1.h[7]\n"
"ldr d16, [x17, #0x1f0]\n"
- "mov v17.d[1], x20\n"
+ "mov v17.d[1], x21\n"
+ "add x13, x13, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x17, x17, #0x200\n"
- "ldr x20, [x17, #0x8]\n"
- "mov v16.d[1], x21\n"
"fmla v10.8h, v17.8h, v0.h[7]\n"
"fmla v14.8h, v17.8h, v1.h[7]\n"
"ldr d6, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
"fmla v11.8h, v16.8h, v0.h[7]\n"
"ldr d0, [x13, #0x0]\n"
"fmla v15.8h, v16.8h, v1.h[7]\n"
"ldr d1, [x12, #0x0]\n"
+ "sub x14, x14, #0x8\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x20\n"
+ "cmp x14, #0x10\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
"ldr x20, [x17, #0x18]\n"
- "mov v0.d[1], x23\n"
- "mov v1.d[1], x22\n"
+ "mov v1.d[1], x21\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 75b\n"
"76:" // Height 2: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
@@ -1107,8 +1105,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"sub x14, x14, #0x1\n"
"ldr h0, [x12], #0x2\n"
"ldr q17, [x17, #0x0]\n"
- "ldr q16, [x17, #0x10]\n"
"fmla v8.8h, v17.8h, v1.h[0]\n"
+ "ldr q16, [x17, #0x10]\n"
"fmla v12.8h, v17.8h, v0.h[0]\n"
"ldr q17, [x17, #0x20]\n"
"fmla v9.8h, v16.8h, v1.h[0]\n"
@@ -1126,22 +1124,22 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"cmp x15, x20\n"
"bne 72b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
"prfm pstl1keep, [x16, #0x0]\n"
- "add x26, x16, x20, LSL #1\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v16.8h\n"
+ "fmin v9.8h, v9.8h, v16.8h\n"
+ "fmin v10.8h, v10.8h, v16.8h\n"
+ "fmin v11.8h, v11.8h, v16.8h\n"
+ "fmin v12.8h, v12.8h, v16.8h\n"
+ "fmin v13.8h, v13.8h, v16.8h\n"
+ "fmin v14.8h, v14.8h, v16.8h\n"
+ "fmin v15.8h, v15.8h, v16.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.8h }, [x21]\n"
"ld1r { v16.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v17.8h\n"
- "fmin v9.8h, v9.8h, v17.8h\n"
- "fmin v10.8h, v10.8h, v17.8h\n"
- "fmin v11.8h, v11.8h, v17.8h\n"
- "fmin v12.8h, v12.8h, v17.8h\n"
- "fmin v13.8h, v13.8h, v17.8h\n"
- "fmin v14.8h, v14.8h, v17.8h\n"
- "fmin v15.8h, v15.8h, v17.8h\n"
"fmax v8.8h, v8.8h, v16.8h\n"
"fmax v9.8h, v9.8h, v16.8h\n"
"fmax v10.8h, v10.8h, v16.8h\n"
@@ -1156,127 +1154,127 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"tbz x8, #4, 88f\n"
"st1 { v8.8h }, [x16], #0x10\n"
"st1 { v9.8h }, [x16], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
"tbz x8, #3, 84f\n"
"st1 { v10.8h }, [x16], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
"tbz x8, #2, 82f\n"
"str d11, [x16], #0x8\n"
- "str d15, [x26], #0x8\n"
+ "str d15, [x25], #0x8\n"
"tbz x8, #1, 81f\n"
"st1 { v11.s }[2], [x16], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
"tbz x8, #0, 96f\n"
"st1 { v11.h }[6], [x16]\n"
- "st1 { v15.h }[6], [x26]\n"
+ "st1 { v15.h }[6], [x25]\n"
"b 96f\n"
"81:" // Height 2: Partial direct writeback: partial_1_28
"tbz x8, #0, 96f\n"
"st1 { v11.h }[4], [x16]\n"
- "st1 { v15.h }[4], [x26]\n"
+ "st1 { v15.h }[4], [x25]\n"
"b 96f\n"
"82:" // Height 2: Partial direct writeback: partial_2_24
"tbz x8, #1, 83f\n"
"str s11, [x16], #0x4\n"
- "str s15, [x26], #0x4\n"
+ "str s15, [x25], #0x4\n"
"tbz x8, #0, 96f\n"
"st1 { v11.h }[2], [x16]\n"
- "st1 { v15.h }[2], [x26]\n"
+ "st1 { v15.h }[2], [x25]\n"
"b 96f\n"
"83:" // Height 2: Partial direct writeback: partial_1_24
"tbz x8, #0, 96f\n"
"str h11, [x16, #0x0]\n"
- "str h15, [x26, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
"b 96f\n"
"84:" // Height 2: Partial direct writeback: partial_4_16
"tbz x8, #2, 86f\n"
"str d10, [x16], #0x8\n"
- "str d14, [x26], #0x8\n"
+ "str d14, [x25], #0x8\n"
"tbz x8, #1, 85f\n"
"st1 { v10.s }[2], [x16], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
"tbz x8, #0, 96f\n"
"st1 { v10.h }[6], [x16]\n"
- "st1 { v14.h }[6], [x26]\n"
+ "st1 { v14.h }[6], [x25]\n"
"b 96f\n"
"85:" // Height 2: Partial direct writeback: partial_1_20
"tbz x8, #0, 96f\n"
"st1 { v10.h }[4], [x16]\n"
- "st1 { v14.h }[4], [x26]\n"
+ "st1 { v14.h }[4], [x25]\n"
"b 96f\n"
"86:" // Height 2: Partial direct writeback: partial_2_16
"tbz x8, #1, 87f\n"
"str s10, [x16], #0x4\n"
- "str s14, [x26], #0x4\n"
+ "str s14, [x25], #0x4\n"
"tbz x8, #0, 96f\n"
"st1 { v10.h }[2], [x16]\n"
- "st1 { v14.h }[2], [x26]\n"
+ "st1 { v14.h }[2], [x25]\n"
"b 96f\n"
"87:" // Height 2: Partial direct writeback: partial_1_16
"tbz x8, #0, 96f\n"
"str h10, [x16, #0x0]\n"
- "str h14, [x26, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
"b 96f\n"
"88:" // Height 2: Partial direct writeback: partial_8_0
"tbz x8, #3, 92f\n"
"st1 { v8.8h }, [x16], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
"tbz x8, #2, 90f\n"
"str d9, [x16], #0x8\n"
- "str d13, [x26], #0x8\n"
+ "str d13, [x25], #0x8\n"
"tbz x8, #1, 89f\n"
"st1 { v9.s }[2], [x16], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
"tbz x8, #0, 96f\n"
"st1 { v9.h }[6], [x16]\n"
- "st1 { v13.h }[6], [x26]\n"
+ "st1 { v13.h }[6], [x25]\n"
"b 96f\n"
"89:" // Height 2: Partial direct writeback: partial_1_12
"tbz x8, #0, 96f\n"
"st1 { v9.h }[4], [x16]\n"
- "st1 { v13.h }[4], [x26]\n"
+ "st1 { v13.h }[4], [x25]\n"
"b 96f\n"
"90:" // Height 2: Partial direct writeback: partial_2_8
"tbz x8, #1, 91f\n"
"str s9, [x16], #0x4\n"
- "str s13, [x26], #0x4\n"
+ "str s13, [x25], #0x4\n"
"tbz x8, #0, 96f\n"
"st1 { v9.h }[2], [x16]\n"
- "st1 { v13.h }[2], [x26]\n"
+ "st1 { v13.h }[2], [x25]\n"
"b 96f\n"
"91:" // Height 2: Partial direct writeback: partial_1_8
"tbz x8, #0, 96f\n"
"str h9, [x16, #0x0]\n"
- "str h13, [x26, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
"b 96f\n"
"92:" // Height 2: Partial direct writeback: partial_4_0
"tbz x8, #2, 94f\n"
"str d8, [x16], #0x8\n"
- "str d12, [x26], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x8, #1, 93f\n"
"st1 { v8.s }[2], [x16], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
"tbz x8, #0, 96f\n"
"st1 { v8.h }[6], [x16]\n"
- "st1 { v12.h }[6], [x26]\n"
+ "st1 { v12.h }[6], [x25]\n"
"b 96f\n"
"93:" // Height 2: Partial direct writeback: partial_1_4
"tbz x8, #0, 96f\n"
"st1 { v8.h }[4], [x16]\n"
- "st1 { v12.h }[4], [x26]\n"
+ "st1 { v12.h }[4], [x25]\n"
"b 96f\n"
"94:" // Height 2: Partial direct writeback: partial_2_0
"tbz x8, #1, 95f\n"
"str s8, [x16], #0x4\n"
- "str s12, [x26], #0x4\n"
+ "str s12, [x25], #0x4\n"
"tbz x8, #0, 96f\n"
"st1 { v8.h }[2], [x16]\n"
- "st1 { v12.h }[2], [x26]\n"
+ "st1 { v12.h }[2], [x25]\n"
"b 96f\n"
"95:" // Height 2: Partial direct writeback: partial_1_0
"str h8, [x16, #0x0]\n"
- "str h12, [x26, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
"96:" // Height 2: Partial direct writeback: Done
"b 98f\n"
"97:" // Height 2: Full writeback
@@ -1285,31 +1283,31 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"str q10, [x16, #0x20]\n"
"str q11, [x16, #0x30]\n"
"add x16, x16, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
"98:" // Height 2: Writeback done
"subs x8, x8, #0x20\n"
"bgt 51b\n"
"b 296f\n"
"99:" // Height 3
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
"100:" // Height 3: Column loop
"cbz x7, 101f\n"
"ldr q8, [x7, #0x0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
"mov v12.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x7, x7, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
"mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
@@ -1317,182 +1315,182 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"101:" // Height 3: no bias
"tbz %x[flags], #0, 119f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
"cmp x8, #0x20\n"
- "add x26, x16, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"bge 118f\n"
"tbz x8, #4, 109f\n"
"ld1 { v8.8h }, [x16], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
"ld1 { v9.8h }, [x16], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
"tbz x8, #3, 105f\n"
"ld1 { v10.8h }, [x16], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
"tbz x8, #2, 103f\n"
"ldr d11, [x16], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x8, #1, 102f\n"
"ld1 { v11.s }[2], [x16], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
"tbz x8, #0, 117f\n"
"ld1 { v11.h }[6], [x16]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
"b 117f\n"
"102:" // Height 3: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x8, #0, 117f\n"
"ld1 { v11.h }[4], [x16]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
"b 117f\n"
"103:" // Height 3: Partial accumulate: partial_2_24
"tbz x8, #1, 104f\n"
"ldr s11, [x16], #0x4\n"
"mov x20, #0x34\n"
- "ldr s15, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
"tbz x8, #0, 117f\n"
"ld1 { v11.h }[2], [x16]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
"b 117f\n"
"104:" // Height 3: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x8, #0, 117f\n"
"ldr h11, [x16, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
"b 117f\n"
"105:" // Height 3: Partial accumulate: partial_4_16
"tbz x8, #2, 107f\n"
"ldr d10, [x16], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x8, #1, 106f\n"
"ld1 { v10.s }[2], [x16], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
"tbz x8, #0, 117f\n"
"ld1 { v10.h }[6], [x16]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
"b 117f\n"
"106:" // Height 3: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x8, #0, 117f\n"
"ld1 { v10.h }[4], [x16]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
"b 117f\n"
"107:" // Height 3: Partial accumulate: partial_2_16
"tbz x8, #1, 108f\n"
"ldr s10, [x16], #0x4\n"
"mov x20, #0x24\n"
- "ldr s14, [x26], #0x4\n"
- "ldr s18, [x25], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
"tbz x8, #0, 117f\n"
"ld1 { v10.h }[2], [x16]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
"b 117f\n"
"108:" // Height 3: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x8, #0, 117f\n"
"ldr h10, [x16, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
"b 117f\n"
"109:" // Height 3: Partial accumulate: partial_8_0
"tbz x8, #3, 113f\n"
"ld1 { v8.8h }, [x16], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
"tbz x8, #2, 111f\n"
"ldr d9, [x16], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x8, #1, 110f\n"
"ld1 { v9.s }[2], [x16], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
"tbz x8, #0, 117f\n"
"ld1 { v9.h }[6], [x16]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
"b 117f\n"
"110:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x8, #0, 117f\n"
"ld1 { v9.h }[4], [x16]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
"b 117f\n"
"111:" // Height 3: Partial accumulate: partial_2_8
"tbz x8, #1, 112f\n"
"ldr s9, [x16], #0x4\n"
"mov x20, #0x14\n"
- "ldr s13, [x26], #0x4\n"
- "ldr s17, [x25], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
"tbz x8, #0, 117f\n"
"ld1 { v9.h }[2], [x16]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
"b 117f\n"
"112:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x8, #0, 117f\n"
"ldr h9, [x16, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
"b 117f\n"
"113:" // Height 3: Partial accumulate: partial_4_0
"tbz x8, #2, 115f\n"
"ldr d8, [x16], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
"tbz x8, #1, 114f\n"
"ld1 { v8.s }[2], [x16], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
"tbz x8, #0, 117f\n"
"ld1 { v8.h }[6], [x16]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
"b 117f\n"
"114:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x8, #0, 117f\n"
"ld1 { v8.h }[4], [x16]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
"b 117f\n"
"115:" // Height 3: Partial accumulate: partial_2_0
"tbz x8, #1, 116f\n"
"ldr s8, [x16], #0x4\n"
"mov x20, #0x4\n"
- "ldr s12, [x26], #0x4\n"
- "ldr s16, [x25], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
"tbz x8, #0, 117f\n"
"ld1 { v8.h }[2], [x16]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
"b 117f\n"
"116:" // Height 3: Partial accumulate: partial_1_0
"ldr h8, [x16, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h12, [x26, #0x0]\n"
- "ldr h16, [x25, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
"117:" // Height 3: Partial accumulate: Done
"sub x16, x16, x20\n"
"b 120f\n"
@@ -1501,14 +1499,14 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"ldr q11, [x16, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
"b 120f\n"
"119:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -1527,8 +1525,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"mov x15, #0x0\n"
"121:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 122f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1563,15 +1561,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v16.8h, v6.8h, v2.h[0]\n"
"ldr d21, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x13, x13, #0x10\n"
+ "mov v21.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x12, x12, #0x10\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"ldr d20, [x17, #0x30]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x48]\n"
- "add x11, x11, #0x10\n"
- "ldr x24, [x13, #0x8]\n"
"mov v20.d[1], x20\n"
"fmla v10.8h, v21.8h, v0.h[0]\n"
"fmla v14.8h, v21.8h, v1.h[0]\n"
@@ -1579,15 +1573,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v18.8h, v21.8h, v2.h[0]\n"
"ldr d21, [x17, #0x40]\n"
"fmla v11.8h, v20.8h, v0.h[0]\n"
- "ldr x23, [x12, #0x8]\n"
+ "mov v21.d[1], x21\n"
"fmla v15.8h, v20.8h, v1.h[0]\n"
- "ldr x22, [x11, #0x8]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.8h, v20.8h, v2.h[0]\n"
"ldr d20, [x17, #0x50]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x68]\n"
- "sub x14, x14, #0x8\n"
- "prfm pldl1keep, [x13, #0x80]\n"
"mov v20.d[1], x20\n"
"fmla v8.8h, v21.8h, v0.h[1]\n"
"fmla v12.8h, v21.8h, v1.h[1]\n"
@@ -1595,14 +1585,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v16.8h, v21.8h, v2.h[1]\n"
"ldr d21, [x17, #0x60]\n"
"fmla v9.8h, v20.8h, v0.h[1]\n"
- "cmp x14, #0x10\n"
+ "mov v21.d[1], x21\n"
"fmla v13.8h, v20.8h, v1.h[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.8h, v20.8h, v2.h[1]\n"
"ldr d20, [x17, #0x70]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x88]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
"mov v20.d[1], x20\n"
"fmla v10.8h, v21.8h, v0.h[1]\n"
"fmla v14.8h, v21.8h, v1.h[1]\n"
@@ -1610,11 +1597,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v18.8h, v21.8h, v2.h[1]\n"
"ldr d21, [x17, #0x80]\n"
"fmla v11.8h, v20.8h, v0.h[1]\n"
+ "mov v21.d[1], x21\n"
"fmla v15.8h, v20.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.8h, v20.8h, v2.h[1]\n"
"ldr d20, [x17, #0x90]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xa8]\n"
"mov v20.d[1], x20\n"
"fmla v8.8h, v21.8h, v0.h[2]\n"
"fmla v12.8h, v21.8h, v1.h[2]\n"
@@ -1622,11 +1609,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v16.8h, v21.8h, v2.h[2]\n"
"ldr d21, [x17, #0xa0]\n"
"fmla v9.8h, v20.8h, v0.h[2]\n"
+ "mov v21.d[1], x21\n"
"fmla v13.8h, v20.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.8h, v20.8h, v2.h[2]\n"
"ldr d20, [x17, #0xb0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xc8]\n"
"mov v20.d[1], x20\n"
"fmla v10.8h, v21.8h, v0.h[2]\n"
"fmla v14.8h, v21.8h, v1.h[2]\n"
@@ -1634,11 +1621,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v18.8h, v21.8h, v2.h[2]\n"
"ldr d21, [x17, #0xc0]\n"
"fmla v11.8h, v20.8h, v0.h[2]\n"
+ "mov v21.d[1], x21\n"
"fmla v15.8h, v20.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.8h, v20.8h, v2.h[2]\n"
"ldr d20, [x17, #0xd0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xe8]\n"
"mov v20.d[1], x20\n"
"fmla v8.8h, v21.8h, v0.h[3]\n"
"fmla v12.8h, v21.8h, v1.h[3]\n"
@@ -1646,11 +1633,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v16.8h, v21.8h, v2.h[3]\n"
"ldr d21, [x17, #0xe0]\n"
"fmla v9.8h, v20.8h, v0.h[3]\n"
+ "mov v21.d[1], x21\n"
"fmla v13.8h, v20.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
"fmla v17.8h, v20.8h, v2.h[3]\n"
"ldr d20, [x17, #0xf0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x108]\n"
"mov v20.d[1], x20\n"
"fmla v10.8h, v21.8h, v0.h[3]\n"
"fmla v14.8h, v21.8h, v1.h[3]\n"
@@ -1658,11 +1645,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v18.8h, v21.8h, v2.h[3]\n"
"ldr d21, [x17, #0x100]\n"
"fmla v11.8h, v20.8h, v0.h[3]\n"
+ "mov v21.d[1], x21\n"
"fmla v15.8h, v20.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
"fmla v19.8h, v20.8h, v2.h[3]\n"
"ldr d20, [x17, #0x110]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x128]\n"
"mov v20.d[1], x20\n"
"fmla v8.8h, v21.8h, v0.h[4]\n"
"fmla v12.8h, v21.8h, v1.h[4]\n"
@@ -1670,11 +1657,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v16.8h, v21.8h, v2.h[4]\n"
"ldr d21, [x17, #0x120]\n"
"fmla v9.8h, v20.8h, v0.h[4]\n"
+ "mov v21.d[1], x21\n"
"fmla v13.8h, v20.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
"fmla v17.8h, v20.8h, v2.h[4]\n"
"ldr d20, [x17, #0x130]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x148]\n"
"mov v20.d[1], x20\n"
"fmla v10.8h, v21.8h, v0.h[4]\n"
"fmla v14.8h, v21.8h, v1.h[4]\n"
@@ -1682,11 +1669,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v18.8h, v21.8h, v2.h[4]\n"
"ldr d21, [x17, #0x140]\n"
"fmla v11.8h, v20.8h, v0.h[4]\n"
+ "mov v21.d[1], x21\n"
"fmla v15.8h, v20.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
"fmla v19.8h, v20.8h, v2.h[4]\n"
"ldr d20, [x17, #0x150]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x168]\n"
"mov v20.d[1], x20\n"
"fmla v8.8h, v21.8h, v0.h[5]\n"
"fmla v12.8h, v21.8h, v1.h[5]\n"
@@ -1694,11 +1681,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v16.8h, v21.8h, v2.h[5]\n"
"ldr d21, [x17, #0x160]\n"
"fmla v9.8h, v20.8h, v0.h[5]\n"
+ "mov v21.d[1], x21\n"
"fmla v13.8h, v20.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
"fmla v17.8h, v20.8h, v2.h[5]\n"
"ldr d20, [x17, #0x170]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x188]\n"
"mov v20.d[1], x20\n"
"fmla v10.8h, v21.8h, v0.h[5]\n"
"fmla v14.8h, v21.8h, v1.h[5]\n"
@@ -1706,11 +1693,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v18.8h, v21.8h, v2.h[5]\n"
"ldr d21, [x17, #0x180]\n"
"fmla v11.8h, v20.8h, v0.h[5]\n"
+ "mov v21.d[1], x21\n"
"fmla v15.8h, v20.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
"fmla v19.8h, v20.8h, v2.h[5]\n"
"ldr d20, [x17, #0x190]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x1a8]\n"
"mov v20.d[1], x20\n"
"fmla v8.8h, v21.8h, v0.h[6]\n"
"fmla v12.8h, v21.8h, v1.h[6]\n"
@@ -1718,11 +1705,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v16.8h, v21.8h, v2.h[6]\n"
"ldr d21, [x17, #0x1a0]\n"
"fmla v9.8h, v20.8h, v0.h[6]\n"
+ "mov v21.d[1], x21\n"
"fmla v13.8h, v20.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
"fmla v17.8h, v20.8h, v2.h[6]\n"
"ldr d20, [x17, #0x1b0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x1c8]\n"
"mov v20.d[1], x20\n"
"fmla v10.8h, v21.8h, v0.h[6]\n"
"fmla v14.8h, v21.8h, v1.h[6]\n"
@@ -1730,11 +1717,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v18.8h, v21.8h, v2.h[6]\n"
"ldr d21, [x17, #0x1c0]\n"
"fmla v11.8h, v20.8h, v0.h[6]\n"
+ "mov v21.d[1], x21\n"
"fmla v15.8h, v20.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
"fmla v19.8h, v20.8h, v2.h[6]\n"
"ldr d20, [x17, #0x1d0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x1e8]\n"
"mov v20.d[1], x20\n"
"fmla v8.8h, v21.8h, v0.h[7]\n"
"fmla v12.8h, v21.8h, v1.h[7]\n"
@@ -1742,29 +1729,40 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v16.8h, v21.8h, v2.h[7]\n"
"ldr d21, [x17, #0x1e0]\n"
"fmla v9.8h, v20.8h, v0.h[7]\n"
+ "mov v21.d[1], x21\n"
"fmla v13.8h, v20.8h, v1.h[7]\n"
+ "add x13, x13, #0x10\n"
"fmla v17.8h, v20.8h, v2.h[7]\n"
"ldr d20, [x17, #0x1f0]\n"
- "mov v21.d[1], x21\n"
- "add x17, x17, #0x200\n"
- "ldr x21, [x17, #0x8]\n"
"mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x17, x17, #0x200\n"
"fmla v10.8h, v21.8h, v0.h[7]\n"
+ "ldr x20, [x17, #0x8]\n"
"fmla v14.8h, v21.8h, v1.h[7]\n"
- "ldr x20, [x17, #0x18]\n"
+ "ldr x23, [x13, #0x8]\n"
"fmla v18.8h, v21.8h, v2.h[7]\n"
"ldr d6, [x17, #0x0]\n"
"fmla v11.8h, v20.8h, v0.h[7]\n"
"ldr d0, [x13, #0x0]\n"
"fmla v15.8h, v20.8h, v1.h[7]\n"
"ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
"fmla v19.8h, v20.8h, v2.h[7]\n"
"ldr d2, [x11, #0x0]\n"
+ "sub x14, x14, #0x8\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x21\n"
- "mov v0.d[1], x24\n"
- "mov v1.d[1], x23\n"
- "mov v2.d[1], x22\n"
+ "cmp x14, #0x10\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v0.d[1], x23\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"mov v7.d[1], x20\n"
"bge 124b\n"
"125:" // Height 3: Multiply loop: Single iteration only
@@ -1910,8 +1908,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr h1, [x12], #0x2\n"
"ldr h0, [x11], #0x2\n"
"ldr q21, [x17, #0x0]\n"
- "ldr q20, [x17, #0x10]\n"
"fmla v8.8h, v21.8h, v2.h[0]\n"
+ "ldr q20, [x17, #0x10]\n"
"fmla v12.8h, v21.8h, v1.h[0]\n"
"fmla v16.8h, v21.8h, v0.h[0]\n"
"ldr q21, [x17, #0x20]\n"
@@ -1933,28 +1931,28 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"cmp x15, x20\n"
"bne 121b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"prfm pstl1keep, [x16, #0x0]\n"
- "add x26, x16, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
- "prfm pstl1keep, [x26, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 129f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v20.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v20.8h\n"
+ "fmin v9.8h, v9.8h, v20.8h\n"
+ "fmin v10.8h, v10.8h, v20.8h\n"
+ "fmin v11.8h, v11.8h, v20.8h\n"
+ "fmin v12.8h, v12.8h, v20.8h\n"
+ "fmin v13.8h, v13.8h, v20.8h\n"
+ "fmin v14.8h, v14.8h, v20.8h\n"
+ "fmin v15.8h, v15.8h, v20.8h\n"
+ "fmin v16.8h, v16.8h, v20.8h\n"
+ "fmin v17.8h, v17.8h, v20.8h\n"
+ "fmin v18.8h, v18.8h, v20.8h\n"
+ "fmin v19.8h, v19.8h, v20.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v21.8h }, [x21]\n"
"ld1r { v20.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v21.8h\n"
- "fmin v9.8h, v9.8h, v21.8h\n"
- "fmin v10.8h, v10.8h, v21.8h\n"
- "fmin v11.8h, v11.8h, v21.8h\n"
- "fmin v12.8h, v12.8h, v21.8h\n"
- "fmin v13.8h, v13.8h, v21.8h\n"
- "fmin v14.8h, v14.8h, v21.8h\n"
- "fmin v15.8h, v15.8h, v21.8h\n"
- "fmin v16.8h, v16.8h, v21.8h\n"
- "fmin v17.8h, v17.8h, v21.8h\n"
- "fmin v18.8h, v18.8h, v21.8h\n"
- "fmin v19.8h, v19.8h, v21.8h\n"
"fmax v8.8h, v8.8h, v20.8h\n"
"fmax v9.8h, v9.8h, v20.8h\n"
"fmax v10.8h, v10.8h, v20.8h\n"
@@ -1973,159 +1971,159 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"tbz x8, #4, 137f\n"
"st1 { v8.8h }, [x16], #0x10\n"
"st1 { v9.8h }, [x16], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
"tbz x8, #3, 133f\n"
"st1 { v10.8h }, [x16], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
"tbz x8, #2, 131f\n"
"str d11, [x16], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x8, #1, 130f\n"
"st1 { v11.s }[2], [x16], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
"tbz x8, #0, 145f\n"
"st1 { v11.h }[6], [x16]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
"b 145f\n"
"130:" // Height 3: Partial direct writeback: partial_1_28
"tbz x8, #0, 145f\n"
"st1 { v11.h }[4], [x16]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
"b 145f\n"
"131:" // Height 3: Partial direct writeback: partial_2_24
"tbz x8, #1, 132f\n"
"str s11, [x16], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
"tbz x8, #0, 145f\n"
"st1 { v11.h }[2], [x16]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
"b 145f\n"
"132:" // Height 3: Partial direct writeback: partial_1_24
"tbz x8, #0, 145f\n"
"str h11, [x16, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
"b 145f\n"
"133:" // Height 3: Partial direct writeback: partial_4_16
"tbz x8, #2, 135f\n"
"str d10, [x16], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x8, #1, 134f\n"
"st1 { v10.s }[2], [x16], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
"tbz x8, #0, 145f\n"
"st1 { v10.h }[6], [x16]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
"b 145f\n"
"134:" // Height 3: Partial direct writeback: partial_1_20
"tbz x8, #0, 145f\n"
"st1 { v10.h }[4], [x16]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
"b 145f\n"
"135:" // Height 3: Partial direct writeback: partial_2_16
"tbz x8, #1, 136f\n"
"str s10, [x16], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
"tbz x8, #0, 145f\n"
"st1 { v10.h }[2], [x16]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
"b 145f\n"
"136:" // Height 3: Partial direct writeback: partial_1_16
"tbz x8, #0, 145f\n"
"str h10, [x16, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
"b 145f\n"
"137:" // Height 3: Partial direct writeback: partial_8_0
"tbz x8, #3, 141f\n"
"st1 { v8.8h }, [x16], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
"tbz x8, #2, 139f\n"
"str d9, [x16], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x8, #1, 138f\n"
"st1 { v9.s }[2], [x16], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
"tbz x8, #0, 145f\n"
"st1 { v9.h }[6], [x16]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
"b 145f\n"
"138:" // Height 3: Partial direct writeback: partial_1_12
"tbz x8, #0, 145f\n"
"st1 { v9.h }[4], [x16]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
"b 145f\n"
"139:" // Height 3: Partial direct writeback: partial_2_8
"tbz x8, #1, 140f\n"
"str s9, [x16], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
"tbz x8, #0, 145f\n"
"st1 { v9.h }[2], [x16]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
"b 145f\n"
"140:" // Height 3: Partial direct writeback: partial_1_8
"tbz x8, #0, 145f\n"
"str h9, [x16, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
"b 145f\n"
"141:" // Height 3: Partial direct writeback: partial_4_0
"tbz x8, #2, 143f\n"
"str d8, [x16], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x8, #1, 142f\n"
"st1 { v8.s }[2], [x16], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
"tbz x8, #0, 145f\n"
"st1 { v8.h }[6], [x16]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
"b 145f\n"
"142:" // Height 3: Partial direct writeback: partial_1_4
"tbz x8, #0, 145f\n"
"st1 { v8.h }[4], [x16]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
"b 145f\n"
"143:" // Height 3: Partial direct writeback: partial_2_0
"tbz x8, #1, 144f\n"
"str s8, [x16], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
"tbz x8, #0, 145f\n"
"st1 { v8.h }[2], [x16]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
"b 145f\n"
"144:" // Height 3: Partial direct writeback: partial_1_0
"str h8, [x16, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
"145:" // Height 3: Partial direct writeback: Done
"b 147f\n"
"146:" // Height 3: Full writeback
@@ -2134,35 +2132,35 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"str q10, [x16, #0x20]\n"
"str q11, [x16, #0x30]\n"
"add x16, x16, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"147:" // Height 3: Writeback done
"subs x8, x8, #0x20\n"
"bgt 100b\n"
"b 296f\n"
"148:" // Height 4
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
"149:" // Height 4: Column loop
"cbz x7, 150f\n"
"ldr q8, [x7, #0x0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
"mov v12.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x7, x7, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
"mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
@@ -2174,215 +2172,215 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"150:" // Height 4: no bias
"tbz %x[flags], #0, 168f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x20\n"
- "add x26, x16, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x16, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
+ "cmp x8, #0x20\n"
+ "add x23, x24, x20, LSL #1\n"
"bge 167f\n"
"tbz x8, #4, 158f\n"
"ld1 { v8.8h }, [x16], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
"ld1 { v9.8h }, [x16], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
- "ld1 { v21.8h }, [x24], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
"tbz x8, #3, 154f\n"
"ld1 { v10.8h }, [x16], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
- "ld1 { v22.8h }, [x24], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
"tbz x8, #2, 152f\n"
"ldr d11, [x16], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x8, #1, 151f\n"
"ld1 { v11.s }[2], [x16], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
"tbz x8, #0, 166f\n"
"ld1 { v11.h }[6], [x16]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
"b 166f\n"
"151:" // Height 4: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x8, #0, 166f\n"
"ld1 { v11.h }[4], [x16]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
"b 166f\n"
"152:" // Height 4: Partial accumulate: partial_2_24
"tbz x8, #1, 153f\n"
"ldr s11, [x16], #0x4\n"
"mov x20, #0x34\n"
- "ldr s15, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
"tbz x8, #0, 166f\n"
"ld1 { v11.h }[2], [x16]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
"b 166f\n"
"153:" // Height 4: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x8, #0, 166f\n"
"ldr h11, [x16, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
"b 166f\n"
"154:" // Height 4: Partial accumulate: partial_4_16
"tbz x8, #2, 156f\n"
"ldr d10, [x16], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x8, #1, 155f\n"
"ld1 { v10.s }[2], [x16], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
"tbz x8, #0, 166f\n"
"ld1 { v10.h }[6], [x16]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
- "ld1 { v22.h }[6], [x24]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
"b 166f\n"
"155:" // Height 4: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x8, #0, 166f\n"
"ld1 { v10.h }[4], [x16]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
- "ld1 { v22.h }[4], [x24]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
"b 166f\n"
"156:" // Height 4: Partial accumulate: partial_2_16
"tbz x8, #1, 157f\n"
"ldr s10, [x16], #0x4\n"
"mov x20, #0x24\n"
- "ldr s14, [x26], #0x4\n"
- "ldr s18, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
"tbz x8, #0, 166f\n"
"ld1 { v10.h }[2], [x16]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
- "ld1 { v22.h }[2], [x24]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
"b 166f\n"
"157:" // Height 4: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x8, #0, 166f\n"
"ldr h10, [x16, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
- "ldr h22, [x24, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
"b 166f\n"
"158:" // Height 4: Partial accumulate: partial_8_0
"tbz x8, #3, 162f\n"
"ld1 { v8.8h }, [x16], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
"tbz x8, #2, 160f\n"
"ldr d9, [x16], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x8, #1, 159f\n"
"ld1 { v9.s }[2], [x16], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
"tbz x8, #0, 166f\n"
"ld1 { v9.h }[6], [x16]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
"b 166f\n"
"159:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x8, #0, 166f\n"
"ld1 { v9.h }[4], [x16]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
"b 166f\n"
"160:" // Height 4: Partial accumulate: partial_2_8
"tbz x8, #1, 161f\n"
"ldr s9, [x16], #0x4\n"
"mov x20, #0x14\n"
- "ldr s13, [x26], #0x4\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
"tbz x8, #0, 166f\n"
"ld1 { v9.h }[2], [x16]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
"b 166f\n"
"161:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x8, #0, 166f\n"
"ldr h9, [x16, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
"b 166f\n"
"162:" // Height 4: Partial accumulate: partial_4_0
"tbz x8, #2, 164f\n"
"ldr d8, [x16], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x8, #1, 163f\n"
"ld1 { v8.s }[2], [x16], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
"tbz x8, #0, 166f\n"
"ld1 { v8.h }[6], [x16]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
- "ld1 { v20.h }[6], [x24]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
"b 166f\n"
"163:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x8, #0, 166f\n"
"ld1 { v8.h }[4], [x16]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
- "ld1 { v20.h }[4], [x24]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
"b 166f\n"
"164:" // Height 4: Partial accumulate: partial_2_0
"tbz x8, #1, 165f\n"
"ldr s8, [x16], #0x4\n"
"mov x20, #0x4\n"
- "ldr s12, [x26], #0x4\n"
- "ldr s16, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
"tbz x8, #0, 166f\n"
"ld1 { v8.h }[2], [x16]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
- "ld1 { v20.h }[2], [x24]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
"b 166f\n"
"165:" // Height 4: Partial accumulate: partial_1_0
"ldr h8, [x16, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h12, [x26, #0x0]\n"
- "ldr h16, [x25, #0x0]\n"
- "ldr h20, [x24, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
"166:" // Height 4: Partial accumulate: Done
"sub x16, x16, x20\n"
"b 169f\n"
@@ -2391,18 +2389,18 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"ldr q11, [x16, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"b 169f\n"
"168:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -2425,8 +2423,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"mov x15, #0x0\n"
"170:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 171f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2459,233 +2457,234 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"blt 174f\n"
"173:" // Height 4: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr x20, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr x21, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"add x13, x13, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
"ldr d25, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x12, x12, #0x10\n"
+ "mov v25.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x11, x11, #0x10\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "mov v25.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"ldr d24, [x17, #0x30]\n"
+ "mov v24.d[1], x20\n"
"fmla v10.8h, v25.8h, v0.h[0]\n"
- "ldr x20, [x17, #0x48]\n"
"fmla v14.8h, v25.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x58]\n"
"fmla v18.8h, v25.8h, v2.h[0]\n"
+ "add x11, x11, #0x10\n"
"fmla v22.8h, v25.8h, v3.h[0]\n"
"ldr d25, [x17, #0x40]\n"
"fmla v11.8h, v24.8h, v0.h[0]\n"
- "ldr x21, [x17, #0x58]\n"
+ "mov v25.d[1], x21\n"
"fmla v15.8h, v24.8h, v1.h[0]\n"
- "ldr x25, [x13, #0x8]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.8h, v24.8h, v2.h[0]\n"
- "mov v25.d[1], x20\n"
+ "add x10, x10, #0x10\n"
"fmla v23.8h, v24.8h, v3.h[0]\n"
"ldr d24, [x17, #0x50]\n"
+ "mov v24.d[1], x20\n"
"fmla v8.8h, v25.8h, v0.h[1]\n"
- "ldr x20, [x17, #0x68]\n"
"fmla v12.8h, v25.8h, v1.h[1]\n"
- "ldr x24, [x12, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v16.8h, v25.8h, v2.h[1]\n"
+ "ldr x25, [x13, #0x8]\n"
"fmla v20.8h, v25.8h, v3.h[1]\n"
"ldr d25, [x17, #0x60]\n"
"fmla v9.8h, v24.8h, v0.h[1]\n"
- "ldr x21, [x17, #0x78]\n"
+ "mov v25.d[1], x21\n"
"fmla v13.8h, v24.8h, v1.h[1]\n"
- "ldr x23, [x11, #0x8]\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.8h, v24.8h, v2.h[1]\n"
- "mov v25.d[1], x20\n"
+ "ldr x24, [x12, #0x8]\n"
"fmla v21.8h, v24.8h, v3.h[1]\n"
"ldr d24, [x17, #0x70]\n"
+ "mov v24.d[1], x20\n"
"fmla v10.8h, v25.8h, v0.h[1]\n"
- "ldr x20, [x17, #0x88]\n"
"fmla v14.8h, v25.8h, v1.h[1]\n"
- "ldr x22, [x10, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x98]\n"
"fmla v18.8h, v25.8h, v2.h[1]\n"
+ "ldr x23, [x11, #0x8]\n"
"fmla v22.8h, v25.8h, v3.h[1]\n"
"ldr d25, [x17, #0x80]\n"
"fmla v11.8h, v24.8h, v0.h[1]\n"
- "ldr x21, [x17, #0x98]\n"
+ "mov v25.d[1], x21\n"
"fmla v15.8h, v24.8h, v1.h[1]\n"
- "sub x14, x14, #0x8\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.8h, v24.8h, v2.h[1]\n"
- "mov v25.d[1], x20\n"
+ "ldr x22, [x10, #0x8]\n"
"fmla v23.8h, v24.8h, v3.h[1]\n"
"ldr d24, [x17, #0x90]\n"
+ "mov v24.d[1], x20\n"
"fmla v8.8h, v25.8h, v0.h[2]\n"
- "ldr x20, [x17, #0xa8]\n"
"fmla v12.8h, v25.8h, v1.h[2]\n"
- "cmp x14, #0x10\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v16.8h, v25.8h, v2.h[2]\n"
+ "sub x14, x14, #0x8\n"
"fmla v20.8h, v25.8h, v3.h[2]\n"
"ldr d25, [x17, #0xa0]\n"
"fmla v9.8h, v24.8h, v0.h[2]\n"
- "ldr x21, [x17, #0xb8]\n"
+ "mov v25.d[1], x21\n"
"fmla v13.8h, v24.8h, v1.h[2]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.8h, v24.8h, v2.h[2]\n"
- "mov v25.d[1], x20\n"
+ "cmp x14, #0x10\n"
"fmla v21.8h, v24.8h, v3.h[2]\n"
"ldr d24, [x17, #0xb0]\n"
+ "mov v24.d[1], x20\n"
"fmla v10.8h, v25.8h, v0.h[2]\n"
- "ldr x20, [x17, #0xc8]\n"
"fmla v14.8h, v25.8h, v1.h[2]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0xd8]\n"
"fmla v18.8h, v25.8h, v2.h[2]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v22.8h, v25.8h, v3.h[2]\n"
"ldr d25, [x17, #0xc0]\n"
"fmla v11.8h, v24.8h, v0.h[2]\n"
- "ldr x21, [x17, #0xd8]\n"
+ "mov v25.d[1], x21\n"
"fmla v15.8h, v24.8h, v1.h[2]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.8h, v24.8h, v2.h[2]\n"
- "mov v25.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v23.8h, v24.8h, v3.h[2]\n"
"ldr d24, [x17, #0xd0]\n"
+ "mov v24.d[1], x20\n"
"fmla v8.8h, v25.8h, v0.h[3]\n"
- "ldr x20, [x17, #0xe8]\n"
"fmla v12.8h, v25.8h, v1.h[3]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v16.8h, v25.8h, v2.h[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v20.8h, v25.8h, v3.h[3]\n"
"ldr d25, [x17, #0xe0]\n"
"fmla v9.8h, v24.8h, v0.h[3]\n"
- "ldr x21, [x17, #0xf8]\n"
+ "mov v25.d[1], x21\n"
"fmla v13.8h, v24.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
"fmla v17.8h, v24.8h, v2.h[3]\n"
- "mov v25.d[1], x20\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v21.8h, v24.8h, v3.h[3]\n"
"ldr d24, [x17, #0xf0]\n"
+ "mov v24.d[1], x20\n"
"fmla v10.8h, v25.8h, v0.h[3]\n"
- "ldr x20, [x17, #0x108]\n"
"fmla v14.8h, v25.8h, v1.h[3]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x118]\n"
"fmla v18.8h, v25.8h, v2.h[3]\n"
"fmla v22.8h, v25.8h, v3.h[3]\n"
"ldr d25, [x17, #0x100]\n"
"fmla v11.8h, v24.8h, v0.h[3]\n"
- "ldr x21, [x17, #0x118]\n"
+ "mov v25.d[1], x21\n"
"fmla v15.8h, v24.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
"fmla v19.8h, v24.8h, v2.h[3]\n"
- "mov v25.d[1], x20\n"
"fmla v23.8h, v24.8h, v3.h[3]\n"
"ldr d24, [x17, #0x110]\n"
+ "mov v24.d[1], x20\n"
"fmla v8.8h, v25.8h, v0.h[4]\n"
- "ldr x20, [x17, #0x128]\n"
"fmla v12.8h, v25.8h, v1.h[4]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x138]\n"
"fmla v16.8h, v25.8h, v2.h[4]\n"
"fmla v20.8h, v25.8h, v3.h[4]\n"
"ldr d25, [x17, #0x120]\n"
"fmla v9.8h, v24.8h, v0.h[4]\n"
- "ldr x21, [x17, #0x138]\n"
+ "mov v25.d[1], x21\n"
"fmla v13.8h, v24.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
"fmla v17.8h, v24.8h, v2.h[4]\n"
- "mov v25.d[1], x20\n"
"fmla v21.8h, v24.8h, v3.h[4]\n"
"ldr d24, [x17, #0x130]\n"
+ "mov v24.d[1], x20\n"
"fmla v10.8h, v25.8h, v0.h[4]\n"
- "ldr x20, [x17, #0x148]\n"
"fmla v14.8h, v25.8h, v1.h[4]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x158]\n"
"fmla v18.8h, v25.8h, v2.h[4]\n"
"fmla v22.8h, v25.8h, v3.h[4]\n"
"ldr d25, [x17, #0x140]\n"
"fmla v11.8h, v24.8h, v0.h[4]\n"
- "ldr x21, [x17, #0x158]\n"
+ "mov v25.d[1], x21\n"
"fmla v15.8h, v24.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
"fmla v19.8h, v24.8h, v2.h[4]\n"
- "mov v25.d[1], x20\n"
"fmla v23.8h, v24.8h, v3.h[4]\n"
"ldr d24, [x17, #0x150]\n"
+ "mov v24.d[1], x20\n"
"fmla v8.8h, v25.8h, v0.h[5]\n"
- "ldr x20, [x17, #0x168]\n"
"fmla v12.8h, v25.8h, v1.h[5]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x178]\n"
"fmla v16.8h, v25.8h, v2.h[5]\n"
"fmla v20.8h, v25.8h, v3.h[5]\n"
"ldr d25, [x17, #0x160]\n"
"fmla v9.8h, v24.8h, v0.h[5]\n"
- "ldr x21, [x17, #0x178]\n"
+ "mov v25.d[1], x21\n"
"fmla v13.8h, v24.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
"fmla v17.8h, v24.8h, v2.h[5]\n"
- "mov v25.d[1], x20\n"
"fmla v21.8h, v24.8h, v3.h[5]\n"
"ldr d24, [x17, #0x170]\n"
+ "mov v24.d[1], x20\n"
"fmla v10.8h, v25.8h, v0.h[5]\n"
- "ldr x20, [x17, #0x188]\n"
"fmla v14.8h, v25.8h, v1.h[5]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x198]\n"
"fmla v18.8h, v25.8h, v2.h[5]\n"
"fmla v22.8h, v25.8h, v3.h[5]\n"
"ldr d25, [x17, #0x180]\n"
"fmla v11.8h, v24.8h, v0.h[5]\n"
- "ldr x21, [x17, #0x198]\n"
+ "mov v25.d[1], x21\n"
"fmla v15.8h, v24.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
"fmla v19.8h, v24.8h, v2.h[5]\n"
- "mov v25.d[1], x20\n"
"fmla v23.8h, v24.8h, v3.h[5]\n"
"ldr d24, [x17, #0x190]\n"
+ "mov v24.d[1], x20\n"
"fmla v8.8h, v25.8h, v0.h[6]\n"
- "ldr x20, [x17, #0x1a8]\n"
"fmla v12.8h, v25.8h, v1.h[6]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x1b8]\n"
"fmla v16.8h, v25.8h, v2.h[6]\n"
"fmla v20.8h, v25.8h, v3.h[6]\n"
"ldr d25, [x17, #0x1a0]\n"
"fmla v9.8h, v24.8h, v0.h[6]\n"
- "ldr x21, [x17, #0x1b8]\n"
+ "mov v25.d[1], x21\n"
"fmla v13.8h, v24.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
"fmla v17.8h, v24.8h, v2.h[6]\n"
- "mov v25.d[1], x20\n"
"fmla v21.8h, v24.8h, v3.h[6]\n"
"ldr d24, [x17, #0x1b0]\n"
+ "mov v24.d[1], x20\n"
"fmla v10.8h, v25.8h, v0.h[6]\n"
- "ldr x20, [x17, #0x1c8]\n"
"fmla v14.8h, v25.8h, v1.h[6]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x1d8]\n"
"fmla v18.8h, v25.8h, v2.h[6]\n"
"fmla v22.8h, v25.8h, v3.h[6]\n"
"ldr d25, [x17, #0x1c0]\n"
"fmla v11.8h, v24.8h, v0.h[6]\n"
- "ldr x21, [x17, #0x1d8]\n"
+ "mov v25.d[1], x21\n"
"fmla v15.8h, v24.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
"fmla v19.8h, v24.8h, v2.h[6]\n"
- "mov v25.d[1], x20\n"
"fmla v23.8h, v24.8h, v3.h[6]\n"
"ldr d24, [x17, #0x1d0]\n"
+ "mov v24.d[1], x20\n"
"fmla v8.8h, v25.8h, v0.h[7]\n"
- "ldr x20, [x17, #0x1e8]\n"
"fmla v12.8h, v25.8h, v1.h[7]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x1f8]\n"
"fmla v16.8h, v25.8h, v2.h[7]\n"
"fmla v20.8h, v25.8h, v3.h[7]\n"
"ldr d25, [x17, #0x1e0]\n"
"fmla v9.8h, v24.8h, v0.h[7]\n"
- "ldr x21, [x17, #0x1f8]\n"
+ "mov v25.d[1], x21\n"
"fmla v13.8h, v24.8h, v1.h[7]\n"
"fmla v17.8h, v24.8h, v2.h[7]\n"
- "mov v25.d[1], x20\n"
"fmla v21.8h, v24.8h, v3.h[7]\n"
"ldr d24, [x17, #0x1f0]\n"
+ "mov v24.d[1], x20\n"
"add x17, x17, #0x200\n"
"fmla v10.8h, v25.8h, v0.h[7]\n"
+ "ldr x21, [x17, #0x8]\n"
"fmla v14.8h, v25.8h, v1.h[7]\n"
- "ldr x20, [x17, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x18]\n"
"fmla v18.8h, v25.8h, v2.h[7]\n"
"fmla v22.8h, v25.8h, v3.h[7]\n"
"ldr d6, [x17, #0x0]\n"
@@ -2698,8 +2697,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v23.8h, v24.8h, v3.h[7]\n"
"ldr d3, [x10, #0x0]\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x20\n"
- "ldr x20, [x17, #0x18]\n"
+ "mov v6.d[1], x21\n"
"mov v0.d[1], x25\n"
"mov v1.d[1], x24\n"
"mov v2.d[1], x23\n"
@@ -2884,8 +2882,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr h1, [x11], #0x2\n"
"ldr h0, [x10], #0x2\n"
"ldr q25, [x17, #0x0]\n"
- "ldr q24, [x17, #0x10]\n"
"fmla v8.8h, v25.8h, v3.h[0]\n"
+ "ldr q24, [x17, #0x10]\n"
"fmla v12.8h, v25.8h, v2.h[0]\n"
"fmla v16.8h, v25.8h, v1.h[0]\n"
"fmla v20.8h, v25.8h, v0.h[0]\n"
@@ -2911,34 +2909,34 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"cmp x15, x20\n"
"bne 170b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x26, x16, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x16, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 178f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v24.8h\n"
+ "fmin v9.8h, v9.8h, v24.8h\n"
+ "fmin v10.8h, v10.8h, v24.8h\n"
+ "fmin v11.8h, v11.8h, v24.8h\n"
+ "fmin v12.8h, v12.8h, v24.8h\n"
+ "fmin v13.8h, v13.8h, v24.8h\n"
+ "fmin v14.8h, v14.8h, v24.8h\n"
+ "fmin v15.8h, v15.8h, v24.8h\n"
+ "fmin v16.8h, v16.8h, v24.8h\n"
+ "fmin v17.8h, v17.8h, v24.8h\n"
+ "fmin v18.8h, v18.8h, v24.8h\n"
+ "fmin v19.8h, v19.8h, v24.8h\n"
+ "fmin v20.8h, v20.8h, v24.8h\n"
+ "fmin v21.8h, v21.8h, v24.8h\n"
+ "fmin v22.8h, v22.8h, v24.8h\n"
+ "fmin v23.8h, v23.8h, v24.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v25.8h }, [x21]\n"
"ld1r { v24.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v25.8h\n"
- "fmin v9.8h, v9.8h, v25.8h\n"
- "fmin v10.8h, v10.8h, v25.8h\n"
- "fmin v11.8h, v11.8h, v25.8h\n"
- "fmin v12.8h, v12.8h, v25.8h\n"
- "fmin v13.8h, v13.8h, v25.8h\n"
- "fmin v14.8h, v14.8h, v25.8h\n"
- "fmin v15.8h, v15.8h, v25.8h\n"
- "fmin v16.8h, v16.8h, v25.8h\n"
- "fmin v17.8h, v17.8h, v25.8h\n"
- "fmin v18.8h, v18.8h, v25.8h\n"
- "fmin v19.8h, v19.8h, v25.8h\n"
- "fmin v20.8h, v20.8h, v25.8h\n"
- "fmin v21.8h, v21.8h, v25.8h\n"
- "fmin v22.8h, v22.8h, v25.8h\n"
- "fmin v23.8h, v23.8h, v25.8h\n"
"fmax v8.8h, v8.8h, v24.8h\n"
"fmax v9.8h, v9.8h, v24.8h\n"
"fmax v10.8h, v10.8h, v24.8h\n"
@@ -2961,191 +2959,191 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"tbz x8, #4, 186f\n"
"st1 { v8.8h }, [x16], #0x10\n"
"st1 { v9.8h }, [x16], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v21.8h }, [x24], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
"tbz x8, #3, 182f\n"
"st1 { v10.8h }, [x16], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
- "st1 { v22.8h }, [x24], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
"tbz x8, #2, 180f\n"
"str d11, [x16], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
"tbz x8, #1, 179f\n"
"st1 { v11.s }[2], [x16], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
- "st1 { v23.s }[2], [x24], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
"tbz x8, #0, 194f\n"
"st1 { v11.h }[6], [x16]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
- "st1 { v23.h }[6], [x24]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
"b 194f\n"
"179:" // Height 4: Partial direct writeback: partial_1_28
"tbz x8, #0, 194f\n"
"st1 { v11.h }[4], [x16]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
- "st1 { v23.h }[4], [x24]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
"b 194f\n"
"180:" // Height 4: Partial direct writeback: partial_2_24
"tbz x8, #1, 181f\n"
"str s11, [x16], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
- "str s23, [x24], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
"tbz x8, #0, 194f\n"
"st1 { v11.h }[2], [x16]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
- "st1 { v23.h }[2], [x24]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
"b 194f\n"
"181:" // Height 4: Partial direct writeback: partial_1_24
"tbz x8, #0, 194f\n"
"str h11, [x16, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
- "str h23, [x24, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
"b 194f\n"
"182:" // Height 4: Partial direct writeback: partial_4_16
"tbz x8, #2, 184f\n"
"str d10, [x16], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
"tbz x8, #1, 183f\n"
"st1 { v10.s }[2], [x16], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
- "st1 { v22.s }[2], [x24], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
"tbz x8, #0, 194f\n"
"st1 { v10.h }[6], [x16]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
- "st1 { v22.h }[6], [x24]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
"b 194f\n"
"183:" // Height 4: Partial direct writeback: partial_1_20
"tbz x8, #0, 194f\n"
"st1 { v10.h }[4], [x16]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
- "st1 { v22.h }[4], [x24]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
"b 194f\n"
"184:" // Height 4: Partial direct writeback: partial_2_16
"tbz x8, #1, 185f\n"
"str s10, [x16], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
- "str s22, [x24], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
"tbz x8, #0, 194f\n"
"st1 { v10.h }[2], [x16]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
- "st1 { v22.h }[2], [x24]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
"b 194f\n"
"185:" // Height 4: Partial direct writeback: partial_1_16
"tbz x8, #0, 194f\n"
"str h10, [x16, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
- "str h22, [x24, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
"b 194f\n"
"186:" // Height 4: Partial direct writeback: partial_8_0
"tbz x8, #3, 190f\n"
"st1 { v8.8h }, [x16], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
"tbz x8, #2, 188f\n"
"str d9, [x16], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
"tbz x8, #1, 187f\n"
"st1 { v9.s }[2], [x16], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
"tbz x8, #0, 194f\n"
"st1 { v9.h }[6], [x16]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
- "st1 { v21.h }[6], [x24]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
"b 194f\n"
"187:" // Height 4: Partial direct writeback: partial_1_12
"tbz x8, #0, 194f\n"
"st1 { v9.h }[4], [x16]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
- "st1 { v21.h }[4], [x24]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
"b 194f\n"
"188:" // Height 4: Partial direct writeback: partial_2_8
"tbz x8, #1, 189f\n"
"str s9, [x16], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
- "str s21, [x24], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
"tbz x8, #0, 194f\n"
"st1 { v9.h }[2], [x16]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
- "st1 { v21.h }[2], [x24]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
"b 194f\n"
"189:" // Height 4: Partial direct writeback: partial_1_8
"tbz x8, #0, 194f\n"
"str h9, [x16, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
- "str h21, [x24, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
"b 194f\n"
"190:" // Height 4: Partial direct writeback: partial_4_0
"tbz x8, #2, 192f\n"
"str d8, [x16], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x8, #1, 191f\n"
"st1 { v8.s }[2], [x16], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x8, #0, 194f\n"
"st1 { v8.h }[6], [x16]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
- "st1 { v20.h }[6], [x24]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
"b 194f\n"
"191:" // Height 4: Partial direct writeback: partial_1_4
"tbz x8, #0, 194f\n"
"st1 { v8.h }[4], [x16]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
- "st1 { v20.h }[4], [x24]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
"b 194f\n"
"192:" // Height 4: Partial direct writeback: partial_2_0
"tbz x8, #1, 193f\n"
"str s8, [x16], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x8, #0, 194f\n"
"st1 { v8.h }[2], [x16]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
- "st1 { v20.h }[2], [x24]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
"b 194f\n"
"193:" // Height 4: Partial direct writeback: partial_1_0
"str h8, [x16, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
- "str h20, [x24, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
"194:" // Height 4: Partial direct writeback: Done
"b 196f\n"
"195:" // Height 4: Full writeback
@@ -3154,39 +3152,39 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"str q10, [x16, #0x20]\n"
"str q11, [x16, #0x30]\n"
"add x16, x16, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
"196:" // Height 4: Writeback done
"subs x8, x8, #0x20\n"
"bgt 149b\n"
"b 296f\n"
"197:" // Height 5
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
"198:" // Height 5: Column loop
"cbz x7, 199f\n"
"ldr q8, [x7, #0x0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
"mov v12.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x7, x7, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
"mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
@@ -3202,248 +3200,248 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"199:" // Height 5: no bias
"tbz %x[flags], #0, 217f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x20\n"
- "add x26, x16, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x16, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "cmp x8, #0x20\n"
+ "add x22, x23, x20, LSL #1\n"
"bge 216f\n"
"tbz x8, #4, 207f\n"
"ld1 { v8.8h }, [x16], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
"ld1 { v9.8h }, [x16], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
- "ld1 { v21.8h }, [x24], #0x10\n"
- "ld1 { v25.8h }, [x23], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
"tbz x8, #3, 203f\n"
"ld1 { v10.8h }, [x16], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
- "ld1 { v22.8h }, [x24], #0x10\n"
- "ld1 { v26.8h }, [x23], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
"tbz x8, #2, 201f\n"
"ldr d11, [x16], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x8, #1, 200f\n"
"ld1 { v11.s }[2], [x16], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
"tbz x8, #0, 215f\n"
"ld1 { v11.h }[6], [x16]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
- "ld1 { v27.h }[6], [x23]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
"b 215f\n"
"200:" // Height 5: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x8, #0, 215f\n"
"ld1 { v11.h }[4], [x16]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
- "ld1 { v27.h }[4], [x23]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
"b 215f\n"
"201:" // Height 5: Partial accumulate: partial_2_24
"tbz x8, #1, 202f\n"
"ldr s11, [x16], #0x4\n"
"mov x20, #0x34\n"
- "ldr s15, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
"tbz x8, #0, 215f\n"
"ld1 { v11.h }[2], [x16]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
- "ld1 { v27.h }[2], [x23]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
"b 215f\n"
"202:" // Height 5: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x8, #0, 215f\n"
"ldr h11, [x16, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
"b 215f\n"
"203:" // Height 5: Partial accumulate: partial_4_16
"tbz x8, #2, 205f\n"
"ldr d10, [x16], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x8, #1, 204f\n"
"ld1 { v10.s }[2], [x16], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
"tbz x8, #0, 215f\n"
"ld1 { v10.h }[6], [x16]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
- "ld1 { v22.h }[6], [x24]\n"
- "ld1 { v26.h }[6], [x23]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
"b 215f\n"
"204:" // Height 5: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x8, #0, 215f\n"
"ld1 { v10.h }[4], [x16]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
- "ld1 { v22.h }[4], [x24]\n"
- "ld1 { v26.h }[4], [x23]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
"b 215f\n"
"205:" // Height 5: Partial accumulate: partial_2_16
"tbz x8, #1, 206f\n"
"ldr s10, [x16], #0x4\n"
"mov x20, #0x24\n"
- "ldr s14, [x26], #0x4\n"
- "ldr s18, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
"tbz x8, #0, 215f\n"
"ld1 { v10.h }[2], [x16]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
- "ld1 { v22.h }[2], [x24]\n"
- "ld1 { v26.h }[2], [x23]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
"b 215f\n"
"206:" // Height 5: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x8, #0, 215f\n"
"ldr h10, [x16, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
- "ldr h22, [x24, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
"b 215f\n"
"207:" // Height 5: Partial accumulate: partial_8_0
"tbz x8, #3, 211f\n"
"ld1 { v8.8h }, [x16], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
"tbz x8, #2, 209f\n"
"ldr d9, [x16], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x8, #1, 208f\n"
"ld1 { v9.s }[2], [x16], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
"tbz x8, #0, 215f\n"
"ld1 { v9.h }[6], [x16]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
- "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
"b 215f\n"
"208:" // Height 5: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x8, #0, 215f\n"
"ld1 { v9.h }[4], [x16]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
- "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
"b 215f\n"
"209:" // Height 5: Partial accumulate: partial_2_8
"tbz x8, #1, 210f\n"
"ldr s9, [x16], #0x4\n"
"mov x20, #0x14\n"
- "ldr s13, [x26], #0x4\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
"tbz x8, #0, 215f\n"
"ld1 { v9.h }[2], [x16]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
- "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
"b 215f\n"
"210:" // Height 5: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x8, #0, 215f\n"
"ldr h9, [x16, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
"b 215f\n"
"211:" // Height 5: Partial accumulate: partial_4_0
"tbz x8, #2, 213f\n"
"ldr d8, [x16], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x8, #1, 212f\n"
"ld1 { v8.s }[2], [x16], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
"tbz x8, #0, 215f\n"
"ld1 { v8.h }[6], [x16]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
- "ld1 { v20.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
"b 215f\n"
"212:" // Height 5: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x8, #0, 215f\n"
"ld1 { v8.h }[4], [x16]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
- "ld1 { v20.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
"b 215f\n"
"213:" // Height 5: Partial accumulate: partial_2_0
"tbz x8, #1, 214f\n"
"ldr s8, [x16], #0x4\n"
"mov x20, #0x4\n"
- "ldr s12, [x26], #0x4\n"
- "ldr s16, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
"tbz x8, #0, 215f\n"
"ld1 { v8.h }[2], [x16]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
- "ld1 { v20.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
"b 215f\n"
"214:" // Height 5: Partial accumulate: partial_1_0
"ldr h8, [x16, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h12, [x26, #0x0]\n"
- "ldr h16, [x25, #0x0]\n"
- "ldr h20, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
"215:" // Height 5: Partial accumulate: Done
"sub x16, x16, x20\n"
"b 218f\n"
@@ -3452,22 +3450,22 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"ldr q11, [x16, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
"b 218f\n"
"217:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
@@ -3494,8 +3492,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"mov x15, #0x0\n"
"219:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 220f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -3542,259 +3540,259 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v24.8h, v6.8h, v4.h[0]\n"
"ldr d29, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x11, x11, #0x10\n"
+ "mov v29.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "mov v29.d[1], x21\n"
+ "add x11, x11, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr x21, [x17, #0x48]\n"
+ "add x10, x10, #0x10\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
"ldr d28, [x17, #0x30]\n"
+ "mov v28.d[1], x20\n"
"fmla v10.8h, v29.8h, v0.h[0]\n"
- "add x9, x9, #0x10\n"
"fmla v14.8h, v29.8h, v1.h[0]\n"
- "ldr x26, [x13, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
"fmla v18.8h, v29.8h, v2.h[0]\n"
+ "add x9, x9, #0x10\n"
"fmla v22.8h, v29.8h, v3.h[0]\n"
- "ldr x20, [x17, #0x58]\n"
+ "ldr x26, [x13, #0x8]\n"
"fmla v26.8h, v29.8h, v4.h[0]\n"
"ldr d29, [x17, #0x40]\n"
"fmla v11.8h, v28.8h, v0.h[0]\n"
- "ldr x25, [x12, #0x8]\n"
+ "mov v29.d[1], x21\n"
"fmla v15.8h, v28.8h, v1.h[0]\n"
- "ldr x24, [x11, #0x8]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.8h, v28.8h, v2.h[0]\n"
- "mov v29.d[1], x21\n"
+ "ldr x25, [x12, #0x8]\n"
"fmla v23.8h, v28.8h, v3.h[0]\n"
- "ldr x21, [x17, #0x68]\n"
+ "ldr x24, [x11, #0x8]\n"
"fmla v27.8h, v28.8h, v4.h[0]\n"
"ldr d28, [x17, #0x50]\n"
+ "mov v28.d[1], x20\n"
"fmla v8.8h, v29.8h, v0.h[1]\n"
- "ldr x23, [x10, #0x8]\n"
"fmla v12.8h, v29.8h, v1.h[1]\n"
- "ldr x22, [x9, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v16.8h, v29.8h, v2.h[1]\n"
+ "ldr x23, [x10, #0x8]\n"
"fmla v20.8h, v29.8h, v3.h[1]\n"
- "ldr x20, [x17, #0x78]\n"
+ "ldr x22, [x9, #0x8]\n"
"fmla v24.8h, v29.8h, v4.h[1]\n"
"ldr d29, [x17, #0x60]\n"
"fmla v9.8h, v28.8h, v0.h[1]\n"
- "sub x14, x14, #0x8\n"
+ "mov v29.d[1], x21\n"
"fmla v13.8h, v28.8h, v1.h[1]\n"
- "cmp x14, #0x10\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.8h, v28.8h, v2.h[1]\n"
- "mov v29.d[1], x21\n"
+ "sub x14, x14, #0x8\n"
"fmla v21.8h, v28.8h, v3.h[1]\n"
- "ldr x21, [x17, #0x88]\n"
+ "cmp x14, #0x10\n"
"fmla v25.8h, v28.8h, v4.h[1]\n"
"ldr d28, [x17, #0x70]\n"
+ "mov v28.d[1], x20\n"
"fmla v10.8h, v29.8h, v0.h[1]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
"fmla v14.8h, v29.8h, v1.h[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
"fmla v18.8h, v29.8h, v2.h[1]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v22.8h, v29.8h, v3.h[1]\n"
- "ldr x20, [x17, #0x98]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v26.8h, v29.8h, v4.h[1]\n"
"ldr d29, [x17, #0x80]\n"
"fmla v11.8h, v28.8h, v0.h[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v29.d[1], x21\n"
"fmla v15.8h, v28.8h, v1.h[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.8h, v28.8h, v2.h[1]\n"
- "mov v29.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v23.8h, v28.8h, v3.h[1]\n"
- "ldr x21, [x17, #0xa8]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v27.8h, v28.8h, v4.h[1]\n"
"ldr d28, [x17, #0x90]\n"
+ "mov v28.d[1], x20\n"
"fmla v8.8h, v29.8h, v0.h[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
"fmla v12.8h, v29.8h, v1.h[2]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v16.8h, v29.8h, v2.h[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v20.8h, v29.8h, v3.h[2]\n"
- "ldr x20, [x17, #0xb8]\n"
"fmla v24.8h, v29.8h, v4.h[2]\n"
"ldr d29, [x17, #0xa0]\n"
"fmla v9.8h, v28.8h, v0.h[2]\n"
+ "mov v29.d[1], x21\n"
"fmla v13.8h, v28.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.8h, v28.8h, v2.h[2]\n"
- "mov v29.d[1], x21\n"
"fmla v21.8h, v28.8h, v3.h[2]\n"
- "ldr x21, [x17, #0xc8]\n"
"fmla v25.8h, v28.8h, v4.h[2]\n"
"ldr d28, [x17, #0xb0]\n"
+ "mov v28.d[1], x20\n"
"fmla v10.8h, v29.8h, v0.h[2]\n"
"fmla v14.8h, v29.8h, v1.h[2]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
"fmla v18.8h, v29.8h, v2.h[2]\n"
"fmla v22.8h, v29.8h, v3.h[2]\n"
- "ldr x20, [x17, #0xd8]\n"
"fmla v26.8h, v29.8h, v4.h[2]\n"
"ldr d29, [x17, #0xc0]\n"
"fmla v11.8h, v28.8h, v0.h[2]\n"
+ "mov v29.d[1], x21\n"
"fmla v15.8h, v28.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.8h, v28.8h, v2.h[2]\n"
- "mov v29.d[1], x21\n"
"fmla v23.8h, v28.8h, v3.h[2]\n"
- "ldr x21, [x17, #0xe8]\n"
"fmla v27.8h, v28.8h, v4.h[2]\n"
"ldr d28, [x17, #0xd0]\n"
+ "mov v28.d[1], x20\n"
"fmla v8.8h, v29.8h, v0.h[3]\n"
"fmla v12.8h, v29.8h, v1.h[3]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v16.8h, v29.8h, v2.h[3]\n"
"fmla v20.8h, v29.8h, v3.h[3]\n"
- "ldr x20, [x17, #0xf8]\n"
"fmla v24.8h, v29.8h, v4.h[3]\n"
"ldr d29, [x17, #0xe0]\n"
"fmla v9.8h, v28.8h, v0.h[3]\n"
+ "mov v29.d[1], x21\n"
"fmla v13.8h, v28.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
"fmla v17.8h, v28.8h, v2.h[3]\n"
- "mov v29.d[1], x21\n"
"fmla v21.8h, v28.8h, v3.h[3]\n"
- "ldr x21, [x17, #0x108]\n"
"fmla v25.8h, v28.8h, v4.h[3]\n"
"ldr d28, [x17, #0xf0]\n"
+ "mov v28.d[1], x20\n"
"fmla v10.8h, v29.8h, v0.h[3]\n"
"fmla v14.8h, v29.8h, v1.h[3]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x118]\n"
"fmla v18.8h, v29.8h, v2.h[3]\n"
"fmla v22.8h, v29.8h, v3.h[3]\n"
- "ldr x20, [x17, #0x118]\n"
"fmla v26.8h, v29.8h, v4.h[3]\n"
"ldr d29, [x17, #0x100]\n"
"fmla v11.8h, v28.8h, v0.h[3]\n"
+ "mov v29.d[1], x21\n"
"fmla v15.8h, v28.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
"fmla v19.8h, v28.8h, v2.h[3]\n"
- "mov v29.d[1], x21\n"
"fmla v23.8h, v28.8h, v3.h[3]\n"
- "ldr x21, [x17, #0x128]\n"
"fmla v27.8h, v28.8h, v4.h[3]\n"
"ldr d28, [x17, #0x110]\n"
+ "mov v28.d[1], x20\n"
"fmla v8.8h, v29.8h, v0.h[4]\n"
"fmla v12.8h, v29.8h, v1.h[4]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x138]\n"
"fmla v16.8h, v29.8h, v2.h[4]\n"
"fmla v20.8h, v29.8h, v3.h[4]\n"
- "ldr x20, [x17, #0x138]\n"
"fmla v24.8h, v29.8h, v4.h[4]\n"
"ldr d29, [x17, #0x120]\n"
"fmla v9.8h, v28.8h, v0.h[4]\n"
+ "mov v29.d[1], x21\n"
"fmla v13.8h, v28.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
"fmla v17.8h, v28.8h, v2.h[4]\n"
- "mov v29.d[1], x21\n"
"fmla v21.8h, v28.8h, v3.h[4]\n"
- "ldr x21, [x17, #0x148]\n"
"fmla v25.8h, v28.8h, v4.h[4]\n"
"ldr d28, [x17, #0x130]\n"
+ "mov v28.d[1], x20\n"
"fmla v10.8h, v29.8h, v0.h[4]\n"
"fmla v14.8h, v29.8h, v1.h[4]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x158]\n"
"fmla v18.8h, v29.8h, v2.h[4]\n"
"fmla v22.8h, v29.8h, v3.h[4]\n"
- "ldr x20, [x17, #0x158]\n"
"fmla v26.8h, v29.8h, v4.h[4]\n"
"ldr d29, [x17, #0x140]\n"
"fmla v11.8h, v28.8h, v0.h[4]\n"
+ "mov v29.d[1], x21\n"
"fmla v15.8h, v28.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
"fmla v19.8h, v28.8h, v2.h[4]\n"
- "mov v29.d[1], x21\n"
"fmla v23.8h, v28.8h, v3.h[4]\n"
- "ldr x21, [x17, #0x168]\n"
"fmla v27.8h, v28.8h, v4.h[4]\n"
"ldr d28, [x17, #0x150]\n"
+ "mov v28.d[1], x20\n"
"fmla v8.8h, v29.8h, v0.h[5]\n"
"fmla v12.8h, v29.8h, v1.h[5]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x178]\n"
"fmla v16.8h, v29.8h, v2.h[5]\n"
"fmla v20.8h, v29.8h, v3.h[5]\n"
- "ldr x20, [x17, #0x178]\n"
"fmla v24.8h, v29.8h, v4.h[5]\n"
"ldr d29, [x17, #0x160]\n"
"fmla v9.8h, v28.8h, v0.h[5]\n"
+ "mov v29.d[1], x21\n"
"fmla v13.8h, v28.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
"fmla v17.8h, v28.8h, v2.h[5]\n"
- "mov v29.d[1], x21\n"
"fmla v21.8h, v28.8h, v3.h[5]\n"
- "ldr x21, [x17, #0x188]\n"
"fmla v25.8h, v28.8h, v4.h[5]\n"
"ldr d28, [x17, #0x170]\n"
+ "mov v28.d[1], x20\n"
"fmla v10.8h, v29.8h, v0.h[5]\n"
"fmla v14.8h, v29.8h, v1.h[5]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x198]\n"
"fmla v18.8h, v29.8h, v2.h[5]\n"
"fmla v22.8h, v29.8h, v3.h[5]\n"
- "ldr x20, [x17, #0x198]\n"
"fmla v26.8h, v29.8h, v4.h[5]\n"
"ldr d29, [x17, #0x180]\n"
"fmla v11.8h, v28.8h, v0.h[5]\n"
+ "mov v29.d[1], x21\n"
"fmla v15.8h, v28.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
"fmla v19.8h, v28.8h, v2.h[5]\n"
- "mov v29.d[1], x21\n"
"fmla v23.8h, v28.8h, v3.h[5]\n"
- "ldr x21, [x17, #0x1a8]\n"
"fmla v27.8h, v28.8h, v4.h[5]\n"
"ldr d28, [x17, #0x190]\n"
+ "mov v28.d[1], x20\n"
"fmla v8.8h, v29.8h, v0.h[6]\n"
"fmla v12.8h, v29.8h, v1.h[6]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x1b8]\n"
"fmla v16.8h, v29.8h, v2.h[6]\n"
"fmla v20.8h, v29.8h, v3.h[6]\n"
- "ldr x20, [x17, #0x1b8]\n"
"fmla v24.8h, v29.8h, v4.h[6]\n"
"ldr d29, [x17, #0x1a0]\n"
"fmla v9.8h, v28.8h, v0.h[6]\n"
+ "mov v29.d[1], x21\n"
"fmla v13.8h, v28.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
"fmla v17.8h, v28.8h, v2.h[6]\n"
- "mov v29.d[1], x21\n"
"fmla v21.8h, v28.8h, v3.h[6]\n"
- "ldr x21, [x17, #0x1c8]\n"
"fmla v25.8h, v28.8h, v4.h[6]\n"
"ldr d28, [x17, #0x1b0]\n"
+ "mov v28.d[1], x20\n"
"fmla v10.8h, v29.8h, v0.h[6]\n"
"fmla v14.8h, v29.8h, v1.h[6]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x1d8]\n"
"fmla v18.8h, v29.8h, v2.h[6]\n"
"fmla v22.8h, v29.8h, v3.h[6]\n"
- "ldr x20, [x17, #0x1d8]\n"
"fmla v26.8h, v29.8h, v4.h[6]\n"
"ldr d29, [x17, #0x1c0]\n"
"fmla v11.8h, v28.8h, v0.h[6]\n"
+ "mov v29.d[1], x21\n"
"fmla v15.8h, v28.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
"fmla v19.8h, v28.8h, v2.h[6]\n"
- "mov v29.d[1], x21\n"
"fmla v23.8h, v28.8h, v3.h[6]\n"
- "ldr x21, [x17, #0x1e8]\n"
"fmla v27.8h, v28.8h, v4.h[6]\n"
"ldr d28, [x17, #0x1d0]\n"
+ "mov v28.d[1], x20\n"
"fmla v8.8h, v29.8h, v0.h[7]\n"
"fmla v12.8h, v29.8h, v1.h[7]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x1f8]\n"
"fmla v16.8h, v29.8h, v2.h[7]\n"
"fmla v20.8h, v29.8h, v3.h[7]\n"
- "ldr x20, [x17, #0x1f8]\n"
"fmla v24.8h, v29.8h, v4.h[7]\n"
"ldr d29, [x17, #0x1e0]\n"
"fmla v9.8h, v28.8h, v0.h[7]\n"
+ "mov v29.d[1], x21\n"
"fmla v13.8h, v28.8h, v1.h[7]\n"
"fmla v17.8h, v28.8h, v2.h[7]\n"
- "mov v29.d[1], x21\n"
"fmla v21.8h, v28.8h, v3.h[7]\n"
"fmla v25.8h, v28.8h, v4.h[7]\n"
"ldr d28, [x17, #0x1f0]\n"
+ "mov v28.d[1], x20\n"
"add x17, x17, #0x200\n"
"fmla v10.8h, v29.8h, v0.h[7]\n"
- "fmla v14.8h, v29.8h, v1.h[7]\n"
"ldr x21, [x17, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x18]\n"
"fmla v18.8h, v29.8h, v2.h[7]\n"
"fmla v22.8h, v29.8h, v3.h[7]\n"
- "ldr x20, [x17, #0x18]\n"
"fmla v26.8h, v29.8h, v4.h[7]\n"
"ldr d6, [x17, #0x0]\n"
"fmla v11.8h, v28.8h, v0.h[7]\n"
@@ -4029,8 +4027,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr h1, [x10], #0x2\n"
"ldr h0, [x9], #0x2\n"
"ldr q29, [x17, #0x0]\n"
- "ldr q28, [x17, #0x10]\n"
"fmla v8.8h, v29.8h, v4.h[0]\n"
+ "ldr q28, [x17, #0x10]\n"
"fmla v12.8h, v29.8h, v3.h[0]\n"
"fmla v16.8h, v29.8h, v2.h[0]\n"
"fmla v20.8h, v29.8h, v1.h[0]\n"
@@ -4060,40 +4058,40 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"cmp x15, x20\n"
"bne 219b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x26, x16, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x16, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 227f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v28.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v28.8h\n"
+ "fmin v9.8h, v9.8h, v28.8h\n"
+ "fmin v10.8h, v10.8h, v28.8h\n"
+ "fmin v11.8h, v11.8h, v28.8h\n"
+ "fmin v12.8h, v12.8h, v28.8h\n"
+ "fmin v13.8h, v13.8h, v28.8h\n"
+ "fmin v14.8h, v14.8h, v28.8h\n"
+ "fmin v15.8h, v15.8h, v28.8h\n"
+ "fmin v16.8h, v16.8h, v28.8h\n"
+ "fmin v17.8h, v17.8h, v28.8h\n"
+ "fmin v18.8h, v18.8h, v28.8h\n"
+ "fmin v19.8h, v19.8h, v28.8h\n"
+ "fmin v20.8h, v20.8h, v28.8h\n"
+ "fmin v21.8h, v21.8h, v28.8h\n"
+ "fmin v22.8h, v22.8h, v28.8h\n"
+ "fmin v23.8h, v23.8h, v28.8h\n"
+ "fmin v24.8h, v24.8h, v28.8h\n"
+ "fmin v25.8h, v25.8h, v28.8h\n"
+ "fmin v26.8h, v26.8h, v28.8h\n"
+ "fmin v27.8h, v27.8h, v28.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v29.8h }, [x21]\n"
"ld1r { v28.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v29.8h\n"
- "fmin v9.8h, v9.8h, v29.8h\n"
- "fmin v10.8h, v10.8h, v29.8h\n"
- "fmin v11.8h, v11.8h, v29.8h\n"
- "fmin v12.8h, v12.8h, v29.8h\n"
- "fmin v13.8h, v13.8h, v29.8h\n"
- "fmin v14.8h, v14.8h, v29.8h\n"
- "fmin v15.8h, v15.8h, v29.8h\n"
- "fmin v16.8h, v16.8h, v29.8h\n"
- "fmin v17.8h, v17.8h, v29.8h\n"
- "fmin v18.8h, v18.8h, v29.8h\n"
- "fmin v19.8h, v19.8h, v29.8h\n"
- "fmin v20.8h, v20.8h, v29.8h\n"
- "fmin v21.8h, v21.8h, v29.8h\n"
- "fmin v22.8h, v22.8h, v29.8h\n"
- "fmin v23.8h, v23.8h, v29.8h\n"
- "fmin v24.8h, v24.8h, v29.8h\n"
- "fmin v25.8h, v25.8h, v29.8h\n"
- "fmin v26.8h, v26.8h, v29.8h\n"
- "fmin v27.8h, v27.8h, v29.8h\n"
"fmax v8.8h, v8.8h, v28.8h\n"
"fmax v9.8h, v9.8h, v28.8h\n"
"fmax v10.8h, v10.8h, v28.8h\n"
@@ -4120,223 +4118,223 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"tbz x8, #4, 235f\n"
"st1 { v8.8h }, [x16], #0x10\n"
"st1 { v9.8h }, [x16], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v21.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v25.8h }, [x23], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
"tbz x8, #3, 231f\n"
"st1 { v10.8h }, [x16], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
- "st1 { v22.8h }, [x24], #0x10\n"
- "st1 { v26.8h }, [x23], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
"tbz x8, #2, 229f\n"
"str d11, [x16], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x8, #1, 228f\n"
"st1 { v11.s }[2], [x16], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
- "st1 { v23.s }[2], [x24], #0x4\n"
- "st1 { v27.s }[2], [x23], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
"tbz x8, #0, 243f\n"
"st1 { v11.h }[6], [x16]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
- "st1 { v23.h }[6], [x24]\n"
- "st1 { v27.h }[6], [x23]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
"b 243f\n"
"228:" // Height 5: Partial direct writeback: partial_1_28
"tbz x8, #0, 243f\n"
"st1 { v11.h }[4], [x16]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
- "st1 { v23.h }[4], [x24]\n"
- "st1 { v27.h }[4], [x23]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
"b 243f\n"
"229:" // Height 5: Partial direct writeback: partial_2_24
"tbz x8, #1, 230f\n"
"str s11, [x16], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
- "str s23, [x24], #0x4\n"
- "str s27, [x23], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
"tbz x8, #0, 243f\n"
"st1 { v11.h }[2], [x16]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
- "st1 { v23.h }[2], [x24]\n"
- "st1 { v27.h }[2], [x23]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
"b 243f\n"
"230:" // Height 5: Partial direct writeback: partial_1_24
"tbz x8, #0, 243f\n"
"str h11, [x16, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
- "str h23, [x24, #0x0]\n"
- "str h27, [x23, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
"b 243f\n"
"231:" // Height 5: Partial direct writeback: partial_4_16
"tbz x8, #2, 233f\n"
"str d10, [x16], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x8, #1, 232f\n"
"st1 { v10.s }[2], [x16], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
- "st1 { v22.s }[2], [x24], #0x4\n"
- "st1 { v26.s }[2], [x23], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
"tbz x8, #0, 243f\n"
"st1 { v10.h }[6], [x16]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
- "st1 { v22.h }[6], [x24]\n"
- "st1 { v26.h }[6], [x23]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
"b 243f\n"
"232:" // Height 5: Partial direct writeback: partial_1_20
"tbz x8, #0, 243f\n"
"st1 { v10.h }[4], [x16]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
- "st1 { v22.h }[4], [x24]\n"
- "st1 { v26.h }[4], [x23]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
"b 243f\n"
"233:" // Height 5: Partial direct writeback: partial_2_16
"tbz x8, #1, 234f\n"
"str s10, [x16], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
- "str s22, [x24], #0x4\n"
- "str s26, [x23], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
"tbz x8, #0, 243f\n"
"st1 { v10.h }[2], [x16]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
- "st1 { v22.h }[2], [x24]\n"
- "st1 { v26.h }[2], [x23]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
"b 243f\n"
"234:" // Height 5: Partial direct writeback: partial_1_16
"tbz x8, #0, 243f\n"
"str h10, [x16, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
- "str h22, [x24, #0x0]\n"
- "str h26, [x23, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
"b 243f\n"
"235:" // Height 5: Partial direct writeback: partial_8_0
"tbz x8, #3, 239f\n"
"st1 { v8.8h }, [x16], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
"tbz x8, #2, 237f\n"
"str d9, [x16], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x8, #1, 236f\n"
"st1 { v9.s }[2], [x16], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
- "st1 { v25.s }[2], [x23], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
"tbz x8, #0, 243f\n"
"st1 { v9.h }[6], [x16]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
- "st1 { v21.h }[6], [x24]\n"
- "st1 { v25.h }[6], [x23]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
"b 243f\n"
"236:" // Height 5: Partial direct writeback: partial_1_12
"tbz x8, #0, 243f\n"
"st1 { v9.h }[4], [x16]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
- "st1 { v21.h }[4], [x24]\n"
- "st1 { v25.h }[4], [x23]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
"b 243f\n"
"237:" // Height 5: Partial direct writeback: partial_2_8
"tbz x8, #1, 238f\n"
"str s9, [x16], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
- "str s21, [x24], #0x4\n"
- "str s25, [x23], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
"tbz x8, #0, 243f\n"
"st1 { v9.h }[2], [x16]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
- "st1 { v21.h }[2], [x24]\n"
- "st1 { v25.h }[2], [x23]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
"b 243f\n"
"238:" // Height 5: Partial direct writeback: partial_1_8
"tbz x8, #0, 243f\n"
"str h9, [x16, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
- "str h21, [x24, #0x0]\n"
- "str h25, [x23, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
"b 243f\n"
"239:" // Height 5: Partial direct writeback: partial_4_0
"tbz x8, #2, 241f\n"
"str d8, [x16], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x8, #1, 240f\n"
"st1 { v8.s }[2], [x16], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x8, #0, 243f\n"
"st1 { v8.h }[6], [x16]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
- "st1 { v20.h }[6], [x24]\n"
- "st1 { v24.h }[6], [x23]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
"b 243f\n"
"240:" // Height 5: Partial direct writeback: partial_1_4
"tbz x8, #0, 243f\n"
"st1 { v8.h }[4], [x16]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
- "st1 { v20.h }[4], [x24]\n"
- "st1 { v24.h }[4], [x23]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
"b 243f\n"
"241:" // Height 5: Partial direct writeback: partial_2_0
"tbz x8, #1, 242f\n"
"str s8, [x16], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x8, #0, 243f\n"
"st1 { v8.h }[2], [x16]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
- "st1 { v20.h }[2], [x24]\n"
- "st1 { v24.h }[2], [x23]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
"b 243f\n"
"242:" // Height 5: Partial direct writeback: partial_1_0
"str h8, [x16, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
- "str h20, [x24, #0x0]\n"
- "str h24, [x23, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
"243:" // Height 5: Partial direct writeback: Done
"b 245f\n"
"244:" // Height 5: Full writeback
@@ -4345,22 +4343,22 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"str q10, [x16, #0x20]\n"
"str q11, [x16, #0x30]\n"
"add x16, x16, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"245:" // Height 5: Writeback done
"subs x8, x8, #0x20\n"
"bgt 198b\n"
@@ -4368,24 +4366,23 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"246:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0xc\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "madd x20, x21, x20, x16\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"247:" // Height 6: Column loop
"cbz x7, 248f\n"
"ldr q8, [x7, #0x0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
"mov v12.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x7, x7, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
"mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
@@ -4405,281 +4402,281 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"248:" // Height 6: no bias
"tbz %x[flags], #0, 266f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x20\n"
- "add x26, x16, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x16, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "cmp x8, #0x20\n"
+ "add x21, x22, x20, LSL #1\n"
"bge 265f\n"
"tbz x8, #4, 256f\n"
"ld1 { v8.8h }, [x16], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
- "ld1 { v28.8h }, [x22], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
"ld1 { v9.8h }, [x16], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
- "ld1 { v21.8h }, [x24], #0x10\n"
- "ld1 { v25.8h }, [x23], #0x10\n"
- "ld1 { v29.8h }, [x22], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
+ "ld1 { v29.8h }, [x21], #0x10\n"
"tbz x8, #3, 252f\n"
"ld1 { v10.8h }, [x16], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
- "ld1 { v22.8h }, [x24], #0x10\n"
- "ld1 { v26.8h }, [x23], #0x10\n"
- "ld1 { v30.8h }, [x22], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
+ "ld1 { v30.8h }, [x21], #0x10\n"
"tbz x8, #2, 250f\n"
"ldr d11, [x16], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x8, #1, 249f\n"
"ld1 { v11.s }[2], [x16], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz x8, #0, 264f\n"
"ld1 { v11.h }[6], [x16]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
- "ld1 { v27.h }[6], [x23]\n"
- "ld1 { v31.h }[6], [x22]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
+ "ld1 { v31.h }[6], [x21]\n"
"b 264f\n"
"249:" // Height 6: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x8, #0, 264f\n"
"ld1 { v11.h }[4], [x16]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
- "ld1 { v27.h }[4], [x23]\n"
- "ld1 { v31.h }[4], [x22]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
+ "ld1 { v31.h }[4], [x21]\n"
"b 264f\n"
"250:" // Height 6: Partial accumulate: partial_2_24
"tbz x8, #1, 251f\n"
"ldr s11, [x16], #0x4\n"
"mov x20, #0x34\n"
- "ldr s15, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz x8, #0, 264f\n"
"ld1 { v11.h }[2], [x16]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
- "ld1 { v27.h }[2], [x23]\n"
- "ld1 { v31.h }[2], [x22]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
+ "ld1 { v31.h }[2], [x21]\n"
"b 264f\n"
"251:" // Height 6: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x8, #0, 264f\n"
"ldr h11, [x16, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
- "ldr h31, [x22, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
+ "ldr h31, [x21, #0x0]\n"
"b 264f\n"
"252:" // Height 6: Partial accumulate: partial_4_16
"tbz x8, #2, 254f\n"
"ldr d10, [x16], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x8, #1, 253f\n"
"ld1 { v10.s }[2], [x16], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
- "ld1 { v30.s }[2], [x22], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
+ "ld1 { v30.s }[2], [x21], #0x4\n"
"tbz x8, #0, 264f\n"
"ld1 { v10.h }[6], [x16]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
- "ld1 { v22.h }[6], [x24]\n"
- "ld1 { v26.h }[6], [x23]\n"
- "ld1 { v30.h }[6], [x22]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
+ "ld1 { v30.h }[6], [x21]\n"
"b 264f\n"
"253:" // Height 6: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x8, #0, 264f\n"
"ld1 { v10.h }[4], [x16]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
- "ld1 { v22.h }[4], [x24]\n"
- "ld1 { v26.h }[4], [x23]\n"
- "ld1 { v30.h }[4], [x22]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
+ "ld1 { v30.h }[4], [x21]\n"
"b 264f\n"
"254:" // Height 6: Partial accumulate: partial_2_16
"tbz x8, #1, 255f\n"
"ldr s10, [x16], #0x4\n"
"mov x20, #0x24\n"
- "ldr s14, [x26], #0x4\n"
- "ldr s18, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s30, [x22], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
+ "ldr s30, [x21], #0x4\n"
"tbz x8, #0, 264f\n"
"ld1 { v10.h }[2], [x16]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
- "ld1 { v22.h }[2], [x24]\n"
- "ld1 { v26.h }[2], [x23]\n"
- "ld1 { v30.h }[2], [x22]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
+ "ld1 { v30.h }[2], [x21]\n"
"b 264f\n"
"255:" // Height 6: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x8, #0, 264f\n"
"ldr h10, [x16, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
- "ldr h22, [x24, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
- "ldr h30, [x22, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
+ "ldr h30, [x21, #0x0]\n"
"b 264f\n"
"256:" // Height 6: Partial accumulate: partial_8_0
"tbz x8, #3, 260f\n"
"ld1 { v8.8h }, [x16], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
- "ld1 { v28.8h }, [x22], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
"tbz x8, #2, 258f\n"
"ldr d9, [x16], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x8, #1, 257f\n"
"ld1 { v9.s }[2], [x16], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v25.s }[2], [x23], #0x4\n"
- "ld1 { v29.s }[2], [x22], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v29.s }[2], [x21], #0x4\n"
"tbz x8, #0, 264f\n"
"ld1 { v9.h }[6], [x16]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
- "ld1 { v25.h }[6], [x23]\n"
- "ld1 { v29.h }[6], [x22]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "ld1 { v29.h }[6], [x21]\n"
"b 264f\n"
"257:" // Height 6: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x8, #0, 264f\n"
"ld1 { v9.h }[4], [x16]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
- "ld1 { v25.h }[4], [x23]\n"
- "ld1 { v29.h }[4], [x22]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "ld1 { v29.h }[4], [x21]\n"
"b 264f\n"
"258:" // Height 6: Partial accumulate: partial_2_8
"tbz x8, #1, 259f\n"
"ldr s9, [x16], #0x4\n"
"mov x20, #0x14\n"
- "ldr s13, [x26], #0x4\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
- "ldr s29, [x22], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s29, [x21], #0x4\n"
"tbz x8, #0, 264f\n"
"ld1 { v9.h }[2], [x16]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
- "ld1 { v25.h }[2], [x23]\n"
- "ld1 { v29.h }[2], [x22]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "ld1 { v29.h }[2], [x21]\n"
"b 264f\n"
"259:" // Height 6: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x8, #0, 264f\n"
"ldr h9, [x16, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
- "ldr h29, [x22, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "ldr h29, [x21, #0x0]\n"
"b 264f\n"
"260:" // Height 6: Partial accumulate: partial_4_0
"tbz x8, #2, 262f\n"
"ldr d8, [x16], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x8, #1, 261f\n"
"ld1 { v8.s }[2], [x16], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
"tbz x8, #0, 264f\n"
"ld1 { v8.h }[6], [x16]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
- "ld1 { v20.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
- "ld1 { v28.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v28.h }[6], [x21]\n"
"b 264f\n"
"261:" // Height 6: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x8, #0, 264f\n"
"ld1 { v8.h }[4], [x16]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
- "ld1 { v20.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
- "ld1 { v28.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v28.h }[4], [x21]\n"
"b 264f\n"
"262:" // Height 6: Partial accumulate: partial_2_0
"tbz x8, #1, 263f\n"
"ldr s8, [x16], #0x4\n"
"mov x20, #0x4\n"
- "ldr s12, [x26], #0x4\n"
- "ldr s16, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
"tbz x8, #0, 264f\n"
"ld1 { v8.h }[2], [x16]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
- "ld1 { v20.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "ld1 { v28.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v28.h }[2], [x21]\n"
"b 264f\n"
"263:" // Height 6: Partial accumulate: partial_1_0
"ldr h8, [x16, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h12, [x26, #0x0]\n"
- "ldr h16, [x25, #0x0]\n"
- "ldr h20, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "ldr h28, [x22, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h28, [x21, #0x0]\n"
"264:" // Height 6: Partial accumulate: Done
"sub x16, x16, x20\n"
"b 267f\n"
@@ -4688,26 +4685,26 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"ldr q11, [x16, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x22, #0x0]\n"
- "ldr q29, [x22, #0x10]\n"
- "ldr q30, [x22, #0x20]\n"
- "ldr q31, [x22, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"b 267f\n"
"266:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
@@ -4738,8 +4735,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"mov x15, #0x0\n"
"268:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 269f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -4792,290 +4789,290 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v28.8h, v6.8h, v5.h[0]\n"
"ldr d6, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x10, x10, #0x10\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x9, x9, #0x10\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "mov v6.d[1], x21\n"
+ "add x10, x10, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr x21, [x17, #0x48]\n"
+ "add x9, x9, #0x10\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
"add x28, x28, #0x10\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
"ldr d7, [x17, #0x30]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr x27, [x13, #0x8]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr x26, [x12, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
+ "ldr x27, [x13, #0x8]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr x20, [x17, #0x58]\n"
+ "ldr x26, [x12, #0x8]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
"ldr x25, [x11, #0x8]\n"
"fmla v30.8h, v6.8h, v5.h[0]\n"
"ldr d6, [x17, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr x24, [x10, #0x8]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr x23, [x9, #0x8]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
- "mov v6.d[1], x21\n"
+ "ldr x24, [x10, #0x8]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr x21, [x17, #0x68]\n"
+ "ldr x23, [x9, #0x8]\n"
"fmla v27.8h, v7.8h, v4.h[0]\n"
"ldr x22, [x28, #0x8]\n"
"fmla v31.8h, v7.8h, v5.h[0]\n"
"ldr d7, [x17, #0x50]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
- "sub x14, x14, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
- "cmp x14, #0x10\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
+ "sub x14, x14, #0x8\n"
"fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr x20, [x17, #0x78]\n"
+ "cmp x14, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[1]\n"
"prfm pldl1keep, [x13, #0x80]\n"
"fmla v28.8h, v6.8h, v5.h[1]\n"
"ldr d6, [x17, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
- "mov v6.d[1], x21\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr x21, [x17, #0x88]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v25.8h, v7.8h, v4.h[1]\n"
"prfm pldl1keep, [x10, #0x80]\n"
"fmla v29.8h, v7.8h, v5.h[1]\n"
"ldr d7, [x17, #0x70]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr x20, [x17, #0x98]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.8h, v6.8h, v4.h[1]\n"
"fmla v30.8h, v6.8h, v5.h[1]\n"
"ldr d6, [x17, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
- "mov v6.d[1], x21\n"
"fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr x21, [x17, #0xa8]\n"
"fmla v27.8h, v7.8h, v4.h[1]\n"
"fmla v31.8h, v7.8h, v5.h[1]\n"
"ldr d7, [x17, #0x90]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
"fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr x20, [x17, #0xb8]\n"
"fmla v24.8h, v6.8h, v4.h[2]\n"
"fmla v28.8h, v6.8h, v5.h[2]\n"
"ldr d6, [x17, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
- "mov v6.d[1], x21\n"
"fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr x21, [x17, #0xc8]\n"
"fmla v25.8h, v7.8h, v4.h[2]\n"
"fmla v29.8h, v7.8h, v5.h[2]\n"
"ldr d7, [x17, #0xb0]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
"fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr x20, [x17, #0xd8]\n"
"fmla v26.8h, v6.8h, v4.h[2]\n"
"fmla v30.8h, v6.8h, v5.h[2]\n"
"ldr d6, [x17, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
- "mov v6.d[1], x21\n"
"fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr x21, [x17, #0xe8]\n"
"fmla v27.8h, v7.8h, v4.h[2]\n"
"fmla v31.8h, v7.8h, v5.h[2]\n"
"ldr d7, [x17, #0xd0]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
"fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr x20, [x17, #0xf8]\n"
"fmla v24.8h, v6.8h, v4.h[3]\n"
"fmla v28.8h, v6.8h, v5.h[3]\n"
"ldr d6, [x17, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
- "mov v6.d[1], x21\n"
"fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr x21, [x17, #0x108]\n"
"fmla v25.8h, v7.8h, v4.h[3]\n"
"fmla v29.8h, v7.8h, v5.h[3]\n"
"ldr d7, [x17, #0xf0]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x118]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
"fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr x20, [x17, #0x118]\n"
"fmla v26.8h, v6.8h, v4.h[3]\n"
"fmla v30.8h, v6.8h, v5.h[3]\n"
"ldr d6, [x17, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
- "mov v6.d[1], x21\n"
"fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr x21, [x17, #0x128]\n"
"fmla v27.8h, v7.8h, v4.h[3]\n"
"fmla v31.8h, v7.8h, v5.h[3]\n"
"ldr d7, [x17, #0x110]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x138]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
"fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr x20, [x17, #0x138]\n"
"fmla v24.8h, v6.8h, v4.h[4]\n"
"fmla v28.8h, v6.8h, v5.h[4]\n"
"ldr d6, [x17, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
- "mov v6.d[1], x21\n"
"fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr x21, [x17, #0x148]\n"
"fmla v25.8h, v7.8h, v4.h[4]\n"
"fmla v29.8h, v7.8h, v5.h[4]\n"
"ldr d7, [x17, #0x130]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x158]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
"fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr x20, [x17, #0x158]\n"
"fmla v26.8h, v6.8h, v4.h[4]\n"
"fmla v30.8h, v6.8h, v5.h[4]\n"
"ldr d6, [x17, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
- "mov v6.d[1], x21\n"
"fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr x21, [x17, #0x168]\n"
"fmla v27.8h, v7.8h, v4.h[4]\n"
"fmla v31.8h, v7.8h, v5.h[4]\n"
"ldr d7, [x17, #0x150]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x178]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
"fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr x20, [x17, #0x178]\n"
"fmla v24.8h, v6.8h, v4.h[5]\n"
"fmla v28.8h, v6.8h, v5.h[5]\n"
"ldr d6, [x17, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
- "mov v6.d[1], x21\n"
"fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr x21, [x17, #0x188]\n"
"fmla v25.8h, v7.8h, v4.h[5]\n"
"fmla v29.8h, v7.8h, v5.h[5]\n"
"ldr d7, [x17, #0x170]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x198]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
"fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr x20, [x17, #0x198]\n"
"fmla v26.8h, v6.8h, v4.h[5]\n"
"fmla v30.8h, v6.8h, v5.h[5]\n"
"ldr d6, [x17, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
- "mov v6.d[1], x21\n"
"fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr x21, [x17, #0x1a8]\n"
"fmla v27.8h, v7.8h, v4.h[5]\n"
"fmla v31.8h, v7.8h, v5.h[5]\n"
"ldr d7, [x17, #0x190]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x1b8]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
"fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr x20, [x17, #0x1b8]\n"
"fmla v24.8h, v6.8h, v4.h[6]\n"
"fmla v28.8h, v6.8h, v5.h[6]\n"
"ldr d6, [x17, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
- "mov v6.d[1], x21\n"
"fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr x21, [x17, #0x1c8]\n"
"fmla v25.8h, v7.8h, v4.h[6]\n"
"fmla v29.8h, v7.8h, v5.h[6]\n"
"ldr d7, [x17, #0x1b0]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x1d8]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
"fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr x20, [x17, #0x1d8]\n"
"fmla v26.8h, v6.8h, v4.h[6]\n"
"fmla v30.8h, v6.8h, v5.h[6]\n"
"ldr d6, [x17, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
- "mov v6.d[1], x21\n"
"fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr x21, [x17, #0x1e8]\n"
"fmla v27.8h, v7.8h, v4.h[6]\n"
"fmla v31.8h, v7.8h, v5.h[6]\n"
"ldr d7, [x17, #0x1d0]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x1f8]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
"fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr x20, [x17, #0x1f8]\n"
"fmla v24.8h, v6.8h, v4.h[7]\n"
"fmla v28.8h, v6.8h, v5.h[7]\n"
"ldr d6, [x17, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
- "mov v6.d[1], x21\n"
"fmla v21.8h, v7.8h, v3.h[7]\n"
"fmla v25.8h, v7.8h, v4.h[7]\n"
"fmla v29.8h, v7.8h, v5.h[7]\n"
"ldr d7, [x17, #0x1f0]\n"
+ "mov v7.d[1], x20\n"
"add x17, x17, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
"ldr x21, [x17, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x18]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
"fmla v22.8h, v6.8h, v3.h[7]\n"
- "ldr x20, [x17, #0x18]\n"
"fmla v26.8h, v6.8h, v4.h[7]\n"
"fmla v30.8h, v6.8h, v5.h[7]\n"
"ldr d6, [x17, #0x0]\n"
@@ -5349,8 +5346,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr h3, [x9], #0x2\n"
"ldr h2, [x28], #0x2\n"
"ldr q1, [x17, #0x0]\n"
- "ldr q0, [x17, #0x10]\n"
"fmla v8.8h, v1.8h, v7.h[0]\n"
+ "ldr q0, [x17, #0x10]\n"
"fmla v12.8h, v1.8h, v6.h[0]\n"
"fmla v16.8h, v1.8h, v5.h[0]\n"
"fmla v20.8h, v1.8h, v4.h[0]\n"
@@ -5384,46 +5381,46 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"cmp x15, x20\n"
"bne 268b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x26, x16, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x16, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"add x23, x24, x20, LSL #1\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 276f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmin v20.8h, v20.8h, v0.8h\n"
+ "fmin v21.8h, v21.8h, v0.8h\n"
+ "fmin v22.8h, v22.8h, v0.8h\n"
+ "fmin v23.8h, v23.8h, v0.8h\n"
+ "fmin v24.8h, v24.8h, v0.8h\n"
+ "fmin v25.8h, v25.8h, v0.8h\n"
+ "fmin v26.8h, v26.8h, v0.8h\n"
+ "fmin v27.8h, v27.8h, v0.8h\n"
+ "fmin v28.8h, v28.8h, v0.8h\n"
+ "fmin v29.8h, v29.8h, v0.8h\n"
+ "fmin v30.8h, v30.8h, v0.8h\n"
+ "fmin v31.8h, v31.8h, v0.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.8h }, [x21]\n"
"ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v1.8h\n"
- "fmin v13.8h, v13.8h, v1.8h\n"
- "fmin v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v1.8h\n"
- "fmin v16.8h, v16.8h, v1.8h\n"
- "fmin v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v1.8h\n"
- "fmin v19.8h, v19.8h, v1.8h\n"
- "fmin v20.8h, v20.8h, v1.8h\n"
- "fmin v21.8h, v21.8h, v1.8h\n"
- "fmin v22.8h, v22.8h, v1.8h\n"
- "fmin v23.8h, v23.8h, v1.8h\n"
- "fmin v24.8h, v24.8h, v1.8h\n"
- "fmin v25.8h, v25.8h, v1.8h\n"
- "fmin v26.8h, v26.8h, v1.8h\n"
- "fmin v27.8h, v27.8h, v1.8h\n"
- "fmin v28.8h, v28.8h, v1.8h\n"
- "fmin v29.8h, v29.8h, v1.8h\n"
- "fmin v30.8h, v30.8h, v1.8h\n"
- "fmin v31.8h, v31.8h, v1.8h\n"
"fmax v8.8h, v8.8h, v0.8h\n"
"fmax v9.8h, v9.8h, v0.8h\n"
"fmax v10.8h, v10.8h, v0.8h\n"
@@ -5454,255 +5451,255 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"tbz x8, #4, 284f\n"
"st1 { v8.8h }, [x16], #0x10\n"
"st1 { v9.8h }, [x16], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v21.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v25.8h }, [x23], #0x10\n"
- "st1 { v28.8h }, [x22], #0x10\n"
- "st1 { v29.8h }, [x22], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "st1 { v29.8h }, [x21], #0x10\n"
"tbz x8, #3, 280f\n"
"st1 { v10.8h }, [x16], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
- "st1 { v22.8h }, [x24], #0x10\n"
- "st1 { v26.8h }, [x23], #0x10\n"
- "st1 { v30.8h }, [x22], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
+ "st1 { v30.8h }, [x21], #0x10\n"
"tbz x8, #2, 278f\n"
"str d11, [x16], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x8, #1, 277f\n"
"st1 { v11.s }[2], [x16], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
- "st1 { v23.s }[2], [x24], #0x4\n"
- "st1 { v27.s }[2], [x23], #0x4\n"
- "st1 { v31.s }[2], [x22], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
+ "st1 { v31.s }[2], [x21], #0x4\n"
"tbz x8, #0, 292f\n"
"st1 { v11.h }[6], [x16]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
- "st1 { v23.h }[6], [x24]\n"
- "st1 { v27.h }[6], [x23]\n"
- "st1 { v31.h }[6], [x22]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
+ "st1 { v31.h }[6], [x21]\n"
"b 292f\n"
"277:" // Height 6: Partial direct writeback: partial_1_28
"tbz x8, #0, 292f\n"
"st1 { v11.h }[4], [x16]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
- "st1 { v23.h }[4], [x24]\n"
- "st1 { v27.h }[4], [x23]\n"
- "st1 { v31.h }[4], [x22]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
+ "st1 { v31.h }[4], [x21]\n"
"b 292f\n"
"278:" // Height 6: Partial direct writeback: partial_2_24
"tbz x8, #1, 279f\n"
"str s11, [x16], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
- "str s23, [x24], #0x4\n"
- "str s27, [x23], #0x4\n"
- "str s31, [x22], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
+ "str s31, [x21], #0x4\n"
"tbz x8, #0, 292f\n"
"st1 { v11.h }[2], [x16]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
- "st1 { v23.h }[2], [x24]\n"
- "st1 { v27.h }[2], [x23]\n"
- "st1 { v31.h }[2], [x22]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "st1 { v31.h }[2], [x21]\n"
"b 292f\n"
"279:" // Height 6: Partial direct writeback: partial_1_24
"tbz x8, #0, 292f\n"
"str h11, [x16, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
- "str h23, [x24, #0x0]\n"
- "str h27, [x23, #0x0]\n"
- "str h31, [x22, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
+ "str h31, [x21, #0x0]\n"
"b 292f\n"
"280:" // Height 6: Partial direct writeback: partial_4_16
"tbz x8, #2, 282f\n"
"str d10, [x16], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x8, #1, 281f\n"
"st1 { v10.s }[2], [x16], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
- "st1 { v22.s }[2], [x24], #0x4\n"
- "st1 { v26.s }[2], [x23], #0x4\n"
- "st1 { v30.s }[2], [x22], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
"tbz x8, #0, 292f\n"
"st1 { v10.h }[6], [x16]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
- "st1 { v22.h }[6], [x24]\n"
- "st1 { v26.h }[6], [x23]\n"
- "st1 { v30.h }[6], [x22]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
"b 292f\n"
"281:" // Height 6: Partial direct writeback: partial_1_20
"tbz x8, #0, 292f\n"
"st1 { v10.h }[4], [x16]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
- "st1 { v22.h }[4], [x24]\n"
- "st1 { v26.h }[4], [x23]\n"
- "st1 { v30.h }[4], [x22]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
"b 292f\n"
"282:" // Height 6: Partial direct writeback: partial_2_16
"tbz x8, #1, 283f\n"
"str s10, [x16], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
- "str s22, [x24], #0x4\n"
- "str s26, [x23], #0x4\n"
- "str s30, [x22], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
+ "str s30, [x21], #0x4\n"
"tbz x8, #0, 292f\n"
"st1 { v10.h }[2], [x16]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
- "st1 { v22.h }[2], [x24]\n"
- "st1 { v26.h }[2], [x23]\n"
- "st1 { v30.h }[2], [x22]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
"b 292f\n"
"283:" // Height 6: Partial direct writeback: partial_1_16
"tbz x8, #0, 292f\n"
"str h10, [x16, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
- "str h22, [x24, #0x0]\n"
- "str h26, [x23, #0x0]\n"
- "str h30, [x22, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
+ "str h30, [x21, #0x0]\n"
"b 292f\n"
"284:" // Height 6: Partial direct writeback: partial_8_0
"tbz x8, #3, 288f\n"
"st1 { v8.8h }, [x16], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v28.8h }, [x22], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
"tbz x8, #2, 286f\n"
"str d9, [x16], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x8, #1, 285f\n"
"st1 { v9.s }[2], [x16], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
- "st1 { v25.s }[2], [x23], #0x4\n"
- "st1 { v29.s }[2], [x22], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
+ "st1 { v29.s }[2], [x21], #0x4\n"
"tbz x8, #0, 292f\n"
"st1 { v9.h }[6], [x16]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
- "st1 { v21.h }[6], [x24]\n"
- "st1 { v25.h }[6], [x23]\n"
- "st1 { v29.h }[6], [x22]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "st1 { v29.h }[6], [x21]\n"
"b 292f\n"
"285:" // Height 6: Partial direct writeback: partial_1_12
"tbz x8, #0, 292f\n"
"st1 { v9.h }[4], [x16]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
- "st1 { v21.h }[4], [x24]\n"
- "st1 { v25.h }[4], [x23]\n"
- "st1 { v29.h }[4], [x22]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "st1 { v29.h }[4], [x21]\n"
"b 292f\n"
"286:" // Height 6: Partial direct writeback: partial_2_8
"tbz x8, #1, 287f\n"
"str s9, [x16], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
- "str s21, [x24], #0x4\n"
- "str s25, [x23], #0x4\n"
- "str s29, [x22], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
+ "str s29, [x21], #0x4\n"
"tbz x8, #0, 292f\n"
"st1 { v9.h }[2], [x16]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
- "st1 { v21.h }[2], [x24]\n"
- "st1 { v25.h }[2], [x23]\n"
- "st1 { v29.h }[2], [x22]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "st1 { v29.h }[2], [x21]\n"
"b 292f\n"
"287:" // Height 6: Partial direct writeback: partial_1_8
"tbz x8, #0, 292f\n"
"str h9, [x16, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
- "str h21, [x24, #0x0]\n"
- "str h25, [x23, #0x0]\n"
- "str h29, [x22, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
+ "str h29, [x21, #0x0]\n"
"b 292f\n"
"288:" // Height 6: Partial direct writeback: partial_4_0
"tbz x8, #2, 290f\n"
"str d8, [x16], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x8, #1, 289f\n"
"st1 { v8.s }[2], [x16], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x22], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x8, #0, 292f\n"
"st1 { v8.h }[6], [x16]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
- "st1 { v20.h }[6], [x24]\n"
- "st1 { v24.h }[6], [x23]\n"
- "st1 { v28.h }[6], [x22]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
+ "st1 { v28.h }[6], [x21]\n"
"b 292f\n"
"289:" // Height 6: Partial direct writeback: partial_1_4
"tbz x8, #0, 292f\n"
"st1 { v8.h }[4], [x16]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
- "st1 { v20.h }[4], [x24]\n"
- "st1 { v24.h }[4], [x23]\n"
- "st1 { v28.h }[4], [x22]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
+ "st1 { v28.h }[4], [x21]\n"
"b 292f\n"
"290:" // Height 6: Partial direct writeback: partial_2_0
"tbz x8, #1, 291f\n"
"str s8, [x16], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x22], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x8, #0, 292f\n"
"st1 { v8.h }[2], [x16]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
- "st1 { v20.h }[2], [x24]\n"
- "st1 { v24.h }[2], [x23]\n"
- "st1 { v28.h }[2], [x22]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
+ "st1 { v28.h }[2], [x21]\n"
"b 292f\n"
"291:" // Height 6: Partial direct writeback: partial_1_0
"str h8, [x16, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
- "str h20, [x24, #0x0]\n"
- "str h24, [x23, #0x0]\n"
- "str h28, [x22, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
+ "str h28, [x21, #0x0]\n"
"292:" // Height 6: Partial direct writeback: Done
"b 294f\n"
"293:" // Height 6: Full writeback
@@ -5711,26 +5708,26 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"str q10, [x16, #0x20]\n"
"str q11, [x16, #0x30]\n"
"add x16, x16, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x22, #0x0]\n"
- "str q29, [x22, #0x10]\n"
- "str q30, [x22, #0x20]\n"
- "str q31, [x22, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
"294:" // Height 6: Writeback done
"subs x8, x8, #0x20\n"
"bgt 247b\n"
@@ -5746,8 +5743,8 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"296:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
index 4e81b724eb..978b344f1a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
@@ -47,19 +47,18 @@ void a64_hybrid_fp16_mla_6x32 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const __fp16 *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void a64_hybrid_fp16_mla_6x32 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -103,10 +101,10 @@ void a64_hybrid_fp16_mla_6x32 (
"cmp %x[M], #0x2\n"
"bgt 99f\n"
"beq 50f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x12, 3f\n"
"ldr q8, [x12, #0x0]\n"
@@ -245,8 +243,8 @@ void a64_hybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"23:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 24f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -270,10 +268,6 @@ void a64_hybrid_fp16_mla_6x32 (
"ldr q17, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"ldr q16, [x10, #0x30]\n"
- "sub x27, x27, #0x8\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
"ldr q17, [x10, #0x40]\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
@@ -330,21 +324,22 @@ void a64_hybrid_fp16_mla_6x32 (
"ldr q17, [x10, #0x1e0]\n"
"fmla v9.8h, v16.8h, v0.h[7]\n"
"ldr q16, [x10, #0x1f0]\n"
- "add x10, x10, #0x200\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"fmla v10.8h, v17.8h, v0.h[7]\n"
- "ldr q6, [x10, #0x0]\n"
"fmla v11.8h, v16.8h, v0.h[7]\n"
"ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "add x10, x10, #0x200\n"
+ "ldr q6, [x10, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 26b\n"
"27:" // Height 1: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
"ldr q17, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
"ldr q17, [x10, #0x40]\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
@@ -401,23 +396,26 @@ void a64_hybrid_fp16_mla_6x32 (
"ldr q17, [x10, #0x1e0]\n"
"fmla v9.8h, v16.8h, v0.h[7]\n"
"ldr q16, [x10, #0x1f0]\n"
- "add x10, x10, #0x200\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x8\n"
"fmla v10.8h, v17.8h, v0.h[7]\n"
"fmla v11.8h, v16.8h, v0.h[7]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x200\n"
"28:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 30f\n"
"29:" // Height 1: Multiply loop: Odd block loop
"ldr h0, [x26], #0x2\n"
- "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v8.8h, v16.8h, v0.h[0]\n"
"sub x27, x27, #0x1\n"
- "ldr q16, [x10, #0x10]\n"
- "fmla v8.8h, v17.8h, v0.h[0]\n"
- "ldr q17, [x10, #0x20]\n"
- "fmla v9.8h, v16.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "ldr q16, [x10, #0x20]\n"
+ "fmla v9.8h, v17.8h, v0.h[0]\n"
+ "fmla v10.8h, v16.8h, v0.h[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- "fmla v10.8h, v17.8h, v0.h[0]\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
+ "add x10, x10, #0x40\n"
"cbnz x27, 29b\n"
"30:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -426,9 +424,9 @@ void a64_hybrid_fp16_mla_6x32 (
"bne 23b\n"
"prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 31f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.8h }, [x21]\n"
"ld1r { v16.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v17.8h\n"
"fmin v9.8h, v9.8h, v17.8h\n"
@@ -546,167 +544,167 @@ void a64_hybrid_fp16_mla_6x32 (
"bgt 2b\n"
"b 296f\n"
"50:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"51:" // Height 2: Column loop
"cbz x12, 52f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "add x12, x12, #0x40\n"
"b 71f\n"
"52:" // Height 2: no bias
"tbz %x[flags], #0, 70f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x11, #0x20\n"
- "add x26, x9, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"bge 69f\n"
"tbz x11, #4, 60f\n"
"ld1 { v8.8h }, [x9], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v9.8h }, [x9], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
"tbz x11, #3, 56f\n"
"ld1 { v10.8h }, [x9], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
"tbz x11, #2, 54f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"tbz x11, #1, 53f\n"
"ld1 { v11.s }[2], [x9], #0x4\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"mov x20, #0x3c\n"
"tbz x11, #0, 68f\n"
"ld1 { v11.h }[6], [x9]\n"
- "ld1 { v15.h }[6], [x26]\n"
+ "ld1 { v15.h }[6], [x25]\n"
"b 68f\n"
"53:" // Height 2: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x11, #0, 68f\n"
"ld1 { v11.h }[4], [x9]\n"
- "ld1 { v15.h }[4], [x26]\n"
+ "ld1 { v15.h }[4], [x25]\n"
"b 68f\n"
"54:" // Height 2: Partial accumulate: partial_2_24
"tbz x11, #1, 55f\n"
"ldr s11, [x9], #0x4\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"mov x20, #0x34\n"
"tbz x11, #0, 68f\n"
"ld1 { v11.h }[2], [x9]\n"
- "ld1 { v15.h }[2], [x26]\n"
+ "ld1 { v15.h }[2], [x25]\n"
"b 68f\n"
"55:" // Height 2: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x11, #0, 68f\n"
"ldr h11, [x9, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
"b 68f\n"
"56:" // Height 2: Partial accumulate: partial_4_16
"tbz x11, #2, 58f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"tbz x11, #1, 57f\n"
"ld1 { v10.s }[2], [x9], #0x4\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"mov x20, #0x2c\n"
"tbz x11, #0, 68f\n"
"ld1 { v10.h }[6], [x9]\n"
- "ld1 { v14.h }[6], [x26]\n"
+ "ld1 { v14.h }[6], [x25]\n"
"b 68f\n"
"57:" // Height 2: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x11, #0, 68f\n"
"ld1 { v10.h }[4], [x9]\n"
- "ld1 { v14.h }[4], [x26]\n"
+ "ld1 { v14.h }[4], [x25]\n"
"b 68f\n"
"58:" // Height 2: Partial accumulate: partial_2_16
"tbz x11, #1, 59f\n"
"ldr s10, [x9], #0x4\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"mov x20, #0x24\n"
"tbz x11, #0, 68f\n"
"ld1 { v10.h }[2], [x9]\n"
- "ld1 { v14.h }[2], [x26]\n"
+ "ld1 { v14.h }[2], [x25]\n"
"b 68f\n"
"59:" // Height 2: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x11, #0, 68f\n"
"ldr h10, [x9, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
"b 68f\n"
"60:" // Height 2: Partial accumulate: partial_8_0
"tbz x11, #3, 64f\n"
"ld1 { v8.8h }, [x9], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
"tbz x11, #2, 62f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"tbz x11, #1, 61f\n"
"ld1 { v9.s }[2], [x9], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"mov x20, #0x1c\n"
"tbz x11, #0, 68f\n"
"ld1 { v9.h }[6], [x9]\n"
- "ld1 { v13.h }[6], [x26]\n"
+ "ld1 { v13.h }[6], [x25]\n"
"b 68f\n"
"61:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x11, #0, 68f\n"
"ld1 { v9.h }[4], [x9]\n"
- "ld1 { v13.h }[4], [x26]\n"
+ "ld1 { v13.h }[4], [x25]\n"
"b 68f\n"
"62:" // Height 2: Partial accumulate: partial_2_8
"tbz x11, #1, 63f\n"
"ldr s9, [x9], #0x4\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"mov x20, #0x14\n"
"tbz x11, #0, 68f\n"
"ld1 { v9.h }[2], [x9]\n"
- "ld1 { v13.h }[2], [x26]\n"
+ "ld1 { v13.h }[2], [x25]\n"
"b 68f\n"
"63:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x11, #0, 68f\n"
"ldr h9, [x9, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
"b 68f\n"
"64:" // Height 2: Partial accumulate: partial_4_0
"tbz x11, #2, 66f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"tbz x11, #1, 65f\n"
"ld1 { v8.s }[2], [x9], #0x4\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"mov x20, #0xc\n"
"tbz x11, #0, 68f\n"
"ld1 { v8.h }[6], [x9]\n"
- "ld1 { v12.h }[6], [x26]\n"
+ "ld1 { v12.h }[6], [x25]\n"
"b 68f\n"
"65:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x11, #0, 68f\n"
"ld1 { v8.h }[4], [x9]\n"
- "ld1 { v12.h }[4], [x26]\n"
+ "ld1 { v12.h }[4], [x25]\n"
"b 68f\n"
"66:" // Height 2: Partial accumulate: partial_2_0
"tbz x11, #1, 67f\n"
"ldr s8, [x9], #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"mov x20, #0x4\n"
"tbz x11, #0, 68f\n"
"ld1 { v8.h }[2], [x9]\n"
- "ld1 { v12.h }[2], [x26]\n"
+ "ld1 { v12.h }[2], [x25]\n"
"b 68f\n"
"67:" // Height 2: Partial accumulate: partial_1_0
"ldr h8, [x9, #0x0]\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"mov x20, #0x0\n"
"68:" // Height 2: Partial accumulate: Done
"sub x9, x9, x20\n"
@@ -716,10 +714,10 @@ void a64_hybrid_fp16_mla_6x32 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"b 71f\n"
"70:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -734,8 +732,8 @@ void a64_hybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"72:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -767,22 +765,22 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v13.8h, v7.8h, v1.h[0]\n"
"ldr q16, [x10, #0x30]\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
"fmla v14.8h, v17.8h, v1.h[0]\n"
"ldr q17, [x10, #0x40]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
"fmla v15.8h, v16.8h, v1.h[0]\n"
"ldr q16, [x10, #0x50]\n"
+ "cmp x27, #0x10\n"
"fmla v8.8h, v17.8h, v0.h[1]\n"
"fmla v12.8h, v17.8h, v1.h[1]\n"
"ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v9.8h, v16.8h, v0.h[1]\n"
"fmla v13.8h, v16.8h, v1.h[1]\n"
"ldr q16, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.8h, v17.8h, v0.h[1]\n"
"fmla v14.8h, v17.8h, v1.h[1]\n"
"ldr q17, [x10, #0x80]\n"
@@ -874,18 +872,18 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v13.8h, v7.8h, v1.h[0]\n"
"ldr q16, [x10, #0x30]\n"
"add x25, x25, #0x10\n"
- "sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.8h, v17.8h, v0.h[0]\n"
"fmla v14.8h, v17.8h, v1.h[0]\n"
"ldr q17, [x10, #0x40]\n"
+ "sub x27, x27, #0x8\n"
"fmla v11.8h, v16.8h, v0.h[0]\n"
"fmla v15.8h, v16.8h, v1.h[0]\n"
"ldr q16, [x10, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v8.8h, v17.8h, v0.h[1]\n"
"fmla v12.8h, v17.8h, v1.h[1]\n"
"ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v9.8h, v16.8h, v0.h[1]\n"
"fmla v13.8h, v16.8h, v1.h[1]\n"
"ldr q16, [x10, #0x70]\n"
@@ -980,9 +978,9 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v9.8h, v16.8h, v1.h[0]\n"
"fmla v13.8h, v16.8h, v0.h[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
"fmla v10.8h, v17.8h, v1.h[0]\n"
"fmla v14.8h, v17.8h, v0.h[0]\n"
+ "add x10, x10, #0x40\n"
"fmla v11.8h, v16.8h, v1.h[0]\n"
"fmla v15.8h, v16.8h, v0.h[0]\n"
"cbnz x27, 78b\n"
@@ -992,13 +990,13 @@ void a64_hybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 72b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #1\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.8h }, [x21]\n"
"ld1r { v16.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v17.8h\n"
"fmin v9.8h, v9.8h, v17.8h\n"
@@ -1022,127 +1020,127 @@ void a64_hybrid_fp16_mla_6x32 (
"tbz x11, #4, 88f\n"
"st1 { v8.8h }, [x9], #0x10\n"
"st1 { v9.8h }, [x9], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
"tbz x11, #3, 84f\n"
"st1 { v10.8h }, [x9], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
"tbz x11, #2, 82f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
+ "str d15, [x25], #0x8\n"
"tbz x11, #1, 81f\n"
"st1 { v11.s }[2], [x9], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
"tbz x11, #0, 96f\n"
"st1 { v11.h }[6], [x9]\n"
- "st1 { v15.h }[6], [x26]\n"
+ "st1 { v15.h }[6], [x25]\n"
"b 96f\n"
"81:" // Height 2: Partial direct writeback: partial_1_28
"tbz x11, #0, 96f\n"
"st1 { v11.h }[4], [x9]\n"
- "st1 { v15.h }[4], [x26]\n"
+ "st1 { v15.h }[4], [x25]\n"
"b 96f\n"
"82:" // Height 2: Partial direct writeback: partial_2_24
"tbz x11, #1, 83f\n"
"str s11, [x9], #0x4\n"
- "str s15, [x26], #0x4\n"
+ "str s15, [x25], #0x4\n"
"tbz x11, #0, 96f\n"
"st1 { v11.h }[2], [x9]\n"
- "st1 { v15.h }[2], [x26]\n"
+ "st1 { v15.h }[2], [x25]\n"
"b 96f\n"
"83:" // Height 2: Partial direct writeback: partial_1_24
"tbz x11, #0, 96f\n"
"str h11, [x9, #0x0]\n"
- "str h15, [x26, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
"b 96f\n"
"84:" // Height 2: Partial direct writeback: partial_4_16
"tbz x11, #2, 86f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
+ "str d14, [x25], #0x8\n"
"tbz x11, #1, 85f\n"
"st1 { v10.s }[2], [x9], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
"tbz x11, #0, 96f\n"
"st1 { v10.h }[6], [x9]\n"
- "st1 { v14.h }[6], [x26]\n"
+ "st1 { v14.h }[6], [x25]\n"
"b 96f\n"
"85:" // Height 2: Partial direct writeback: partial_1_20
"tbz x11, #0, 96f\n"
"st1 { v10.h }[4], [x9]\n"
- "st1 { v14.h }[4], [x26]\n"
+ "st1 { v14.h }[4], [x25]\n"
"b 96f\n"
"86:" // Height 2: Partial direct writeback: partial_2_16
"tbz x11, #1, 87f\n"
"str s10, [x9], #0x4\n"
- "str s14, [x26], #0x4\n"
+ "str s14, [x25], #0x4\n"
"tbz x11, #0, 96f\n"
"st1 { v10.h }[2], [x9]\n"
- "st1 { v14.h }[2], [x26]\n"
+ "st1 { v14.h }[2], [x25]\n"
"b 96f\n"
"87:" // Height 2: Partial direct writeback: partial_1_16
"tbz x11, #0, 96f\n"
"str h10, [x9, #0x0]\n"
- "str h14, [x26, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
"b 96f\n"
"88:" // Height 2: Partial direct writeback: partial_8_0
"tbz x11, #3, 92f\n"
"st1 { v8.8h }, [x9], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
"tbz x11, #2, 90f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
+ "str d13, [x25], #0x8\n"
"tbz x11, #1, 89f\n"
"st1 { v9.s }[2], [x9], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
"tbz x11, #0, 96f\n"
"st1 { v9.h }[6], [x9]\n"
- "st1 { v13.h }[6], [x26]\n"
+ "st1 { v13.h }[6], [x25]\n"
"b 96f\n"
"89:" // Height 2: Partial direct writeback: partial_1_12
"tbz x11, #0, 96f\n"
"st1 { v9.h }[4], [x9]\n"
- "st1 { v13.h }[4], [x26]\n"
+ "st1 { v13.h }[4], [x25]\n"
"b 96f\n"
"90:" // Height 2: Partial direct writeback: partial_2_8
"tbz x11, #1, 91f\n"
"str s9, [x9], #0x4\n"
- "str s13, [x26], #0x4\n"
+ "str s13, [x25], #0x4\n"
"tbz x11, #0, 96f\n"
"st1 { v9.h }[2], [x9]\n"
- "st1 { v13.h }[2], [x26]\n"
+ "st1 { v13.h }[2], [x25]\n"
"b 96f\n"
"91:" // Height 2: Partial direct writeback: partial_1_8
"tbz x11, #0, 96f\n"
"str h9, [x9, #0x0]\n"
- "str h13, [x26, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
"b 96f\n"
"92:" // Height 2: Partial direct writeback: partial_4_0
"tbz x11, #2, 94f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x11, #1, 93f\n"
"st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
"tbz x11, #0, 96f\n"
"st1 { v8.h }[6], [x9]\n"
- "st1 { v12.h }[6], [x26]\n"
+ "st1 { v12.h }[6], [x25]\n"
"b 96f\n"
"93:" // Height 2: Partial direct writeback: partial_1_4
"tbz x11, #0, 96f\n"
"st1 { v8.h }[4], [x9]\n"
- "st1 { v12.h }[4], [x26]\n"
+ "st1 { v12.h }[4], [x25]\n"
"b 96f\n"
"94:" // Height 2: Partial direct writeback: partial_2_0
"tbz x11, #1, 95f\n"
"str s8, [x9], #0x4\n"
- "str s12, [x26], #0x4\n"
+ "str s12, [x25], #0x4\n"
"tbz x11, #0, 96f\n"
"st1 { v8.h }[2], [x9]\n"
- "st1 { v12.h }[2], [x26]\n"
+ "st1 { v12.h }[2], [x25]\n"
"b 96f\n"
"95:" // Height 2: Partial direct writeback: partial_1_0
"str h8, [x9, #0x0]\n"
- "str h12, [x26, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
"96:" // Height 2: Partial direct writeback: Done
"b 98f\n"
"97:" // Height 2: Full writeback
@@ -1151,214 +1149,214 @@ void a64_hybrid_fp16_mla_6x32 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
"98:" // Height 2: Writeback done
"subs x11, x11, #0x20\n"
"bgt 51b\n"
"b 296f\n"
"99:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"100:" // Height 3: Column loop
"cbz x12, 101f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 120f\n"
"101:" // Height 3: no bias
"tbz %x[flags], #0, 119f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
"cmp x11, #0x20\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"bge 118f\n"
"tbz x11, #4, 109f\n"
"ld1 { v8.8h }, [x9], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
"ld1 { v9.8h }, [x9], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
"tbz x11, #3, 105f\n"
"ld1 { v10.8h }, [x9], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
"tbz x11, #2, 103f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x11, #1, 102f\n"
"ld1 { v11.s }[2], [x9], #0x4\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
"tbz x11, #0, 117f\n"
"ld1 { v11.h }[6], [x9]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
"b 117f\n"
"102:" // Height 3: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x11, #0, 117f\n"
"ld1 { v11.h }[4], [x9]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
"b 117f\n"
"103:" // Height 3: Partial accumulate: partial_2_24
"tbz x11, #1, 104f\n"
"ldr s11, [x9], #0x4\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"mov x20, #0x34\n"
- "ldr s19, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
"tbz x11, #0, 117f\n"
"ld1 { v11.h }[2], [x9]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
"b 117f\n"
"104:" // Height 3: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x11, #0, 117f\n"
"ldr h11, [x9, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
"b 117f\n"
"105:" // Height 3: Partial accumulate: partial_4_16
"tbz x11, #2, 107f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x11, #1, 106f\n"
"ld1 { v10.s }[2], [x9], #0x4\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
"tbz x11, #0, 117f\n"
"ld1 { v10.h }[6], [x9]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
"b 117f\n"
"106:" // Height 3: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x11, #0, 117f\n"
"ld1 { v10.h }[4], [x9]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
"b 117f\n"
"107:" // Height 3: Partial accumulate: partial_2_16
"tbz x11, #1, 108f\n"
"ldr s10, [x9], #0x4\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"mov x20, #0x24\n"
- "ldr s18, [x25], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
"tbz x11, #0, 117f\n"
"ld1 { v10.h }[2], [x9]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
"b 117f\n"
"108:" // Height 3: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x11, #0, 117f\n"
"ldr h10, [x9, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
"b 117f\n"
"109:" // Height 3: Partial accumulate: partial_8_0
"tbz x11, #3, 113f\n"
"ld1 { v8.8h }, [x9], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
"tbz x11, #2, 111f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x11, #1, 110f\n"
"ld1 { v9.s }[2], [x9], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
"tbz x11, #0, 117f\n"
"ld1 { v9.h }[6], [x9]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
"b 117f\n"
"110:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x11, #0, 117f\n"
"ld1 { v9.h }[4], [x9]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
"b 117f\n"
"111:" // Height 3: Partial accumulate: partial_2_8
"tbz x11, #1, 112f\n"
"ldr s9, [x9], #0x4\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"mov x20, #0x14\n"
- "ldr s17, [x25], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
"tbz x11, #0, 117f\n"
"ld1 { v9.h }[2], [x9]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
"b 117f\n"
"112:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x11, #0, 117f\n"
"ldr h9, [x9, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
"b 117f\n"
"113:" // Height 3: Partial accumulate: partial_4_0
"tbz x11, #2, 115f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
"tbz x11, #1, 114f\n"
"ld1 { v8.s }[2], [x9], #0x4\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
"tbz x11, #0, 117f\n"
"ld1 { v8.h }[6], [x9]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
"b 117f\n"
"114:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x11, #0, 117f\n"
"ld1 { v8.h }[4], [x9]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
"b 117f\n"
"115:" // Height 3: Partial accumulate: partial_2_0
"tbz x11, #1, 116f\n"
"ldr s8, [x9], #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"mov x20, #0x4\n"
- "ldr s16, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
"tbz x11, #0, 117f\n"
"ld1 { v8.h }[2], [x9]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
"b 117f\n"
"116:" // Height 3: Partial accumulate: partial_1_0
"ldr h8, [x9, #0x0]\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h16, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
"117:" // Height 3: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 120f\n"
@@ -1367,14 +1365,14 @@ void a64_hybrid_fp16_mla_6x32 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
"b 120f\n"
"119:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -1393,8 +1391,8 @@ void a64_hybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"121:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 122f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1434,18 +1432,18 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v17.8h, v7.8h, v2.h[0]\n"
"ldr q20, [x10, #0x30]\n"
"add x24, x24, #0x10\n"
- "cmp x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.8h, v21.8h, v0.h[0]\n"
"fmla v14.8h, v21.8h, v1.h[0]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v18.8h, v21.8h, v2.h[0]\n"
"ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v11.8h, v20.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v15.8h, v20.8h, v1.h[0]\n"
"fmla v19.8h, v20.8h, v2.h[0]\n"
"ldr q20, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v8.8h, v21.8h, v0.h[1]\n"
"fmla v12.8h, v21.8h, v1.h[1]\n"
"fmla v16.8h, v21.8h, v2.h[1]\n"
@@ -1576,14 +1574,14 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v17.8h, v7.8h, v2.h[0]\n"
"ldr q20, [x10, #0x30]\n"
"sub x27, x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.8h, v21.8h, v0.h[0]\n"
"fmla v14.8h, v21.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v18.8h, v21.8h, v2.h[0]\n"
"ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v11.8h, v20.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v15.8h, v20.8h, v1.h[0]\n"
"fmla v19.8h, v20.8h, v2.h[0]\n"
"ldr q20, [x10, #0x50]\n"
@@ -1706,9 +1704,9 @@ void a64_hybrid_fp16_mla_6x32 (
"sub x27, x27, #0x1\n"
"ldr h0, [x24], #0x2\n"
"ldr q21, [x10, #0x0]\n"
- "ldr q20, [x10, #0x10]\n"
"fmla v8.8h, v21.8h, v2.h[0]\n"
"fmla v12.8h, v21.8h, v1.h[0]\n"
+ "ldr q20, [x10, #0x10]\n"
"fmla v16.8h, v21.8h, v0.h[0]\n"
"ldr q21, [x10, #0x20]\n"
"fmla v9.8h, v20.8h, v2.h[0]\n"
@@ -1729,15 +1727,15 @@ void a64_hybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 121b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #1\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #1\n"
"prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 129f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v21.8h }, [x21]\n"
"ld1r { v20.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v21.8h\n"
"fmin v9.8h, v9.8h, v21.8h\n"
@@ -1769,159 +1767,159 @@ void a64_hybrid_fp16_mla_6x32 (
"tbz x11, #4, 137f\n"
"st1 { v8.8h }, [x9], #0x10\n"
"st1 { v9.8h }, [x9], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
"tbz x11, #3, 133f\n"
"st1 { v10.8h }, [x9], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
"tbz x11, #2, 131f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x11, #1, 130f\n"
"st1 { v11.s }[2], [x9], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
"tbz x11, #0, 145f\n"
"st1 { v11.h }[6], [x9]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
"b 145f\n"
"130:" // Height 3: Partial direct writeback: partial_1_28
"tbz x11, #0, 145f\n"
"st1 { v11.h }[4], [x9]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
"b 145f\n"
"131:" // Height 3: Partial direct writeback: partial_2_24
"tbz x11, #1, 132f\n"
"str s11, [x9], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
"tbz x11, #0, 145f\n"
"st1 { v11.h }[2], [x9]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
"b 145f\n"
"132:" // Height 3: Partial direct writeback: partial_1_24
"tbz x11, #0, 145f\n"
"str h11, [x9, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
"b 145f\n"
"133:" // Height 3: Partial direct writeback: partial_4_16
"tbz x11, #2, 135f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x11, #1, 134f\n"
"st1 { v10.s }[2], [x9], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
"tbz x11, #0, 145f\n"
"st1 { v10.h }[6], [x9]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
"b 145f\n"
"134:" // Height 3: Partial direct writeback: partial_1_20
"tbz x11, #0, 145f\n"
"st1 { v10.h }[4], [x9]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
"b 145f\n"
"135:" // Height 3: Partial direct writeback: partial_2_16
"tbz x11, #1, 136f\n"
"str s10, [x9], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
"tbz x11, #0, 145f\n"
"st1 { v10.h }[2], [x9]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
"b 145f\n"
"136:" // Height 3: Partial direct writeback: partial_1_16
"tbz x11, #0, 145f\n"
"str h10, [x9, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
"b 145f\n"
"137:" // Height 3: Partial direct writeback: partial_8_0
"tbz x11, #3, 141f\n"
"st1 { v8.8h }, [x9], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
"tbz x11, #2, 139f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x11, #1, 138f\n"
"st1 { v9.s }[2], [x9], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
"tbz x11, #0, 145f\n"
"st1 { v9.h }[6], [x9]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
"b 145f\n"
"138:" // Height 3: Partial direct writeback: partial_1_12
"tbz x11, #0, 145f\n"
"st1 { v9.h }[4], [x9]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
"b 145f\n"
"139:" // Height 3: Partial direct writeback: partial_2_8
"tbz x11, #1, 140f\n"
"str s9, [x9], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
"tbz x11, #0, 145f\n"
"st1 { v9.h }[2], [x9]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
"b 145f\n"
"140:" // Height 3: Partial direct writeback: partial_1_8
"tbz x11, #0, 145f\n"
"str h9, [x9, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
"b 145f\n"
"141:" // Height 3: Partial direct writeback: partial_4_0
"tbz x11, #2, 143f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x11, #1, 142f\n"
"st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
"tbz x11, #0, 145f\n"
"st1 { v8.h }[6], [x9]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
"b 145f\n"
"142:" // Height 3: Partial direct writeback: partial_1_4
"tbz x11, #0, 145f\n"
"st1 { v8.h }[4], [x9]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
"b 145f\n"
"143:" // Height 3: Partial direct writeback: partial_2_0
"tbz x11, #1, 144f\n"
"str s8, [x9], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
"tbz x11, #0, 145f\n"
"st1 { v8.h }[2], [x9]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
"b 145f\n"
"144:" // Height 3: Partial direct writeback: partial_1_0
"str h8, [x9, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
"145:" // Height 3: Partial direct writeback: Done
"b 147f\n"
"146:" // Height 3: Full writeback
@@ -1930,39 +1928,39 @@ void a64_hybrid_fp16_mla_6x32 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"147:" // Height 3: Writeback done
"subs x11, x11, #0x20\n"
"bgt 100b\n"
"b 296f\n"
"148:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"149:" // Height 4: Column loop
"cbz x12, 150f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -1970,215 +1968,215 @@ void a64_hybrid_fp16_mla_6x32 (
"150:" // Height 4: no bias
"tbz %x[flags], #0, 168f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x20\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
+ "cmp x11, #0x20\n"
+ "add x23, x24, x20, LSL #1\n"
"bge 167f\n"
"tbz x11, #4, 158f\n"
"ld1 { v8.8h }, [x9], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
"ld1 { v9.8h }, [x9], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
- "ld1 { v21.8h }, [x24], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
"tbz x11, #3, 154f\n"
"ld1 { v10.8h }, [x9], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
- "ld1 { v22.8h }, [x24], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
"tbz x11, #2, 152f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x11, #1, 151f\n"
"ld1 { v11.s }[2], [x9], #0x4\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
"tbz x11, #0, 166f\n"
"ld1 { v11.h }[6], [x9]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
"b 166f\n"
"151:" // Height 4: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x11, #0, 166f\n"
"ld1 { v11.h }[4], [x9]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
"b 166f\n"
"152:" // Height 4: Partial accumulate: partial_2_24
"tbz x11, #1, 153f\n"
"ldr s11, [x9], #0x4\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"mov x20, #0x34\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
"tbz x11, #0, 166f\n"
"ld1 { v11.h }[2], [x9]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
"b 166f\n"
"153:" // Height 4: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x11, #0, 166f\n"
"ldr h11, [x9, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
"b 166f\n"
"154:" // Height 4: Partial accumulate: partial_4_16
"tbz x11, #2, 156f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x11, #1, 155f\n"
"ld1 { v10.s }[2], [x9], #0x4\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
"tbz x11, #0, 166f\n"
"ld1 { v10.h }[6], [x9]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
- "ld1 { v22.h }[6], [x24]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
"b 166f\n"
"155:" // Height 4: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x11, #0, 166f\n"
"ld1 { v10.h }[4], [x9]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
- "ld1 { v22.h }[4], [x24]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
"b 166f\n"
"156:" // Height 4: Partial accumulate: partial_2_16
"tbz x11, #1, 157f\n"
"ldr s10, [x9], #0x4\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"mov x20, #0x24\n"
- "ldr s18, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
"tbz x11, #0, 166f\n"
"ld1 { v10.h }[2], [x9]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
- "ld1 { v22.h }[2], [x24]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
"b 166f\n"
"157:" // Height 4: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x11, #0, 166f\n"
"ldr h10, [x9, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
- "ldr h22, [x24, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
"b 166f\n"
"158:" // Height 4: Partial accumulate: partial_8_0
"tbz x11, #3, 162f\n"
"ld1 { v8.8h }, [x9], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
"tbz x11, #2, 160f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x11, #1, 159f\n"
"ld1 { v9.s }[2], [x9], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
"tbz x11, #0, 166f\n"
"ld1 { v9.h }[6], [x9]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
"b 166f\n"
"159:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x11, #0, 166f\n"
"ld1 { v9.h }[4], [x9]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
"b 166f\n"
"160:" // Height 4: Partial accumulate: partial_2_8
"tbz x11, #1, 161f\n"
"ldr s9, [x9], #0x4\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"mov x20, #0x14\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
"tbz x11, #0, 166f\n"
"ld1 { v9.h }[2], [x9]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
"b 166f\n"
"161:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x11, #0, 166f\n"
"ldr h9, [x9, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
"b 166f\n"
"162:" // Height 4: Partial accumulate: partial_4_0
"tbz x11, #2, 164f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x11, #1, 163f\n"
"ld1 { v8.s }[2], [x9], #0x4\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
"tbz x11, #0, 166f\n"
"ld1 { v8.h }[6], [x9]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
- "ld1 { v20.h }[6], [x24]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
"b 166f\n"
"163:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x11, #0, 166f\n"
"ld1 { v8.h }[4], [x9]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
- "ld1 { v20.h }[4], [x24]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
"b 166f\n"
"164:" // Height 4: Partial accumulate: partial_2_0
"tbz x11, #1, 165f\n"
"ldr s8, [x9], #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"mov x20, #0x4\n"
- "ldr s16, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
"tbz x11, #0, 166f\n"
"ld1 { v8.h }[2], [x9]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
- "ld1 { v20.h }[2], [x24]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
"b 166f\n"
"165:" // Height 4: Partial accumulate: partial_1_0
"ldr h8, [x9, #0x0]\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h16, [x25, #0x0]\n"
- "ldr h20, [x24, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
"166:" // Height 4: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 169f\n"
@@ -2187,18 +2185,18 @@ void a64_hybrid_fp16_mla_6x32 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"b 169f\n"
"168:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -2221,8 +2219,8 @@ void a64_hybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"170:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 171f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2442,14 +2440,14 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"ldr q24, [x10, #0x30]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v10.8h, v25.8h, v0.h[0]\n"
"fmla v14.8h, v25.8h, v1.h[0]\n"
- "sub x27, x27, #0x8\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"fmla v18.8h, v25.8h, v2.h[0]\n"
"fmla v22.8h, v25.8h, v3.h[0]\n"
@@ -2635,17 +2633,17 @@ void a64_hybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 170b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #1\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x20, LSL #1\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 178f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v25.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v25.8h }, [x21]\n"
"ld1r { v24.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v25.8h\n"
"fmin v9.8h, v9.8h, v25.8h\n"
@@ -2685,191 +2683,191 @@ void a64_hybrid_fp16_mla_6x32 (
"tbz x11, #4, 186f\n"
"st1 { v8.8h }, [x9], #0x10\n"
"st1 { v9.8h }, [x9], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v21.8h }, [x24], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
"tbz x11, #3, 182f\n"
"st1 { v10.8h }, [x9], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
- "st1 { v22.8h }, [x24], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
"tbz x11, #2, 180f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
"tbz x11, #1, 179f\n"
"st1 { v11.s }[2], [x9], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
- "st1 { v23.s }[2], [x24], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
"tbz x11, #0, 194f\n"
"st1 { v11.h }[6], [x9]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
- "st1 { v23.h }[6], [x24]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
"b 194f\n"
"179:" // Height 4: Partial direct writeback: partial_1_28
"tbz x11, #0, 194f\n"
"st1 { v11.h }[4], [x9]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
- "st1 { v23.h }[4], [x24]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
"b 194f\n"
"180:" // Height 4: Partial direct writeback: partial_2_24
"tbz x11, #1, 181f\n"
"str s11, [x9], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
- "str s23, [x24], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
"tbz x11, #0, 194f\n"
"st1 { v11.h }[2], [x9]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
- "st1 { v23.h }[2], [x24]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
"b 194f\n"
"181:" // Height 4: Partial direct writeback: partial_1_24
"tbz x11, #0, 194f\n"
"str h11, [x9, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
- "str h23, [x24, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
"b 194f\n"
"182:" // Height 4: Partial direct writeback: partial_4_16
"tbz x11, #2, 184f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
"tbz x11, #1, 183f\n"
"st1 { v10.s }[2], [x9], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
- "st1 { v22.s }[2], [x24], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
"tbz x11, #0, 194f\n"
"st1 { v10.h }[6], [x9]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
- "st1 { v22.h }[6], [x24]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
"b 194f\n"
"183:" // Height 4: Partial direct writeback: partial_1_20
"tbz x11, #0, 194f\n"
"st1 { v10.h }[4], [x9]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
- "st1 { v22.h }[4], [x24]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
"b 194f\n"
"184:" // Height 4: Partial direct writeback: partial_2_16
"tbz x11, #1, 185f\n"
"str s10, [x9], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
- "str s22, [x24], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
"tbz x11, #0, 194f\n"
"st1 { v10.h }[2], [x9]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
- "st1 { v22.h }[2], [x24]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
"b 194f\n"
"185:" // Height 4: Partial direct writeback: partial_1_16
"tbz x11, #0, 194f\n"
"str h10, [x9, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
- "str h22, [x24, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
"b 194f\n"
"186:" // Height 4: Partial direct writeback: partial_8_0
"tbz x11, #3, 190f\n"
"st1 { v8.8h }, [x9], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
"tbz x11, #2, 188f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
"tbz x11, #1, 187f\n"
"st1 { v9.s }[2], [x9], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
"tbz x11, #0, 194f\n"
"st1 { v9.h }[6], [x9]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
- "st1 { v21.h }[6], [x24]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
"b 194f\n"
"187:" // Height 4: Partial direct writeback: partial_1_12
"tbz x11, #0, 194f\n"
"st1 { v9.h }[4], [x9]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
- "st1 { v21.h }[4], [x24]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
"b 194f\n"
"188:" // Height 4: Partial direct writeback: partial_2_8
"tbz x11, #1, 189f\n"
"str s9, [x9], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
- "str s21, [x24], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
"tbz x11, #0, 194f\n"
"st1 { v9.h }[2], [x9]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
- "st1 { v21.h }[2], [x24]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
"b 194f\n"
"189:" // Height 4: Partial direct writeback: partial_1_8
"tbz x11, #0, 194f\n"
"str h9, [x9, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
- "str h21, [x24, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
"b 194f\n"
"190:" // Height 4: Partial direct writeback: partial_4_0
"tbz x11, #2, 192f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x11, #1, 191f\n"
"st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x11, #0, 194f\n"
"st1 { v8.h }[6], [x9]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
- "st1 { v20.h }[6], [x24]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
"b 194f\n"
"191:" // Height 4: Partial direct writeback: partial_1_4
"tbz x11, #0, 194f\n"
"st1 { v8.h }[4], [x9]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
- "st1 { v20.h }[4], [x24]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
"b 194f\n"
"192:" // Height 4: Partial direct writeback: partial_2_0
"tbz x11, #1, 193f\n"
"str s8, [x9], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x11, #0, 194f\n"
"st1 { v8.h }[2], [x9]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
- "st1 { v20.h }[2], [x24]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
"b 194f\n"
"193:" // Height 4: Partial direct writeback: partial_1_0
"str h8, [x9, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
- "str h20, [x24, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
"194:" // Height 4: Partial direct writeback: Done
"b 196f\n"
"195:" // Height 4: Full writeback
@@ -2878,43 +2876,43 @@ void a64_hybrid_fp16_mla_6x32 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
"196:" // Height 4: Writeback done
"subs x11, x11, #0x20\n"
"bgt 149b\n"
"b 296f\n"
"197:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"198:" // Height 5: Column loop
"cbz x12, 199f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -2926,248 +2924,248 @@ void a64_hybrid_fp16_mla_6x32 (
"199:" // Height 5: no bias
"tbz %x[flags], #0, 217f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x20\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "cmp x11, #0x20\n"
+ "add x22, x23, x20, LSL #1\n"
"bge 216f\n"
"tbz x11, #4, 207f\n"
"ld1 { v8.8h }, [x9], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
"ld1 { v9.8h }, [x9], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
- "ld1 { v21.8h }, [x24], #0x10\n"
- "ld1 { v25.8h }, [x23], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
"tbz x11, #3, 203f\n"
"ld1 { v10.8h }, [x9], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
- "ld1 { v22.8h }, [x24], #0x10\n"
- "ld1 { v26.8h }, [x23], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
"tbz x11, #2, 201f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x11, #1, 200f\n"
"ld1 { v11.s }[2], [x9], #0x4\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
"tbz x11, #0, 215f\n"
"ld1 { v11.h }[6], [x9]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
- "ld1 { v27.h }[6], [x23]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
"b 215f\n"
"200:" // Height 5: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x11, #0, 215f\n"
"ld1 { v11.h }[4], [x9]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
- "ld1 { v27.h }[4], [x23]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
"b 215f\n"
"201:" // Height 5: Partial accumulate: partial_2_24
"tbz x11, #1, 202f\n"
"ldr s11, [x9], #0x4\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"mov x20, #0x34\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
"tbz x11, #0, 215f\n"
"ld1 { v11.h }[2], [x9]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
- "ld1 { v27.h }[2], [x23]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
"b 215f\n"
"202:" // Height 5: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x11, #0, 215f\n"
"ldr h11, [x9, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
"b 215f\n"
"203:" // Height 5: Partial accumulate: partial_4_16
"tbz x11, #2, 205f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x11, #1, 204f\n"
"ld1 { v10.s }[2], [x9], #0x4\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
"tbz x11, #0, 215f\n"
"ld1 { v10.h }[6], [x9]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
- "ld1 { v22.h }[6], [x24]\n"
- "ld1 { v26.h }[6], [x23]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
"b 215f\n"
"204:" // Height 5: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x11, #0, 215f\n"
"ld1 { v10.h }[4], [x9]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
- "ld1 { v22.h }[4], [x24]\n"
- "ld1 { v26.h }[4], [x23]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
"b 215f\n"
"205:" // Height 5: Partial accumulate: partial_2_16
"tbz x11, #1, 206f\n"
"ldr s10, [x9], #0x4\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"mov x20, #0x24\n"
- "ldr s18, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
"tbz x11, #0, 215f\n"
"ld1 { v10.h }[2], [x9]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
- "ld1 { v22.h }[2], [x24]\n"
- "ld1 { v26.h }[2], [x23]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
"b 215f\n"
"206:" // Height 5: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x11, #0, 215f\n"
"ldr h10, [x9, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
- "ldr h22, [x24, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
"b 215f\n"
"207:" // Height 5: Partial accumulate: partial_8_0
"tbz x11, #3, 211f\n"
"ld1 { v8.8h }, [x9], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
"tbz x11, #2, 209f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x11, #1, 208f\n"
"ld1 { v9.s }[2], [x9], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
"tbz x11, #0, 215f\n"
"ld1 { v9.h }[6], [x9]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
- "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
"b 215f\n"
"208:" // Height 5: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x11, #0, 215f\n"
"ld1 { v9.h }[4], [x9]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
- "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
"b 215f\n"
"209:" // Height 5: Partial accumulate: partial_2_8
"tbz x11, #1, 210f\n"
"ldr s9, [x9], #0x4\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"mov x20, #0x14\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
"tbz x11, #0, 215f\n"
"ld1 { v9.h }[2], [x9]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
- "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
"b 215f\n"
"210:" // Height 5: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x11, #0, 215f\n"
"ldr h9, [x9, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
"b 215f\n"
"211:" // Height 5: Partial accumulate: partial_4_0
"tbz x11, #2, 213f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x11, #1, 212f\n"
"ld1 { v8.s }[2], [x9], #0x4\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
"tbz x11, #0, 215f\n"
"ld1 { v8.h }[6], [x9]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
- "ld1 { v20.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
"b 215f\n"
"212:" // Height 5: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x11, #0, 215f\n"
"ld1 { v8.h }[4], [x9]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
- "ld1 { v20.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
"b 215f\n"
"213:" // Height 5: Partial accumulate: partial_2_0
"tbz x11, #1, 214f\n"
"ldr s8, [x9], #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"mov x20, #0x4\n"
- "ldr s16, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
"tbz x11, #0, 215f\n"
"ld1 { v8.h }[2], [x9]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
- "ld1 { v20.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
"b 215f\n"
"214:" // Height 5: Partial accumulate: partial_1_0
"ldr h8, [x9, #0x0]\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h16, [x25, #0x0]\n"
- "ldr h20, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
"215:" // Height 5: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 218f\n"
@@ -3176,22 +3174,22 @@ void a64_hybrid_fp16_mla_6x32 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
"b 218f\n"
"217:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
@@ -3218,8 +3216,8 @@ void a64_hybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"219:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 220f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -3481,12 +3479,12 @@ void a64_hybrid_fp16_mla_6x32 (
"add x22, x22, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
+ "sub x27, x27, #0x8\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
"ldr q28, [x10, #0x30]\n"
- "sub x27, x27, #0x8\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.8h, v29.8h, v0.h[0]\n"
"fmla v14.8h, v29.8h, v1.h[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
@@ -3679,9 +3677,9 @@ void a64_hybrid_fp16_mla_6x32 (
"ldr h1, [x23], #0x2\n"
"ldr h0, [x22], #0x2\n"
"ldr q29, [x10, #0x0]\n"
- "ldr q28, [x10, #0x10]\n"
"fmla v8.8h, v29.8h, v4.h[0]\n"
"fmla v12.8h, v29.8h, v3.h[0]\n"
+ "ldr q28, [x10, #0x10]\n"
"fmla v16.8h, v29.8h, v2.h[0]\n"
"fmla v20.8h, v29.8h, v1.h[0]\n"
"fmla v24.8h, v29.8h, v0.h[0]\n"
@@ -3710,19 +3708,19 @@ void a64_hybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 219b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #1\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x20, LSL #1\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20, LSL #1\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 227f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v29.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v29.8h }, [x21]\n"
"ld1r { v28.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v29.8h\n"
"fmin v9.8h, v9.8h, v29.8h\n"
@@ -3770,223 +3768,223 @@ void a64_hybrid_fp16_mla_6x32 (
"tbz x11, #4, 235f\n"
"st1 { v8.8h }, [x9], #0x10\n"
"st1 { v9.8h }, [x9], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v21.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v25.8h }, [x23], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
"tbz x11, #3, 231f\n"
"st1 { v10.8h }, [x9], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
- "st1 { v22.8h }, [x24], #0x10\n"
- "st1 { v26.8h }, [x23], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
"tbz x11, #2, 229f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x11, #1, 228f\n"
"st1 { v11.s }[2], [x9], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
- "st1 { v23.s }[2], [x24], #0x4\n"
- "st1 { v27.s }[2], [x23], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
"tbz x11, #0, 243f\n"
"st1 { v11.h }[6], [x9]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
- "st1 { v23.h }[6], [x24]\n"
- "st1 { v27.h }[6], [x23]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
"b 243f\n"
"228:" // Height 5: Partial direct writeback: partial_1_28
"tbz x11, #0, 243f\n"
"st1 { v11.h }[4], [x9]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
- "st1 { v23.h }[4], [x24]\n"
- "st1 { v27.h }[4], [x23]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
"b 243f\n"
"229:" // Height 5: Partial direct writeback: partial_2_24
"tbz x11, #1, 230f\n"
"str s11, [x9], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
- "str s23, [x24], #0x4\n"
- "str s27, [x23], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
"tbz x11, #0, 243f\n"
"st1 { v11.h }[2], [x9]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
- "st1 { v23.h }[2], [x24]\n"
- "st1 { v27.h }[2], [x23]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
"b 243f\n"
"230:" // Height 5: Partial direct writeback: partial_1_24
"tbz x11, #0, 243f\n"
"str h11, [x9, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
- "str h23, [x24, #0x0]\n"
- "str h27, [x23, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
"b 243f\n"
"231:" // Height 5: Partial direct writeback: partial_4_16
"tbz x11, #2, 233f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x11, #1, 232f\n"
"st1 { v10.s }[2], [x9], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
- "st1 { v22.s }[2], [x24], #0x4\n"
- "st1 { v26.s }[2], [x23], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
"tbz x11, #0, 243f\n"
"st1 { v10.h }[6], [x9]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
- "st1 { v22.h }[6], [x24]\n"
- "st1 { v26.h }[6], [x23]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
"b 243f\n"
"232:" // Height 5: Partial direct writeback: partial_1_20
"tbz x11, #0, 243f\n"
"st1 { v10.h }[4], [x9]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
- "st1 { v22.h }[4], [x24]\n"
- "st1 { v26.h }[4], [x23]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
"b 243f\n"
"233:" // Height 5: Partial direct writeback: partial_2_16
"tbz x11, #1, 234f\n"
"str s10, [x9], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
- "str s22, [x24], #0x4\n"
- "str s26, [x23], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
"tbz x11, #0, 243f\n"
"st1 { v10.h }[2], [x9]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
- "st1 { v22.h }[2], [x24]\n"
- "st1 { v26.h }[2], [x23]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
"b 243f\n"
"234:" // Height 5: Partial direct writeback: partial_1_16
"tbz x11, #0, 243f\n"
"str h10, [x9, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
- "str h22, [x24, #0x0]\n"
- "str h26, [x23, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
"b 243f\n"
"235:" // Height 5: Partial direct writeback: partial_8_0
"tbz x11, #3, 239f\n"
"st1 { v8.8h }, [x9], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
"tbz x11, #2, 237f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x11, #1, 236f\n"
"st1 { v9.s }[2], [x9], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
- "st1 { v25.s }[2], [x23], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
"tbz x11, #0, 243f\n"
"st1 { v9.h }[6], [x9]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
- "st1 { v21.h }[6], [x24]\n"
- "st1 { v25.h }[6], [x23]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
"b 243f\n"
"236:" // Height 5: Partial direct writeback: partial_1_12
"tbz x11, #0, 243f\n"
"st1 { v9.h }[4], [x9]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
- "st1 { v21.h }[4], [x24]\n"
- "st1 { v25.h }[4], [x23]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
"b 243f\n"
"237:" // Height 5: Partial direct writeback: partial_2_8
"tbz x11, #1, 238f\n"
"str s9, [x9], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
- "str s21, [x24], #0x4\n"
- "str s25, [x23], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
"tbz x11, #0, 243f\n"
"st1 { v9.h }[2], [x9]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
- "st1 { v21.h }[2], [x24]\n"
- "st1 { v25.h }[2], [x23]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
"b 243f\n"
"238:" // Height 5: Partial direct writeback: partial_1_8
"tbz x11, #0, 243f\n"
"str h9, [x9, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
- "str h21, [x24, #0x0]\n"
- "str h25, [x23, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
"b 243f\n"
"239:" // Height 5: Partial direct writeback: partial_4_0
"tbz x11, #2, 241f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x11, #1, 240f\n"
"st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x11, #0, 243f\n"
"st1 { v8.h }[6], [x9]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
- "st1 { v20.h }[6], [x24]\n"
- "st1 { v24.h }[6], [x23]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
"b 243f\n"
"240:" // Height 5: Partial direct writeback: partial_1_4
"tbz x11, #0, 243f\n"
"st1 { v8.h }[4], [x9]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
- "st1 { v20.h }[4], [x24]\n"
- "st1 { v24.h }[4], [x23]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
"b 243f\n"
"241:" // Height 5: Partial direct writeback: partial_2_0
"tbz x11, #1, 242f\n"
"str s8, [x9], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x11, #0, 243f\n"
"st1 { v8.h }[2], [x9]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
- "st1 { v20.h }[2], [x24]\n"
- "st1 { v24.h }[2], [x23]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
"b 243f\n"
"242:" // Height 5: Partial direct writeback: partial_1_0
"str h8, [x9, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
- "str h20, [x24, #0x0]\n"
- "str h24, [x23, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
"243:" // Height 5: Partial direct writeback: Done
"b 245f\n"
"244:" // Height 5: Full writeback
@@ -3995,51 +3993,50 @@ void a64_hybrid_fp16_mla_6x32 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"245:" // Height 5: Writeback done
"subs x11, x11, #0x20\n"
"bgt 198b\n"
"b 296f\n"
"246:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0xc\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"247:" // Height 6: Column loop
"cbz x12, 248f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -4055,281 +4052,281 @@ void a64_hybrid_fp16_mla_6x32 (
"248:" // Height 6: no bias
"tbz %x[flags], #0, 266f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x20\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "cmp x11, #0x20\n"
+ "add x21, x22, x20, LSL #1\n"
"bge 265f\n"
"tbz x11, #4, 256f\n"
"ld1 { v8.8h }, [x9], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
- "ld1 { v28.8h }, [x22], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
"ld1 { v9.8h }, [x9], #0x10\n"
- "ld1 { v13.8h }, [x26], #0x10\n"
- "ld1 { v17.8h }, [x25], #0x10\n"
- "ld1 { v21.8h }, [x24], #0x10\n"
- "ld1 { v25.8h }, [x23], #0x10\n"
- "ld1 { v29.8h }, [x22], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
+ "ld1 { v29.8h }, [x21], #0x10\n"
"tbz x11, #3, 252f\n"
"ld1 { v10.8h }, [x9], #0x10\n"
- "ld1 { v14.8h }, [x26], #0x10\n"
- "ld1 { v18.8h }, [x25], #0x10\n"
- "ld1 { v22.8h }, [x24], #0x10\n"
- "ld1 { v26.8h }, [x23], #0x10\n"
- "ld1 { v30.8h }, [x22], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
+ "ld1 { v30.8h }, [x21], #0x10\n"
"tbz x11, #2, 250f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x11, #1, 249f\n"
"ld1 { v11.s }[2], [x9], #0x4\n"
- "ld1 { v15.s }[2], [x26], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
"mov x20, #0x3c\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz x11, #0, 264f\n"
"ld1 { v11.h }[6], [x9]\n"
- "ld1 { v15.h }[6], [x26]\n"
- "ld1 { v19.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
- "ld1 { v27.h }[6], [x23]\n"
- "ld1 { v31.h }[6], [x22]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
+ "ld1 { v31.h }[6], [x21]\n"
"b 264f\n"
"249:" // Height 6: Partial accumulate: partial_1_28
"mov x20, #0x38\n"
"tbz x11, #0, 264f\n"
"ld1 { v11.h }[4], [x9]\n"
- "ld1 { v15.h }[4], [x26]\n"
- "ld1 { v19.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
- "ld1 { v27.h }[4], [x23]\n"
- "ld1 { v31.h }[4], [x22]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
+ "ld1 { v31.h }[4], [x21]\n"
"b 264f\n"
"250:" // Height 6: Partial accumulate: partial_2_24
"tbz x11, #1, 251f\n"
"ldr s11, [x9], #0x4\n"
- "ldr s15, [x26], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
"mov x20, #0x34\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz x11, #0, 264f\n"
"ld1 { v11.h }[2], [x9]\n"
- "ld1 { v15.h }[2], [x26]\n"
- "ld1 { v19.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
- "ld1 { v27.h }[2], [x23]\n"
- "ld1 { v31.h }[2], [x22]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
+ "ld1 { v31.h }[2], [x21]\n"
"b 264f\n"
"251:" // Height 6: Partial accumulate: partial_1_24
"mov x20, #0x30\n"
"tbz x11, #0, 264f\n"
"ldr h11, [x9, #0x0]\n"
- "ldr h15, [x26, #0x0]\n"
- "ldr h19, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
- "ldr h31, [x22, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
+ "ldr h31, [x21, #0x0]\n"
"b 264f\n"
"252:" // Height 6: Partial accumulate: partial_4_16
"tbz x11, #2, 254f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x11, #1, 253f\n"
"ld1 { v10.s }[2], [x9], #0x4\n"
- "ld1 { v14.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
"mov x20, #0x2c\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
- "ld1 { v30.s }[2], [x22], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
+ "ld1 { v30.s }[2], [x21], #0x4\n"
"tbz x11, #0, 264f\n"
"ld1 { v10.h }[6], [x9]\n"
- "ld1 { v14.h }[6], [x26]\n"
- "ld1 { v18.h }[6], [x25]\n"
- "ld1 { v22.h }[6], [x24]\n"
- "ld1 { v26.h }[6], [x23]\n"
- "ld1 { v30.h }[6], [x22]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
+ "ld1 { v30.h }[6], [x21]\n"
"b 264f\n"
"253:" // Height 6: Partial accumulate: partial_1_20
"mov x20, #0x28\n"
"tbz x11, #0, 264f\n"
"ld1 { v10.h }[4], [x9]\n"
- "ld1 { v14.h }[4], [x26]\n"
- "ld1 { v18.h }[4], [x25]\n"
- "ld1 { v22.h }[4], [x24]\n"
- "ld1 { v26.h }[4], [x23]\n"
- "ld1 { v30.h }[4], [x22]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
+ "ld1 { v30.h }[4], [x21]\n"
"b 264f\n"
"254:" // Height 6: Partial accumulate: partial_2_16
"tbz x11, #1, 255f\n"
"ldr s10, [x9], #0x4\n"
- "ldr s14, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
"mov x20, #0x24\n"
- "ldr s18, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s30, [x22], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
+ "ldr s30, [x21], #0x4\n"
"tbz x11, #0, 264f\n"
"ld1 { v10.h }[2], [x9]\n"
- "ld1 { v14.h }[2], [x26]\n"
- "ld1 { v18.h }[2], [x25]\n"
- "ld1 { v22.h }[2], [x24]\n"
- "ld1 { v26.h }[2], [x23]\n"
- "ld1 { v30.h }[2], [x22]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
+ "ld1 { v30.h }[2], [x21]\n"
"b 264f\n"
"255:" // Height 6: Partial accumulate: partial_1_16
"mov x20, #0x20\n"
"tbz x11, #0, 264f\n"
"ldr h10, [x9, #0x0]\n"
- "ldr h14, [x26, #0x0]\n"
- "ldr h18, [x25, #0x0]\n"
- "ldr h22, [x24, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
- "ldr h30, [x22, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
+ "ldr h30, [x21, #0x0]\n"
"b 264f\n"
"256:" // Height 6: Partial accumulate: partial_8_0
"tbz x11, #3, 260f\n"
"ld1 { v8.8h }, [x9], #0x10\n"
- "ld1 { v12.8h }, [x26], #0x10\n"
- "ld1 { v16.8h }, [x25], #0x10\n"
- "ld1 { v20.8h }, [x24], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
- "ld1 { v28.8h }, [x22], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
"tbz x11, #2, 258f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x11, #1, 257f\n"
"ld1 { v9.s }[2], [x9], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
"mov x20, #0x1c\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v25.s }[2], [x23], #0x4\n"
- "ld1 { v29.s }[2], [x22], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v29.s }[2], [x21], #0x4\n"
"tbz x11, #0, 264f\n"
"ld1 { v9.h }[6], [x9]\n"
- "ld1 { v13.h }[6], [x26]\n"
- "ld1 { v17.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
- "ld1 { v25.h }[6], [x23]\n"
- "ld1 { v29.h }[6], [x22]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "ld1 { v29.h }[6], [x21]\n"
"b 264f\n"
"257:" // Height 6: Partial accumulate: partial_1_12
"mov x20, #0x18\n"
"tbz x11, #0, 264f\n"
"ld1 { v9.h }[4], [x9]\n"
- "ld1 { v13.h }[4], [x26]\n"
- "ld1 { v17.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
- "ld1 { v25.h }[4], [x23]\n"
- "ld1 { v29.h }[4], [x22]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "ld1 { v29.h }[4], [x21]\n"
"b 264f\n"
"258:" // Height 6: Partial accumulate: partial_2_8
"tbz x11, #1, 259f\n"
"ldr s9, [x9], #0x4\n"
- "ldr s13, [x26], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
"mov x20, #0x14\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
- "ldr s29, [x22], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s29, [x21], #0x4\n"
"tbz x11, #0, 264f\n"
"ld1 { v9.h }[2], [x9]\n"
- "ld1 { v13.h }[2], [x26]\n"
- "ld1 { v17.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
- "ld1 { v25.h }[2], [x23]\n"
- "ld1 { v29.h }[2], [x22]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "ld1 { v29.h }[2], [x21]\n"
"b 264f\n"
"259:" // Height 6: Partial accumulate: partial_1_8
"mov x20, #0x10\n"
"tbz x11, #0, 264f\n"
"ldr h9, [x9, #0x0]\n"
- "ldr h13, [x26, #0x0]\n"
- "ldr h17, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
- "ldr h29, [x22, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "ldr h29, [x21, #0x0]\n"
"b 264f\n"
"260:" // Height 6: Partial accumulate: partial_4_0
"tbz x11, #2, 262f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x11, #1, 261f\n"
"ld1 { v8.s }[2], [x9], #0x4\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
"mov x20, #0xc\n"
- "ld1 { v16.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
"tbz x11, #0, 264f\n"
"ld1 { v8.h }[6], [x9]\n"
- "ld1 { v12.h }[6], [x26]\n"
- "ld1 { v16.h }[6], [x25]\n"
- "ld1 { v20.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
- "ld1 { v28.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v28.h }[6], [x21]\n"
"b 264f\n"
"261:" // Height 6: Partial accumulate: partial_1_4
"mov x20, #0x8\n"
"tbz x11, #0, 264f\n"
"ld1 { v8.h }[4], [x9]\n"
- "ld1 { v12.h }[4], [x26]\n"
- "ld1 { v16.h }[4], [x25]\n"
- "ld1 { v20.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
- "ld1 { v28.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v28.h }[4], [x21]\n"
"b 264f\n"
"262:" // Height 6: Partial accumulate: partial_2_0
"tbz x11, #1, 263f\n"
"ldr s8, [x9], #0x4\n"
- "ldr s12, [x26], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
"mov x20, #0x4\n"
- "ldr s16, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
"tbz x11, #0, 264f\n"
"ld1 { v8.h }[2], [x9]\n"
- "ld1 { v12.h }[2], [x26]\n"
- "ld1 { v16.h }[2], [x25]\n"
- "ld1 { v20.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "ld1 { v28.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v28.h }[2], [x21]\n"
"b 264f\n"
"263:" // Height 6: Partial accumulate: partial_1_0
"ldr h8, [x9, #0x0]\n"
- "ldr h12, [x26, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr h16, [x25, #0x0]\n"
- "ldr h20, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "ldr h28, [x22, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h28, [x21, #0x0]\n"
"264:" // Height 6: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 267f\n"
@@ -4338,26 +4335,26 @@ void a64_hybrid_fp16_mla_6x32 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x22, #0x0]\n"
- "ldr q29, [x22, #0x10]\n"
- "ldr q30, [x22, #0x20]\n"
- "ldr q31, [x22, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"b 267f\n"
"266:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
@@ -4388,8 +4385,8 @@ void a64_hybrid_fp16_mla_6x32 (
"mov x28, #0x0\n"
"268:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 269f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -4691,18 +4688,18 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
"ldr q7, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
- "sub x27, x27, #0x8\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
@@ -4958,21 +4955,21 @@ void a64_hybrid_fp16_mla_6x32 (
"cmp x28, x20\n"
"bne 268b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #1\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #1\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"add x23, x24, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x21, x22, x20, LSL #1\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 276f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.8h }, [x21]\n"
"ld1r { v0.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v1.8h\n"
"fmin v9.8h, v9.8h, v1.8h\n"
@@ -5028,255 +5025,255 @@ void a64_hybrid_fp16_mla_6x32 (
"tbz x11, #4, 284f\n"
"st1 { v8.8h }, [x9], #0x10\n"
"st1 { v9.8h }, [x9], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v13.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v17.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v21.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v25.8h }, [x23], #0x10\n"
- "st1 { v28.8h }, [x22], #0x10\n"
- "st1 { v29.8h }, [x22], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "st1 { v29.8h }, [x21], #0x10\n"
"tbz x11, #3, 280f\n"
"st1 { v10.8h }, [x9], #0x10\n"
- "st1 { v14.8h }, [x26], #0x10\n"
- "st1 { v18.8h }, [x25], #0x10\n"
- "st1 { v22.8h }, [x24], #0x10\n"
- "st1 { v26.8h }, [x23], #0x10\n"
- "st1 { v30.8h }, [x22], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
+ "st1 { v30.8h }, [x21], #0x10\n"
"tbz x11, #2, 278f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x11, #1, 277f\n"
"st1 { v11.s }[2], [x9], #0x4\n"
- "st1 { v15.s }[2], [x26], #0x4\n"
- "st1 { v19.s }[2], [x25], #0x4\n"
- "st1 { v23.s }[2], [x24], #0x4\n"
- "st1 { v27.s }[2], [x23], #0x4\n"
- "st1 { v31.s }[2], [x22], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
+ "st1 { v31.s }[2], [x21], #0x4\n"
"tbz x11, #0, 292f\n"
"st1 { v11.h }[6], [x9]\n"
- "st1 { v15.h }[6], [x26]\n"
- "st1 { v19.h }[6], [x25]\n"
- "st1 { v23.h }[6], [x24]\n"
- "st1 { v27.h }[6], [x23]\n"
- "st1 { v31.h }[6], [x22]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
+ "st1 { v31.h }[6], [x21]\n"
"b 292f\n"
"277:" // Height 6: Partial direct writeback: partial_1_28
"tbz x11, #0, 292f\n"
"st1 { v11.h }[4], [x9]\n"
- "st1 { v15.h }[4], [x26]\n"
- "st1 { v19.h }[4], [x25]\n"
- "st1 { v23.h }[4], [x24]\n"
- "st1 { v27.h }[4], [x23]\n"
- "st1 { v31.h }[4], [x22]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
+ "st1 { v31.h }[4], [x21]\n"
"b 292f\n"
"278:" // Height 6: Partial direct writeback: partial_2_24
"tbz x11, #1, 279f\n"
"str s11, [x9], #0x4\n"
- "str s15, [x26], #0x4\n"
- "str s19, [x25], #0x4\n"
- "str s23, [x24], #0x4\n"
- "str s27, [x23], #0x4\n"
- "str s31, [x22], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
+ "str s31, [x21], #0x4\n"
"tbz x11, #0, 292f\n"
"st1 { v11.h }[2], [x9]\n"
- "st1 { v15.h }[2], [x26]\n"
- "st1 { v19.h }[2], [x25]\n"
- "st1 { v23.h }[2], [x24]\n"
- "st1 { v27.h }[2], [x23]\n"
- "st1 { v31.h }[2], [x22]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "st1 { v31.h }[2], [x21]\n"
"b 292f\n"
"279:" // Height 6: Partial direct writeback: partial_1_24
"tbz x11, #0, 292f\n"
"str h11, [x9, #0x0]\n"
- "str h15, [x26, #0x0]\n"
- "str h19, [x25, #0x0]\n"
- "str h23, [x24, #0x0]\n"
- "str h27, [x23, #0x0]\n"
- "str h31, [x22, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
+ "str h31, [x21, #0x0]\n"
"b 292f\n"
"280:" // Height 6: Partial direct writeback: partial_4_16
"tbz x11, #2, 282f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x11, #1, 281f\n"
"st1 { v10.s }[2], [x9], #0x4\n"
- "st1 { v14.s }[2], [x26], #0x4\n"
- "st1 { v18.s }[2], [x25], #0x4\n"
- "st1 { v22.s }[2], [x24], #0x4\n"
- "st1 { v26.s }[2], [x23], #0x4\n"
- "st1 { v30.s }[2], [x22], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
"tbz x11, #0, 292f\n"
"st1 { v10.h }[6], [x9]\n"
- "st1 { v14.h }[6], [x26]\n"
- "st1 { v18.h }[6], [x25]\n"
- "st1 { v22.h }[6], [x24]\n"
- "st1 { v26.h }[6], [x23]\n"
- "st1 { v30.h }[6], [x22]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
"b 292f\n"
"281:" // Height 6: Partial direct writeback: partial_1_20
"tbz x11, #0, 292f\n"
"st1 { v10.h }[4], [x9]\n"
- "st1 { v14.h }[4], [x26]\n"
- "st1 { v18.h }[4], [x25]\n"
- "st1 { v22.h }[4], [x24]\n"
- "st1 { v26.h }[4], [x23]\n"
- "st1 { v30.h }[4], [x22]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
"b 292f\n"
"282:" // Height 6: Partial direct writeback: partial_2_16
"tbz x11, #1, 283f\n"
"str s10, [x9], #0x4\n"
- "str s14, [x26], #0x4\n"
- "str s18, [x25], #0x4\n"
- "str s22, [x24], #0x4\n"
- "str s26, [x23], #0x4\n"
- "str s30, [x22], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
+ "str s30, [x21], #0x4\n"
"tbz x11, #0, 292f\n"
"st1 { v10.h }[2], [x9]\n"
- "st1 { v14.h }[2], [x26]\n"
- "st1 { v18.h }[2], [x25]\n"
- "st1 { v22.h }[2], [x24]\n"
- "st1 { v26.h }[2], [x23]\n"
- "st1 { v30.h }[2], [x22]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
"b 292f\n"
"283:" // Height 6: Partial direct writeback: partial_1_16
"tbz x11, #0, 292f\n"
"str h10, [x9, #0x0]\n"
- "str h14, [x26, #0x0]\n"
- "str h18, [x25, #0x0]\n"
- "str h22, [x24, #0x0]\n"
- "str h26, [x23, #0x0]\n"
- "str h30, [x22, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
+ "str h30, [x21, #0x0]\n"
"b 292f\n"
"284:" // Height 6: Partial direct writeback: partial_8_0
"tbz x11, #3, 288f\n"
"st1 { v8.8h }, [x9], #0x10\n"
- "st1 { v12.8h }, [x26], #0x10\n"
- "st1 { v16.8h }, [x25], #0x10\n"
- "st1 { v20.8h }, [x24], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v28.8h }, [x22], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
"tbz x11, #2, 286f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x11, #1, 285f\n"
"st1 { v9.s }[2], [x9], #0x4\n"
- "st1 { v13.s }[2], [x26], #0x4\n"
- "st1 { v17.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
- "st1 { v25.s }[2], [x23], #0x4\n"
- "st1 { v29.s }[2], [x22], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
+ "st1 { v29.s }[2], [x21], #0x4\n"
"tbz x11, #0, 292f\n"
"st1 { v9.h }[6], [x9]\n"
- "st1 { v13.h }[6], [x26]\n"
- "st1 { v17.h }[6], [x25]\n"
- "st1 { v21.h }[6], [x24]\n"
- "st1 { v25.h }[6], [x23]\n"
- "st1 { v29.h }[6], [x22]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "st1 { v29.h }[6], [x21]\n"
"b 292f\n"
"285:" // Height 6: Partial direct writeback: partial_1_12
"tbz x11, #0, 292f\n"
"st1 { v9.h }[4], [x9]\n"
- "st1 { v13.h }[4], [x26]\n"
- "st1 { v17.h }[4], [x25]\n"
- "st1 { v21.h }[4], [x24]\n"
- "st1 { v25.h }[4], [x23]\n"
- "st1 { v29.h }[4], [x22]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "st1 { v29.h }[4], [x21]\n"
"b 292f\n"
"286:" // Height 6: Partial direct writeback: partial_2_8
"tbz x11, #1, 287f\n"
"str s9, [x9], #0x4\n"
- "str s13, [x26], #0x4\n"
- "str s17, [x25], #0x4\n"
- "str s21, [x24], #0x4\n"
- "str s25, [x23], #0x4\n"
- "str s29, [x22], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
+ "str s29, [x21], #0x4\n"
"tbz x11, #0, 292f\n"
"st1 { v9.h }[2], [x9]\n"
- "st1 { v13.h }[2], [x26]\n"
- "st1 { v17.h }[2], [x25]\n"
- "st1 { v21.h }[2], [x24]\n"
- "st1 { v25.h }[2], [x23]\n"
- "st1 { v29.h }[2], [x22]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "st1 { v29.h }[2], [x21]\n"
"b 292f\n"
"287:" // Height 6: Partial direct writeback: partial_1_8
"tbz x11, #0, 292f\n"
"str h9, [x9, #0x0]\n"
- "str h13, [x26, #0x0]\n"
- "str h17, [x25, #0x0]\n"
- "str h21, [x24, #0x0]\n"
- "str h25, [x23, #0x0]\n"
- "str h29, [x22, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
+ "str h29, [x21, #0x0]\n"
"b 292f\n"
"288:" // Height 6: Partial direct writeback: partial_4_0
"tbz x11, #2, 290f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x11, #1, 289f\n"
"st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x22], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x11, #0, 292f\n"
"st1 { v8.h }[6], [x9]\n"
- "st1 { v12.h }[6], [x26]\n"
- "st1 { v16.h }[6], [x25]\n"
- "st1 { v20.h }[6], [x24]\n"
- "st1 { v24.h }[6], [x23]\n"
- "st1 { v28.h }[6], [x22]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
+ "st1 { v28.h }[6], [x21]\n"
"b 292f\n"
"289:" // Height 6: Partial direct writeback: partial_1_4
"tbz x11, #0, 292f\n"
"st1 { v8.h }[4], [x9]\n"
- "st1 { v12.h }[4], [x26]\n"
- "st1 { v16.h }[4], [x25]\n"
- "st1 { v20.h }[4], [x24]\n"
- "st1 { v24.h }[4], [x23]\n"
- "st1 { v28.h }[4], [x22]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
+ "st1 { v28.h }[4], [x21]\n"
"b 292f\n"
"290:" // Height 6: Partial direct writeback: partial_2_0
"tbz x11, #1, 291f\n"
"str s8, [x9], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x22], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x11, #0, 292f\n"
"st1 { v8.h }[2], [x9]\n"
- "st1 { v12.h }[2], [x26]\n"
- "st1 { v16.h }[2], [x25]\n"
- "st1 { v20.h }[2], [x24]\n"
- "st1 { v24.h }[2], [x23]\n"
- "st1 { v28.h }[2], [x22]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
+ "st1 { v28.h }[2], [x21]\n"
"b 292f\n"
"291:" // Height 6: Partial direct writeback: partial_1_0
"str h8, [x9, #0x0]\n"
- "str h12, [x26, #0x0]\n"
- "str h16, [x25, #0x0]\n"
- "str h20, [x24, #0x0]\n"
- "str h24, [x23, #0x0]\n"
- "str h28, [x22, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
+ "str h28, [x21, #0x0]\n"
"292:" // Height 6: Partial direct writeback: Done
"b 294f\n"
"293:" // Height 6: Full writeback
@@ -5285,26 +5282,26 @@ void a64_hybrid_fp16_mla_6x32 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x22, #0x0]\n"
- "str q29, [x22, #0x10]\n"
- "str q30, [x22, #0x20]\n"
- "str q31, [x22, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
"294:" // Height 6: Writeback done
"subs x11, x11, #0x20\n"
"bgt 247b\n"
@@ -5320,8 +5317,8 @@ void a64_hybrid_fp16_mla_6x32 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"296:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
index 7a025a5deb..bce4de74f7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
@@ -71,8 +71,7 @@ public:
return true;
}
- StdTransformsFixedTRB<lhs_operand_type, rhs_operand_type, result_type, 4, 24, 1> transforms = {};
-
+ StdTransformsFixedTRB<rhs_operand_type, result_type, 4, 24, 1> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
index 23587e6317..52c4e1be65 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
@@ -47,19 +47,18 @@ void a64_hybrid_fp32_mla_4x24_a55 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void a64_hybrid_fp32_mla_4x24_a55 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -100,10 +98,10 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"cmp %x[M], #0x2\n"
"bgt 83f\n"
"beq 42f\n"
- "ldr x17, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x17, %x[bias]\n"
"ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x17, 3f\n"
"ldr q8, [x17, #0x0]\n"
@@ -223,8 +221,8 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"mov x13, #0x0\n"
"19:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
"ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -256,50 +254,47 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v11.4s, v7.4s, v0.s[0]\n"
"ldr d16, [x15, #0x70]\n"
"mov v19.d[1], x20\n"
- "ldr x22, [x15, #0x58]\n"
- "add x11, x11, #0x10\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v18.d[1], x20\n"
"ldr x20, [x15, #0x68]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x78]\n"
+ "mov v16.d[1], x20\n"
"fmla v12.4s, v19.4s, v0.s[0]\n"
"ldr d19, [x15, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "ldr x21, [x15, #0x78]\n"
- "mov v18.d[1], x22\n"
- "mov v17.d[1], x20\n"
"ldr x20, [x15, #0x88]\n"
"fmla v13.4s, v18.4s, v0.s[0]\n"
"ldr d18, [x15, #0x90]\n"
- "mov v16.d[1], x21\n"
- "ldr x22, [x15, #0x98]\n"
"fmla v8.4s, v17.4s, v0.s[1]\n"
"ldr d17, [x15, #0xa0]\n"
- "ldr x21, [x15, #0xa8]\n"
"fmla v9.4s, v16.4s, v0.s[1]\n"
"ldr d16, [x15, #0xb0]\n"
"mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0x98]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x15, #0xb8]\n"
- "mov v18.d[1], x22\n"
- "mov v17.d[1], x21\n"
+ "mov v16.d[1], x20\n"
"fmla v10.4s, v19.4s, v0.s[1]\n"
"ldr d19, [x15, #0xc0]\n"
+ "ldr x20, [x15, #0xc8]\n"
"fmla v11.4s, v18.4s, v0.s[1]\n"
"ldr d18, [x15, #0xd0]\n"
- "mov v16.d[1], x20\n"
- "ldr x20, [x15, #0xc8]\n"
"fmla v12.4s, v17.4s, v0.s[1]\n"
"ldr d17, [x15, #0xe0]\n"
"fmla v13.4s, v16.4s, v0.s[1]\n"
"ldr d16, [x15, #0xf0]\n"
- "cmp x12, #0x8\n"
- "ldr x22, [x15, #0xd8]\n"
"mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "mov v18.d[1], x20\n"
"ldr x20, [x15, #0xe8]\n"
- "ldr x21, [x15, #0xf8]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xf8]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.4s, v19.4s, v0.s[2]\n"
"ldr d19, [x15, #0x100]\n"
- "mov v18.d[1], x22\n"
- "mov v17.d[1], x20\n"
"ldr x20, [x15, #0x108]\n"
- "mov v16.d[1], x21\n"
"fmla v9.4s, v18.4s, v0.s[2]\n"
"ldr d18, [x15, #0x110]\n"
"fmla v10.4s, v17.4s, v0.s[2]\n"
@@ -307,49 +302,52 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v11.4s, v16.4s, v0.s[2]\n"
"ldr d16, [x15, #0x130]\n"
"mov v19.d[1], x20\n"
- "ldr x22, [x15, #0x118]\n"
+ "ldr x20, [x15, #0x118]\n"
+ "mov v18.d[1], x20\n"
"ldr x20, [x15, #0x128]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x138]\n"
+ "mov v16.d[1], x20\n"
"fmla v12.4s, v19.4s, v0.s[2]\n"
"ldr d19, [x15, #0x140]\n"
- "ldr x21, [x15, #0x138]\n"
- "mov v18.d[1], x22\n"
- "mov v17.d[1], x20\n"
"ldr x20, [x15, #0x148]\n"
"fmla v13.4s, v18.4s, v0.s[2]\n"
"ldr d18, [x15, #0x150]\n"
- "mov v16.d[1], x21\n"
- "ldr x22, [x15, #0x158]\n"
"fmla v8.4s, v17.4s, v0.s[3]\n"
"ldr d17, [x15, #0x160]\n"
- "ldr x21, [x15, #0x168]\n"
"fmla v9.4s, v16.4s, v0.s[3]\n"
"ldr d16, [x15, #0x170]\n"
"mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0x158]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0x168]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x15, #0x178]\n"
- "mov v18.d[1], x22\n"
- "mov v17.d[1], x21\n"
+ "mov v16.d[1], x20\n"
+ "add x11, x11, #0x10\n"
"add x15, x15, #0x180\n"
"fmla v10.4s, v19.4s, v0.s[3]\n"
"ldr d4, [x15, #0x0]\n"
- "mov v16.d[1], x20\n"
- "ldr x21, [x15, #0x8]\n"
+ "ldr x20, [x15, #0x8]\n"
"fmla v11.4s, v18.4s, v0.s[3]\n"
"ldr d5, [x15, #0x10]\n"
- "ldr x20, [x15, #0x18]\n"
"fmla v12.4s, v17.4s, v0.s[3]\n"
"ldr d6, [x15, #0x20]\n"
"fmla v13.4s, v16.4s, v0.s[3]\n"
"ldr d0, [x11, #0x0]\n"
- "mov v4.d[1], x21\n"
+ "sub x12, x12, #0x4\n"
"ldr d7, [x15, #0x30]\n"
- "mov v5.d[1], x20\n"
- "ldr x22, [x15, #0x28]\n"
+ "cmp x12, #0x8\n"
+ "ldr x21, [x15, #0x18]\n"
+ "mov v4.d[1], x20\n"
+ "ldr x20, [x15, #0x28]\n"
+ "mov v5.d[1], x21\n"
"ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
"ldr x20, [x15, #0x38]\n"
- "mov v6.d[1], x22\n"
- "prfm pldl1keep, [x11, #0x80]\n"
"mov v0.d[1], x21\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"bge 22b\n"
"23:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v4.4s, v0.s[0]\n"
@@ -403,21 +401,21 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"24:" // Height 1: Multiply loop: Main loop skip
"cbz x12, 26f\n"
"25:" // Height 1: Multiply loop: Odd block loop
- "ldr s20, [x11], #0x4\n"
+ "ldr s17, [x11], #0x4\n"
"sub x12, x12, #0x1\n"
- "ldr q17, [x15, #0x0]\n"
+ "ldr q16, [x15, #0x0]\n"
+ "fmla v8.4s, v16.4s, v17.s[0]\n"
"ldr q16, [x15, #0x10]\n"
- "ldr q19, [x15, #0x20]\n"
- "ldr q18, [x15, #0x30]\n"
- "fmla v8.4s, v17.4s, v20.s[0]\n"
- "ldr q17, [x15, #0x40]\n"
- "fmla v9.4s, v16.4s, v20.s[0]\n"
+ "fmla v9.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x20]\n"
+ "fmla v10.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ "fmla v11.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x40]\n"
+ "fmla v12.4s, v16.4s, v17.s[0]\n"
"ldr q16, [x15, #0x50]\n"
- "fmla v10.4s, v19.4s, v20.s[0]\n"
- "fmla v11.4s, v18.4s, v20.s[0]\n"
+ "fmla v13.4s, v16.4s, v17.s[0]\n"
"add x15, x15, #0x60\n"
- "fmla v12.4s, v17.4s, v20.s[0]\n"
- "fmla v13.4s, v16.4s, v20.s[0]\n"
"cbnz x12, 25b\n"
"26:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -426,16 +424,16 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"bne 19b\n"
"prfm pstl1keep, [x14, #0x0]\n"
"tbz %x[flags], #1, 27f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v16.4s\n"
+ "fmin v9.4s, v9.4s, v16.4s\n"
+ "fmin v10.4s, v10.4s, v16.4s\n"
+ "fmin v11.4s, v11.4s, v16.4s\n"
+ "fmin v12.4s, v12.4s, v16.4s\n"
+ "fmin v13.4s, v13.4s, v16.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v17.4s\n"
- "fmin v9.4s, v9.4s, v17.4s\n"
- "fmin v10.4s, v10.4s, v17.4s\n"
- "fmin v11.4s, v11.4s, v17.4s\n"
- "fmin v12.4s, v12.4s, v17.4s\n"
- "fmin v13.4s, v13.4s, v17.4s\n"
"fmax v8.4s, v8.4s, v16.4s\n"
"fmax v9.4s, v9.4s, v16.4s\n"
"fmax v10.4s, v10.4s, v16.4s\n"
@@ -531,142 +529,142 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"bgt 2b\n"
"b 166f\n"
"42:" // Height 2
- "ldr x17, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x17, %x[bias]\n"
"ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
"43:" // Height 2: Column loop
"cbz x17, 44f\n"
"ldr q8, [x17, #0x0]\n"
- "ldr q9, [x17, #0x10]\n"
- "ldr q10, [x17, #0x20]\n"
- "ldr q11, [x17, #0x30]\n"
"mov v14.16b, v8.16b\n"
- "ldr q12, [x17, #0x40]\n"
+ "ldr q9, [x17, #0x10]\n"
"mov v15.16b, v9.16b\n"
- "ldr q13, [x17, #0x50]\n"
+ "ldr q10, [x17, #0x20]\n"
"mov v16.16b, v10.16b\n"
+ "ldr q11, [x17, #0x30]\n"
"mov v17.16b, v11.16b\n"
- "add x17, x17, #0x60\n"
+ "ldr q12, [x17, #0x40]\n"
"mov v18.16b, v12.16b\n"
+ "ldr q13, [x17, #0x50]\n"
"mov v19.16b, v13.16b\n"
+ "add x17, x17, #0x60\n"
"b 59f\n"
"44:" // Height 2: no bias
"tbz %x[flags], #0, 58f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x16, #0x18\n"
- "add x24, x14, x20, LSL #2\n"
+ "add x23, x14, x20, LSL #2\n"
"bge 57f\n"
"tbz x16, #4, 48f\n"
"ld1 { v8.4s }, [x14], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"ld1 { v9.4s }, [x14], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
"ld1 { v10.4s }, [x14], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v11.4s }, [x14], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
"tbz x16, #2, 46f\n"
"ld1 { v12.4s }, [x14], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
"tbz x16, #1, 45f\n"
"ldr d13, [x14], #0x8\n"
"mov x20, #0x58\n"
- "ldr d19, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"tbz x16, #0, 56f\n"
"ld1 { v13.s }[2], [x14]\n"
- "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
"b 56f\n"
"45:" // Height 2: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x16, #0, 56f\n"
"ldr s13, [x14, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
"b 56f\n"
"46:" // Height 2: Partial accumulate: partial_2_16
"tbz x16, #1, 47f\n"
"ldr d12, [x14], #0x8\n"
"mov x20, #0x48\n"
- "ldr d18, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"tbz x16, #0, 56f\n"
"ld1 { v12.s }[2], [x14]\n"
- "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
"b 56f\n"
"47:" // Height 2: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x16, #0, 56f\n"
"ldr s12, [x14, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
"b 56f\n"
"48:" // Height 2: Partial accumulate: partial_8_0
"tbz x16, #3, 52f\n"
"ld1 { v8.4s }, [x14], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"ld1 { v9.4s }, [x14], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
"tbz x16, #2, 50f\n"
"ld1 { v10.4s }, [x14], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
"tbz x16, #1, 49f\n"
"ldr d11, [x14], #0x8\n"
"mov x20, #0x38\n"
- "ldr d17, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
"tbz x16, #0, 56f\n"
"ld1 { v11.s }[2], [x14]\n"
- "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
"b 56f\n"
"49:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x16, #0, 56f\n"
"ldr s11, [x14, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
"b 56f\n"
"50:" // Height 2: Partial accumulate: partial_2_8
"tbz x16, #1, 51f\n"
"ldr d10, [x14], #0x8\n"
"mov x20, #0x28\n"
- "ldr d16, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"tbz x16, #0, 56f\n"
"ld1 { v10.s }[2], [x14]\n"
- "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
"b 56f\n"
"51:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x16, #0, 56f\n"
"ldr s10, [x14, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
"b 56f\n"
"52:" // Height 2: Partial accumulate: partial_4_0
"tbz x16, #2, 54f\n"
"ld1 { v8.4s }, [x14], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"tbz x16, #1, 53f\n"
"ldr d9, [x14], #0x8\n"
"mov x20, #0x18\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
"tbz x16, #0, 56f\n"
"ld1 { v9.s }[2], [x14]\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x23]\n"
"b 56f\n"
"53:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x16, #0, 56f\n"
"ldr s9, [x14, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
"b 56f\n"
"54:" // Height 2: Partial accumulate: partial_2_0
"tbz x16, #1, 55f\n"
"ldr d8, [x14], #0x8\n"
"mov x20, #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
"tbz x16, #0, 56f\n"
"ld1 { v8.s }[2], [x14]\n"
- "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x23]\n"
"b 56f\n"
"55:" // Height 2: Partial accumulate: partial_1_0
"ldr s8, [x14, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
"56:" // Height 2: Partial accumulate: Done
"sub x14, x14, x20\n"
"b 59f\n"
@@ -677,12 +675,12 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"ldr q11, [x14, #0x30]\n"
"ldr q12, [x14, #0x40]\n"
"ldr q13, [x14, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
"b 59f\n"
"58:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -701,8 +699,8 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"mov x13, #0x0\n"
"60:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
"ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -745,15 +743,15 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v17.4s, v7.4s, v1.s[0]\n"
"ldr d20, [x15, #0x70]\n"
"mov v23.d[1], x23\n"
- "ldr x23, [x15, #0x88]\n"
- "mov v22.d[1], x22\n"
- "ldr x22, [x15, #0x98]\n"
- "mov v21.d[1], x21\n"
"fmla v12.4s, v23.4s, v0.s[0]\n"
- "mov v20.d[1], x20\n"
+ "mov v22.d[1], x22\n"
"fmla v18.4s, v23.4s, v1.s[0]\n"
"ldr d23, [x15, #0x80]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0x88]\n"
"fmla v13.4s, v22.4s, v0.s[0]\n"
+ "ldr x22, [x15, #0x98]\n"
"fmla v19.4s, v22.4s, v1.s[0]\n"
"ldr d22, [x15, #0x90]\n"
"fmla v8.4s, v21.4s, v0.s[1]\n"
@@ -765,15 +763,15 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v15.4s, v20.4s, v1.s[1]\n"
"ldr d20, [x15, #0xb0]\n"
"mov v23.d[1], x23\n"
- "ldr x23, [x15, #0xc8]\n"
- "mov v22.d[1], x22\n"
- "ldr x22, [x15, #0xd8]\n"
- "mov v21.d[1], x21\n"
"fmla v10.4s, v23.4s, v0.s[1]\n"
- "mov v20.d[1], x20\n"
+ "mov v22.d[1], x22\n"
"fmla v16.4s, v23.4s, v1.s[1]\n"
"ldr d23, [x15, #0xc0]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0xc8]\n"
"fmla v11.4s, v22.4s, v0.s[1]\n"
+ "ldr x22, [x15, #0xd8]\n"
"fmla v17.4s, v22.4s, v1.s[1]\n"
"ldr d22, [x15, #0xd0]\n"
"fmla v12.4s, v21.4s, v0.s[1]\n"
@@ -785,15 +783,15 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v19.4s, v20.4s, v1.s[1]\n"
"ldr d20, [x15, #0xf0]\n"
"mov v23.d[1], x23\n"
- "ldr x23, [x15, #0x108]\n"
- "mov v22.d[1], x22\n"
- "ldr x22, [x15, #0x118]\n"
- "mov v21.d[1], x21\n"
"fmla v8.4s, v23.4s, v0.s[2]\n"
- "mov v20.d[1], x20\n"
+ "mov v22.d[1], x22\n"
"fmla v14.4s, v23.4s, v1.s[2]\n"
"ldr d23, [x15, #0x100]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0x108]\n"
"fmla v9.4s, v22.4s, v0.s[2]\n"
+ "ldr x22, [x15, #0x118]\n"
"fmla v15.4s, v22.4s, v1.s[2]\n"
"ldr d22, [x15, #0x110]\n"
"fmla v10.4s, v21.4s, v0.s[2]\n"
@@ -805,15 +803,15 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v17.4s, v20.4s, v1.s[2]\n"
"ldr d20, [x15, #0x130]\n"
"mov v23.d[1], x23\n"
- "ldr x23, [x15, #0x148]\n"
- "mov v22.d[1], x22\n"
- "ldr x22, [x15, #0x158]\n"
- "mov v21.d[1], x21\n"
"fmla v12.4s, v23.4s, v0.s[2]\n"
- "mov v20.d[1], x20\n"
+ "mov v22.d[1], x22\n"
"fmla v18.4s, v23.4s, v1.s[2]\n"
"ldr d23, [x15, #0x140]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0x148]\n"
"fmla v13.4s, v22.4s, v0.s[2]\n"
+ "ldr x22, [x15, #0x158]\n"
"fmla v19.4s, v22.4s, v1.s[2]\n"
"ldr d22, [x15, #0x150]\n"
"fmla v8.4s, v21.4s, v0.s[3]\n"
@@ -944,19 +942,19 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"sub x12, x12, #0x1\n"
"ldr s24, [x10], #0x4\n"
"ldr q21, [x15, #0x0]\n"
- "ldr q20, [x15, #0x10]\n"
- "ldr q23, [x15, #0x20]\n"
- "ldr q22, [x15, #0x30]\n"
"fmla v8.4s, v21.4s, v25.s[0]\n"
+ "ldr q20, [x15, #0x10]\n"
"fmla v14.4s, v21.4s, v24.s[0]\n"
- "ldr q21, [x15, #0x40]\n"
+ "ldr q23, [x15, #0x20]\n"
"fmla v9.4s, v20.4s, v25.s[0]\n"
+ "ldr q22, [x15, #0x30]\n"
"fmla v15.4s, v20.4s, v24.s[0]\n"
- "ldr q20, [x15, #0x50]\n"
+ "ldr q21, [x15, #0x40]\n"
"fmla v10.4s, v23.4s, v25.s[0]\n"
- "add x15, x15, #0x60\n"
+ "ldr q20, [x15, #0x50]\n"
"fmla v16.4s, v23.4s, v24.s[0]\n"
"fmla v11.4s, v22.4s, v25.s[0]\n"
+ "add x15, x15, #0x60\n"
"fmla v17.4s, v22.4s, v24.s[0]\n"
"fmla v12.4s, v21.4s, v25.s[0]\n"
"fmla v18.4s, v21.4s, v24.s[0]\n"
@@ -969,26 +967,26 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"cmp x13, x20\n"
"bne 60b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x14, x20, LSL #2\n"
"prfm pstl1keep, [x14, #0x0]\n"
- "add x24, x14, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 68f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v20.4s\n"
+ "fmin v9.4s, v9.4s, v20.4s\n"
+ "fmin v10.4s, v10.4s, v20.4s\n"
+ "fmin v11.4s, v11.4s, v20.4s\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
+ "fmin v15.4s, v15.4s, v20.4s\n"
+ "fmin v16.4s, v16.4s, v20.4s\n"
+ "fmin v17.4s, v17.4s, v20.4s\n"
+ "fmin v18.4s, v18.4s, v20.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v21.4s }, [x21]\n"
"ld1r { v20.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v21.4s\n"
- "fmin v9.4s, v9.4s, v21.4s\n"
- "fmin v10.4s, v10.4s, v21.4s\n"
- "fmin v11.4s, v11.4s, v21.4s\n"
- "fmin v12.4s, v12.4s, v21.4s\n"
- "fmin v13.4s, v13.4s, v21.4s\n"
- "fmin v14.4s, v14.4s, v21.4s\n"
- "fmin v15.4s, v15.4s, v21.4s\n"
- "fmin v16.4s, v16.4s, v21.4s\n"
- "fmin v17.4s, v17.4s, v21.4s\n"
- "fmin v18.4s, v18.4s, v21.4s\n"
- "fmin v19.4s, v19.4s, v21.4s\n"
"fmax v8.4s, v8.4s, v20.4s\n"
"fmax v9.4s, v9.4s, v20.4s\n"
"fmax v10.4s, v10.4s, v20.4s\n"
@@ -1009,99 +1007,99 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"st1 { v9.4s }, [x14], #0x10\n"
"st1 { v10.4s }, [x14], #0x10\n"
"st1 { v11.4s }, [x14], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
"tbz x16, #2, 70f\n"
"st1 { v12.4s }, [x14], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
"tbz x16, #1, 69f\n"
"str d13, [x14], #0x8\n"
- "str d19, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
"tbz x16, #0, 80f\n"
"st1 { v13.s }[2], [x14]\n"
- "st1 { v19.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
"b 80f\n"
"69:" // Height 2: Partial direct writeback: partial_1_20
"tbz x16, #0, 80f\n"
"str s13, [x14, #0x0]\n"
- "str s19, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
"b 80f\n"
"70:" // Height 2: Partial direct writeback: partial_2_16
"tbz x16, #1, 71f\n"
"str d12, [x14], #0x8\n"
- "str d18, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
"tbz x16, #0, 80f\n"
"st1 { v12.s }[2], [x14]\n"
- "st1 { v18.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
"b 80f\n"
"71:" // Height 2: Partial direct writeback: partial_1_16
"tbz x16, #0, 80f\n"
"str s12, [x14, #0x0]\n"
- "str s18, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
"b 80f\n"
"72:" // Height 2: Partial direct writeback: partial_8_0
"tbz x16, #3, 76f\n"
"st1 { v8.4s }, [x14], #0x10\n"
"st1 { v9.4s }, [x14], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
"tbz x16, #2, 74f\n"
"st1 { v10.4s }, [x14], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
"tbz x16, #1, 73f\n"
"str d11, [x14], #0x8\n"
- "str d17, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
"tbz x16, #0, 80f\n"
"st1 { v11.s }[2], [x14]\n"
- "st1 { v17.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
"b 80f\n"
"73:" // Height 2: Partial direct writeback: partial_1_12
"tbz x16, #0, 80f\n"
"str s11, [x14, #0x0]\n"
- "str s17, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
"b 80f\n"
"74:" // Height 2: Partial direct writeback: partial_2_8
"tbz x16, #1, 75f\n"
"str d10, [x14], #0x8\n"
- "str d16, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
"tbz x16, #0, 80f\n"
"st1 { v10.s }[2], [x14]\n"
- "st1 { v16.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
"b 80f\n"
"75:" // Height 2: Partial direct writeback: partial_1_8
"tbz x16, #0, 80f\n"
"str s10, [x14, #0x0]\n"
- "str s16, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
"b 80f\n"
"76:" // Height 2: Partial direct writeback: partial_4_0
"tbz x16, #2, 78f\n"
"st1 { v8.4s }, [x14], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
"tbz x16, #1, 77f\n"
"str d9, [x14], #0x8\n"
- "str d15, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
"tbz x16, #0, 80f\n"
"st1 { v9.s }[2], [x14]\n"
- "st1 { v15.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
"b 80f\n"
"77:" // Height 2: Partial direct writeback: partial_1_4
"tbz x16, #0, 80f\n"
"str s9, [x14, #0x0]\n"
- "str s15, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
"b 80f\n"
"78:" // Height 2: Partial direct writeback: partial_2_0
"tbz x16, #1, 79f\n"
"str d8, [x14], #0x8\n"
- "str d14, [x24], #0x8\n"
+ "str d14, [x23], #0x8\n"
"tbz x16, #0, 80f\n"
"st1 { v8.s }[2], [x14]\n"
- "st1 { v14.s }[2], [x24]\n"
+ "st1 { v14.s }[2], [x23]\n"
"b 80f\n"
"79:" // Height 2: Partial direct writeback: partial_1_0
"str s8, [x14, #0x0]\n"
- "str s14, [x24, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
"80:" // Height 2: Partial direct writeback: Done
"b 82f\n"
"81:" // Height 2: Full writeback
@@ -1112,37 +1110,37 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"str q12, [x14, #0x40]\n"
"str q13, [x14, #0x50]\n"
"add x14, x14, #0x60\n"
- "str q14, [x24, #0x0]\n"
- "str q15, [x24, #0x10]\n"
- "str q16, [x24, #0x20]\n"
- "str q17, [x24, #0x30]\n"
- "str q18, [x24, #0x40]\n"
- "str q19, [x24, #0x50]\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
"82:" // Height 2: Writeback done
"subs x16, x16, #0x18\n"
"bgt 43b\n"
"b 166f\n"
"83:" // Height 3
- "ldr x17, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x17, %x[bias]\n"
"ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
"84:" // Height 3: Column loop
"cbz x17, 85f\n"
"ldr q8, [x17, #0x0]\n"
- "ldr q9, [x17, #0x10]\n"
- "ldr q10, [x17, #0x20]\n"
- "ldr q11, [x17, #0x30]\n"
"mov v14.16b, v8.16b\n"
- "ldr q12, [x17, #0x40]\n"
+ "ldr q9, [x17, #0x10]\n"
"mov v15.16b, v9.16b\n"
- "ldr q13, [x17, #0x50]\n"
+ "ldr q10, [x17, #0x20]\n"
"mov v16.16b, v10.16b\n"
+ "ldr q11, [x17, #0x30]\n"
"mov v17.16b, v11.16b\n"
- "add x17, x17, #0x60\n"
+ "ldr q12, [x17, #0x40]\n"
"mov v18.16b, v12.16b\n"
+ "ldr q13, [x17, #0x50]\n"
"mov v19.16b, v13.16b\n"
"mov v20.16b, v8.16b\n"
+ "add x17, x17, #0x60\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -1152,147 +1150,147 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"85:" // Height 3: no bias
"tbz %x[flags], #0, 99f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x14, x20, LSL #2\n"
"cmp x16, #0x18\n"
- "add x24, x14, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 98f\n"
"tbz x16, #4, 89f\n"
"ld1 { v8.4s }, [x14], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v9.4s }, [x14], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v10.4s }, [x14], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
"ld1 { v11.4s }, [x14], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
"tbz x16, #2, 87f\n"
"ld1 { v12.4s }, [x14], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"tbz x16, #1, 86f\n"
"ldr d13, [x14], #0x8\n"
"mov x20, #0x58\n"
- "ldr d19, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x16, #0, 97f\n"
"ld1 { v13.s }[2], [x14]\n"
- "ld1 { v19.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 97f\n"
"86:" // Height 3: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x16, #0, 97f\n"
"ldr s13, [x14, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"b 97f\n"
"87:" // Height 3: Partial accumulate: partial_2_16
"tbz x16, #1, 88f\n"
"ldr d12, [x14], #0x8\n"
"mov x20, #0x48\n"
- "ldr d18, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x16, #0, 97f\n"
"ld1 { v12.s }[2], [x14]\n"
- "ld1 { v18.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
"b 97f\n"
"88:" // Height 3: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x16, #0, 97f\n"
"ldr s12, [x14, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
"b 97f\n"
"89:" // Height 3: Partial accumulate: partial_8_0
"tbz x16, #3, 93f\n"
"ld1 { v8.4s }, [x14], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v9.4s }, [x14], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
"tbz x16, #2, 91f\n"
"ld1 { v10.4s }, [x14], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
"tbz x16, #1, 90f\n"
"ldr d11, [x14], #0x8\n"
"mov x20, #0x38\n"
- "ldr d17, [x24], #0x8\n"
- "ldr d23, [x23], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
"tbz x16, #0, 97f\n"
"ld1 { v11.s }[2], [x14]\n"
- "ld1 { v17.s }[2], [x24]\n"
- "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
"b 97f\n"
"90:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x16, #0, 97f\n"
"ldr s11, [x14, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
- "ldr s23, [x23, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
"b 97f\n"
"91:" // Height 3: Partial accumulate: partial_2_8
"tbz x16, #1, 92f\n"
"ldr d10, [x14], #0x8\n"
"mov x20, #0x28\n"
- "ldr d16, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
"tbz x16, #0, 97f\n"
"ld1 { v10.s }[2], [x14]\n"
- "ld1 { v16.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
"b 97f\n"
"92:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x16, #0, 97f\n"
"ldr s10, [x14, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
"b 97f\n"
"93:" // Height 3: Partial accumulate: partial_4_0
"tbz x16, #2, 95f\n"
"ld1 { v8.4s }, [x14], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
"tbz x16, #1, 94f\n"
"ldr d9, [x14], #0x8\n"
"mov x20, #0x18\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d21, [x23], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
"tbz x16, #0, 97f\n"
"ld1 { v9.s }[2], [x14]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
"b 97f\n"
"94:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x16, #0, 97f\n"
"ldr s9, [x14, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s21, [x23, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
"b 97f\n"
"95:" // Height 3: Partial accumulate: partial_2_0
"tbz x16, #1, 96f\n"
"ldr d8, [x14], #0x8\n"
"mov x20, #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "ldr d20, [x23], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
"tbz x16, #0, 97f\n"
"ld1 { v8.s }[2], [x14]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
"b 97f\n"
"96:" // Height 3: Partial accumulate: partial_1_0
"ldr s8, [x14, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s20, [x23, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
"97:" // Height 3: Partial accumulate: Done
"sub x14, x14, x20\n"
"b 100f\n"
@@ -1303,18 +1301,18 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"ldr q11, [x14, #0x30]\n"
"ldr q12, [x14, #0x40]\n"
"ldr q13, [x14, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
- "ldr q20, [x23, #0x0]\n"
- "ldr q21, [x23, #0x10]\n"
- "ldr q22, [x23, #0x20]\n"
- "ldr q23, [x23, #0x30]\n"
- "ldr q24, [x23, #0x40]\n"
- "ldr q25, [x23, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x22, #0x40]\n"
+ "ldr q25, [x22, #0x50]\n"
"b 100f\n"
"99:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -1339,8 +1337,8 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"mov x13, #0x0\n"
"101:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 102f\n"
"ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1385,23 +1383,23 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v10.4s, v6.4s, v0.s[0]\n"
"mov v29.d[1], x23\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "ldr x23, [x15, #0x88]\n"
+ "mov v28.d[1], x22\n"
"fmla v22.4s, v6.4s, v2.s[0]\n"
"ldr d27, [x15, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "mov v28.d[1], x22\n"
+ "mov v27.d[1], x21\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
- "ldr x22, [x15, #0x98]\n"
+ "ldr x23, [x15, #0x88]\n"
"fmla v23.4s, v7.4s, v2.s[0]\n"
"ldr d26, [x15, #0x70]\n"
- "mov v27.d[1], x21\n"
+ "mov v26.d[1], x20\n"
"fmla v12.4s, v29.4s, v0.s[0]\n"
"fmla v18.4s, v29.4s, v1.s[0]\n"
- "ldr x21, [x15, #0xa8]\n"
- "mov v26.d[1], x20\n"
+ "ldr x22, [x15, #0x98]\n"
"fmla v24.4s, v29.4s, v2.s[0]\n"
"ldr d29, [x15, #0x80]\n"
"fmla v13.4s, v28.4s, v0.s[0]\n"
+ "ldr x21, [x15, #0xa8]\n"
"fmla v19.4s, v28.4s, v1.s[0]\n"
"ldr x20, [x15, #0xb8]\n"
"fmla v25.4s, v28.4s, v2.s[0]\n"
@@ -1409,23 +1407,23 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v8.4s, v27.4s, v0.s[1]\n"
"mov v29.d[1], x23\n"
"fmla v14.4s, v27.4s, v1.s[1]\n"
- "ldr x23, [x15, #0xc8]\n"
+ "mov v28.d[1], x22\n"
"fmla v20.4s, v27.4s, v2.s[1]\n"
"ldr d27, [x15, #0xa0]\n"
"fmla v9.4s, v26.4s, v0.s[1]\n"
- "mov v28.d[1], x22\n"
+ "mov v27.d[1], x21\n"
"fmla v15.4s, v26.4s, v1.s[1]\n"
- "ldr x22, [x15, #0xd8]\n"
+ "ldr x23, [x15, #0xc8]\n"
"fmla v21.4s, v26.4s, v2.s[1]\n"
"ldr d26, [x15, #0xb0]\n"
- "mov v27.d[1], x21\n"
+ "mov v26.d[1], x20\n"
"fmla v10.4s, v29.4s, v0.s[1]\n"
"fmla v16.4s, v29.4s, v1.s[1]\n"
- "ldr x21, [x15, #0xe8]\n"
- "mov v26.d[1], x20\n"
+ "ldr x22, [x15, #0xd8]\n"
"fmla v22.4s, v29.4s, v2.s[1]\n"
"ldr d29, [x15, #0xc0]\n"
"fmla v11.4s, v28.4s, v0.s[1]\n"
+ "ldr x21, [x15, #0xe8]\n"
"fmla v17.4s, v28.4s, v1.s[1]\n"
"ldr x20, [x15, #0xf8]\n"
"fmla v23.4s, v28.4s, v2.s[1]\n"
@@ -1433,23 +1431,23 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v12.4s, v27.4s, v0.s[1]\n"
"mov v29.d[1], x23\n"
"fmla v18.4s, v27.4s, v1.s[1]\n"
- "ldr x23, [x15, #0x108]\n"
+ "mov v28.d[1], x22\n"
"fmla v24.4s, v27.4s, v2.s[1]\n"
"ldr d27, [x15, #0xe0]\n"
"fmla v13.4s, v26.4s, v0.s[1]\n"
- "mov v28.d[1], x22\n"
+ "mov v27.d[1], x21\n"
"fmla v19.4s, v26.4s, v1.s[1]\n"
- "ldr x22, [x15, #0x118]\n"
+ "ldr x23, [x15, #0x108]\n"
"fmla v25.4s, v26.4s, v2.s[1]\n"
"ldr d26, [x15, #0xf0]\n"
- "mov v27.d[1], x21\n"
+ "mov v26.d[1], x20\n"
"fmla v8.4s, v29.4s, v0.s[2]\n"
"fmla v14.4s, v29.4s, v1.s[2]\n"
- "ldr x21, [x15, #0x128]\n"
- "mov v26.d[1], x20\n"
+ "ldr x22, [x15, #0x118]\n"
"fmla v20.4s, v29.4s, v2.s[2]\n"
"ldr d29, [x15, #0x100]\n"
"fmla v9.4s, v28.4s, v0.s[2]\n"
+ "ldr x21, [x15, #0x128]\n"
"fmla v15.4s, v28.4s, v1.s[2]\n"
"ldr x20, [x15, #0x138]\n"
"fmla v21.4s, v28.4s, v2.s[2]\n"
@@ -1457,23 +1455,23 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v10.4s, v27.4s, v0.s[2]\n"
"mov v29.d[1], x23\n"
"fmla v16.4s, v27.4s, v1.s[2]\n"
- "ldr x23, [x15, #0x148]\n"
+ "mov v28.d[1], x22\n"
"fmla v22.4s, v27.4s, v2.s[2]\n"
"ldr d27, [x15, #0x120]\n"
"fmla v11.4s, v26.4s, v0.s[2]\n"
- "mov v28.d[1], x22\n"
+ "mov v27.d[1], x21\n"
"fmla v17.4s, v26.4s, v1.s[2]\n"
- "ldr x22, [x15, #0x158]\n"
+ "ldr x23, [x15, #0x148]\n"
"fmla v23.4s, v26.4s, v2.s[2]\n"
"ldr d26, [x15, #0x130]\n"
- "mov v27.d[1], x21\n"
+ "mov v26.d[1], x20\n"
"fmla v12.4s, v29.4s, v0.s[2]\n"
"fmla v18.4s, v29.4s, v1.s[2]\n"
- "ldr x21, [x15, #0x168]\n"
- "mov v26.d[1], x20\n"
+ "ldr x22, [x15, #0x158]\n"
"fmla v24.4s, v29.4s, v2.s[2]\n"
"ldr d29, [x15, #0x140]\n"
"fmla v13.4s, v28.4s, v0.s[2]\n"
+ "ldr x21, [x15, #0x168]\n"
"fmla v19.4s, v28.4s, v1.s[2]\n"
"ldr x20, [x15, #0x178]\n"
"fmla v25.4s, v28.4s, v2.s[2]\n"
@@ -1481,54 +1479,54 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v8.4s, v27.4s, v0.s[3]\n"
"mov v29.d[1], x23\n"
"fmla v14.4s, v27.4s, v1.s[3]\n"
- "add x11, x11, #0x10\n"
+ "mov v28.d[1], x22\n"
"fmla v20.4s, v27.4s, v2.s[3]\n"
"ldr d27, [x15, #0x160]\n"
"fmla v9.4s, v26.4s, v0.s[3]\n"
- "mov v28.d[1], x22\n"
+ "mov v27.d[1], x21\n"
"fmla v15.4s, v26.4s, v1.s[3]\n"
- "add x10, x10, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v21.4s, v26.4s, v2.s[3]\n"
"ldr d26, [x15, #0x170]\n"
- "mov v27.d[1], x21\n"
+ "mov v26.d[1], x20\n"
+ "add x10, x10, #0x10\n"
"add x9, x9, #0x10\n"
"add x15, x15, #0x180\n"
"fmla v10.4s, v29.4s, v0.s[3]\n"
- "mov v26.d[1], x20\n"
+ "ldr x26, [x15, #0x8]\n"
"fmla v16.4s, v29.4s, v1.s[3]\n"
+ "ldr x25, [x15, #0x18]\n"
"fmla v22.4s, v29.4s, v2.s[3]\n"
"ldr d4, [x15, #0x0]\n"
- "ldr x20, [x15, #0x8]\n"
"fmla v11.4s, v28.4s, v0.s[3]\n"
+ "ldr x24, [x15, #0x28]\n"
"fmla v17.4s, v28.4s, v1.s[3]\n"
- "ldr x25, [x15, #0x18]\n"
+ "ldr x23, [x11, #0x8]\n"
"fmla v23.4s, v28.4s, v2.s[3]\n"
"ldr d5, [x15, #0x10]\n"
"fmla v12.4s, v27.4s, v0.s[3]\n"
- "ldr x24, [x15, #0x28]\n"
+ "ldr x22, [x10, #0x8]\n"
"fmla v18.4s, v27.4s, v1.s[3]\n"
- "ldr x23, [x11, #0x8]\n"
+ "ldr x21, [x9, #0x8]\n"
"fmla v24.4s, v27.4s, v2.s[3]\n"
"ldr d6, [x15, #0x20]\n"
"fmla v13.4s, v26.4s, v0.s[3]\n"
"ldr d0, [x11, #0x0]\n"
"fmla v19.4s, v26.4s, v1.s[3]\n"
"ldr d1, [x10, #0x0]\n"
- "ldr x22, [x10, #0x8]\n"
"fmla v25.4s, v26.4s, v2.s[3]\n"
"ldr d2, [x9, #0x0]\n"
- "sub x12, x12, #0x4\n"
"ldr d7, [x15, #0x30]\n"
- "cmp x12, #0x8\n"
- "ldr x21, [x9, #0x8]\n"
- "mov v4.d[1], x20\n"
+ "sub x12, x12, #0x4\n"
"ldr x20, [x15, #0x38]\n"
- "mov v5.d[1], x25\n"
+ "cmp x12, #0x8\n"
"prfm pldl1keep, [x11, #0x80]\n"
- "mov v6.d[1], x24\n"
+ "mov v4.d[1], x26\n"
"prfm pldl1keep, [x10, #0x80]\n"
- "mov v0.d[1], x23\n"
+ "mov v5.d[1], x25\n"
"prfm pldl1keep, [x9, #0x80]\n"
+ "mov v6.d[1], x24\n"
+ "mov v0.d[1], x23\n"
"mov v1.d[1], x22\n"
"mov v2.d[1], x21\n"
"mov v7.d[1], x20\n"
@@ -1642,14 +1640,14 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"ldr s31, [x10], #0x4\n"
"ldr s30, [x9], #0x4\n"
"ldr q27, [x15, #0x0]\n"
- "ldr q26, [x15, #0x10]\n"
- "ldr q29, [x15, #0x20]\n"
- "ldr q28, [x15, #0x30]\n"
"fmla v8.4s, v27.4s, v0.s[0]\n"
+ "ldr q26, [x15, #0x10]\n"
"fmla v14.4s, v27.4s, v31.s[0]\n"
+ "ldr q29, [x15, #0x20]\n"
"fmla v20.4s, v27.4s, v30.s[0]\n"
- "ldr q27, [x15, #0x40]\n"
+ "ldr q28, [x15, #0x30]\n"
"fmla v9.4s, v26.4s, v0.s[0]\n"
+ "ldr q27, [x15, #0x40]\n"
"fmla v15.4s, v26.4s, v31.s[0]\n"
"fmla v21.4s, v26.4s, v30.s[0]\n"
"ldr q26, [x15, #0x50]\n"
@@ -1673,34 +1671,34 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"cmp x13, x20\n"
"bne 101b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x14, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"prfm pstl1keep, [x14, #0x0]\n"
- "add x24, x14, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 109f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v25.4s, v25.4s, v26.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v27.4s }, [x21]\n"
"ld1r { v26.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v27.4s\n"
- "fmin v9.4s, v9.4s, v27.4s\n"
- "fmin v10.4s, v10.4s, v27.4s\n"
- "fmin v11.4s, v11.4s, v27.4s\n"
- "fmin v12.4s, v12.4s, v27.4s\n"
- "fmin v13.4s, v13.4s, v27.4s\n"
- "fmin v14.4s, v14.4s, v27.4s\n"
- "fmin v15.4s, v15.4s, v27.4s\n"
- "fmin v16.4s, v16.4s, v27.4s\n"
- "fmin v17.4s, v17.4s, v27.4s\n"
- "fmin v18.4s, v18.4s, v27.4s\n"
- "fmin v19.4s, v19.4s, v27.4s\n"
- "fmin v20.4s, v20.4s, v27.4s\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "fmin v23.4s, v23.4s, v27.4s\n"
- "fmin v24.4s, v24.4s, v27.4s\n"
- "fmin v25.4s, v25.4s, v27.4s\n"
"fmax v8.4s, v8.4s, v26.4s\n"
"fmax v9.4s, v9.4s, v26.4s\n"
"fmax v10.4s, v10.4s, v26.4s\n"
@@ -1727,126 +1725,126 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"st1 { v9.4s }, [x14], #0x10\n"
"st1 { v10.4s }, [x14], #0x10\n"
"st1 { v11.4s }, [x14], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
"tbz x16, #2, 111f\n"
"st1 { v12.4s }, [x14], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x16, #1, 110f\n"
"str d13, [x14], #0x8\n"
- "str d19, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x16, #0, 121f\n"
"st1 { v13.s }[2], [x14]\n"
- "st1 { v19.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 121f\n"
"110:" // Height 3: Partial direct writeback: partial_1_20
"tbz x16, #0, 121f\n"
"str s13, [x14, #0x0]\n"
- "str s19, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 121f\n"
"111:" // Height 3: Partial direct writeback: partial_2_16
"tbz x16, #1, 112f\n"
"str d12, [x14], #0x8\n"
- "str d18, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x16, #0, 121f\n"
"st1 { v12.s }[2], [x14]\n"
- "st1 { v18.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 121f\n"
"112:" // Height 3: Partial direct writeback: partial_1_16
"tbz x16, #0, 121f\n"
"str s12, [x14, #0x0]\n"
- "str s18, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"b 121f\n"
"113:" // Height 3: Partial direct writeback: partial_8_0
"tbz x16, #3, 117f\n"
"st1 { v8.4s }, [x14], #0x10\n"
"st1 { v9.4s }, [x14], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
"tbz x16, #2, 115f\n"
"st1 { v10.4s }, [x14], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
"tbz x16, #1, 114f\n"
"str d11, [x14], #0x8\n"
- "str d17, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
"tbz x16, #0, 121f\n"
"st1 { v11.s }[2], [x14]\n"
- "st1 { v17.s }[2], [x24]\n"
- "st1 { v23.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
"b 121f\n"
"114:" // Height 3: Partial direct writeback: partial_1_12
"tbz x16, #0, 121f\n"
"str s11, [x14, #0x0]\n"
- "str s17, [x24, #0x0]\n"
- "str s23, [x23, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
"b 121f\n"
"115:" // Height 3: Partial direct writeback: partial_2_8
"tbz x16, #1, 116f\n"
"str d10, [x14], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d22, [x23], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
"tbz x16, #0, 121f\n"
"st1 { v10.s }[2], [x14]\n"
- "st1 { v16.s }[2], [x24]\n"
- "st1 { v22.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
"b 121f\n"
"116:" // Height 3: Partial direct writeback: partial_1_8
"tbz x16, #0, 121f\n"
"str s10, [x14, #0x0]\n"
- "str s16, [x24, #0x0]\n"
- "str s22, [x23, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
"b 121f\n"
"117:" // Height 3: Partial direct writeback: partial_4_0
"tbz x16, #2, 119f\n"
"st1 { v8.4s }, [x14], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
"tbz x16, #1, 118f\n"
"str d9, [x14], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d21, [x23], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
"tbz x16, #0, 121f\n"
"st1 { v9.s }[2], [x14]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v21.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
"b 121f\n"
"118:" // Height 3: Partial direct writeback: partial_1_4
"tbz x16, #0, 121f\n"
"str s9, [x14, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s21, [x23, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
"b 121f\n"
"119:" // Height 3: Partial direct writeback: partial_2_0
"tbz x16, #1, 120f\n"
"str d8, [x14], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d20, [x23], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
"tbz x16, #0, 121f\n"
"st1 { v8.s }[2], [x14]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v20.s }[2], [x23]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
"b 121f\n"
"120:" // Height 3: Partial direct writeback: partial_1_0
"str s8, [x14, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s20, [x23, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
"121:" // Height 3: Partial direct writeback: Done
"b 123f\n"
"122:" // Height 3: Full writeback
@@ -1857,18 +1855,18 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"str q12, [x14, #0x40]\n"
"str q13, [x14, #0x50]\n"
"add x14, x14, #0x60\n"
- "str q14, [x24, #0x0]\n"
- "str q15, [x24, #0x10]\n"
- "str q16, [x24, #0x20]\n"
- "str q17, [x24, #0x30]\n"
- "str q18, [x24, #0x40]\n"
- "str q19, [x24, #0x50]\n"
- "str q20, [x23, #0x0]\n"
- "str q21, [x23, #0x10]\n"
- "str q22, [x23, #0x20]\n"
- "str q23, [x23, #0x30]\n"
- "str q24, [x23, #0x40]\n"
- "str q25, [x23, #0x50]\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
"123:" // Height 3: Writeback done
"subs x16, x16, #0x18\n"
"bgt 84b\n"
@@ -1876,28 +1874,27 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"124:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x10\n"
- "ldr x14, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x17, %x[bias]\n"
"ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "madd x20, x21, x20, x14\n"
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"125:" // Height 4: Column loop
"cbz x17, 126f\n"
"ldr q8, [x17, #0x0]\n"
- "ldr q9, [x17, #0x10]\n"
- "ldr q10, [x17, #0x20]\n"
- "ldr q11, [x17, #0x30]\n"
"mov v14.16b, v8.16b\n"
- "ldr q12, [x17, #0x40]\n"
+ "ldr q9, [x17, #0x10]\n"
"mov v15.16b, v9.16b\n"
- "ldr q13, [x17, #0x50]\n"
+ "ldr q10, [x17, #0x20]\n"
"mov v16.16b, v10.16b\n"
+ "ldr q11, [x17, #0x30]\n"
"mov v17.16b, v11.16b\n"
- "add x17, x17, #0x60\n"
+ "ldr q12, [x17, #0x40]\n"
"mov v18.16b, v12.16b\n"
+ "ldr q13, [x17, #0x50]\n"
"mov v19.16b, v13.16b\n"
"mov v20.16b, v8.16b\n"
+ "add x17, x17, #0x60\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -1913,175 +1910,175 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"126:" // Height 4: no bias
"tbz %x[flags], #0, 140f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x16, #0x18\n"
- "add x24, x14, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x14, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x16, #0x18\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 139f\n"
"tbz x16, #4, 130f\n"
"ld1 { v8.4s }, [x14], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v9.4s }, [x14], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
- "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
"ld1 { v10.4s }, [x14], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v11.4s }, [x14], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x16, #2, 128f\n"
"ld1 { v12.4s }, [x14], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x16, #1, 127f\n"
"ldr d13, [x14], #0x8\n"
"mov x20, #0x58\n"
- "ldr d19, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x16, #0, 138f\n"
"ld1 { v13.s }[2], [x14]\n"
- "ld1 { v19.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 138f\n"
"127:" // Height 4: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x16, #0, 138f\n"
"ldr s13, [x14, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 138f\n"
"128:" // Height 4: Partial accumulate: partial_2_16
"tbz x16, #1, 129f\n"
"ldr d12, [x14], #0x8\n"
"mov x20, #0x48\n"
- "ldr d18, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x16, #0, 138f\n"
"ld1 { v12.s }[2], [x14]\n"
- "ld1 { v18.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 138f\n"
"129:" // Height 4: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x16, #0, 138f\n"
"ldr s12, [x14, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 138f\n"
"130:" // Height 4: Partial accumulate: partial_8_0
"tbz x16, #3, 134f\n"
"ld1 { v8.4s }, [x14], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v9.4s }, [x14], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
- "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
"tbz x16, #2, 132f\n"
"ld1 { v10.4s }, [x14], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x16, #1, 131f\n"
"ldr d11, [x14], #0x8\n"
"mov x20, #0x38\n"
- "ldr d17, [x24], #0x8\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x16, #0, 138f\n"
"ld1 { v11.s }[2], [x14]\n"
- "ld1 { v17.s }[2], [x24]\n"
- "ld1 { v23.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 138f\n"
"131:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x16, #0, 138f\n"
"ldr s11, [x14, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
- "ldr s23, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 138f\n"
"132:" // Height 4: Partial accumulate: partial_2_8
"tbz x16, #1, 133f\n"
"ldr d10, [x14], #0x8\n"
"mov x20, #0x28\n"
- "ldr d16, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x16, #0, 138f\n"
"ld1 { v10.s }[2], [x14]\n"
- "ld1 { v16.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 138f\n"
"133:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x16, #0, 138f\n"
"ldr s10, [x14, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"b 138f\n"
"134:" // Height 4: Partial accumulate: partial_4_0
"tbz x16, #2, 136f\n"
"ld1 { v8.4s }, [x14], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"tbz x16, #1, 135f\n"
"ldr d9, [x14], #0x8\n"
"mov x20, #0x18\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d27, [x22], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
"tbz x16, #0, 138f\n"
"ld1 { v9.s }[2], [x14]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v21.s }[2], [x23]\n"
- "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
"b 138f\n"
"135:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x16, #0, 138f\n"
"ldr s9, [x14, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s21, [x23, #0x0]\n"
- "ldr s27, [x22, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
"b 138f\n"
"136:" // Height 4: Partial accumulate: partial_2_0
"tbz x16, #1, 137f\n"
"ldr d8, [x14], #0x8\n"
"mov x20, #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "ldr d20, [x23], #0x8\n"
- "ldr d26, [x22], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
"tbz x16, #0, 138f\n"
"ld1 { v8.s }[2], [x14]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v20.s }[2], [x23]\n"
- "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
"b 138f\n"
"137:" // Height 4: Partial accumulate: partial_1_0
"ldr s8, [x14, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s20, [x23, #0x0]\n"
- "ldr s26, [x22, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
"138:" // Height 4: Partial accumulate: Done
"sub x14, x14, x20\n"
"b 141f\n"
@@ -2092,24 +2089,24 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"ldr q11, [x14, #0x30]\n"
"ldr q12, [x14, #0x40]\n"
"ldr q13, [x14, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
- "ldr q20, [x23, #0x0]\n"
- "ldr q21, [x23, #0x10]\n"
- "ldr q22, [x23, #0x20]\n"
- "ldr q23, [x23, #0x30]\n"
- "ldr q24, [x23, #0x40]\n"
- "ldr q25, [x23, #0x50]\n"
- "ldr q26, [x22, #0x0]\n"
- "ldr q27, [x22, #0x10]\n"
- "ldr q28, [x22, #0x20]\n"
- "ldr q29, [x22, #0x30]\n"
- "ldr q30, [x22, #0x40]\n"
- "ldr q31, [x22, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x22, #0x40]\n"
+ "ldr q25, [x22, #0x50]\n"
+ "ldr q26, [x21, #0x0]\n"
+ "ldr q27, [x21, #0x10]\n"
+ "ldr q28, [x21, #0x20]\n"
+ "ldr q29, [x21, #0x30]\n"
+ "ldr q30, [x21, #0x40]\n"
+ "ldr q31, [x21, #0x50]\n"
"b 141f\n"
"140:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -2140,8 +2137,8 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"mov x13, #0x0\n"
"142:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 143f\n"
"ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2186,169 +2183,169 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v9.4s, v5.4s, v0.s[0]\n"
"ldr x20, [x15, #0x78]\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
- "add x11, x11, #0x10\n"
- "fmla v21.4s, v5.4s, v2.s[0]\n"
"mov v4.d[1], x23\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "ldr x23, [x15, #0x88]\n"
"fmla v27.4s, v5.4s, v3.s[0]\n"
"ldr d5, [x15, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr x23, [x15, #0x88]\n"
+ "mov v5.d[1], x22\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
+ "ldr x22, [x15, #0x98]\n"
"fmla v22.4s, v6.4s, v2.s[0]\n"
- "mov v5.d[1], x22\n"
+ "add x11, x11, #0x10\n"
"fmla v28.4s, v6.4s, v3.s[0]\n"
"ldr d6, [x15, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr x22, [x15, #0x98]\n"
+ "mov v6.d[1], x21\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
- "add x9, x9, #0x10\n"
+ "ldr x21, [x15, #0xa8]\n"
"fmla v23.4s, v7.4s, v2.s[0]\n"
- "mov v6.d[1], x21\n"
+ "add x10, x10, #0x10\n"
"fmla v29.4s, v7.4s, v3.s[0]\n"
"ldr d7, [x15, #0x70]\n"
+ "mov v7.d[1], x20\n"
"fmla v12.4s, v4.4s, v0.s[0]\n"
- "ldr x21, [x15, #0xa8]\n"
"fmla v18.4s, v4.4s, v1.s[0]\n"
- "add x28, x28, #0x10\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v24.4s, v4.4s, v2.s[0]\n"
+ "add x9, x9, #0x10\n"
"fmla v30.4s, v4.4s, v3.s[0]\n"
"ldr d4, [x15, #0x80]\n"
"fmla v13.4s, v5.4s, v0.s[0]\n"
- "ldr x20, [x15, #0xb8]\n"
+ "mov v4.d[1], x23\n"
"fmla v19.4s, v5.4s, v1.s[0]\n"
- "ldr x27, [x11, #0x8]\n"
+ "ldr x23, [x15, #0xc8]\n"
"fmla v25.4s, v5.4s, v2.s[0]\n"
- "mov v4.d[1], x23\n"
+ "add x28, x28, #0x10\n"
"fmla v31.4s, v5.4s, v3.s[0]\n"
"ldr d5, [x15, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr x23, [x15, #0xc8]\n"
+ "mov v5.d[1], x22\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr x26, [x10, #0x8]\n"
+ "ldr x22, [x15, #0xd8]\n"
"fmla v20.4s, v6.4s, v2.s[1]\n"
- "mov v5.d[1], x22\n"
+ "ldr x27, [x11, #0x8]\n"
"fmla v26.4s, v6.4s, v3.s[1]\n"
"ldr d6, [x15, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr x22, [x15, #0xd8]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr x25, [x9, #0x8]\n"
+ "ldr x21, [x15, #0xe8]\n"
"fmla v21.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x21\n"
+ "ldr x26, [x10, #0x8]\n"
"fmla v27.4s, v7.4s, v3.s[1]\n"
"ldr d7, [x15, #0xb0]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v4.4s, v0.s[1]\n"
- "ldr x21, [x15, #0xe8]\n"
"fmla v16.4s, v4.4s, v1.s[1]\n"
- "ldr x24, [x28, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x15, #0xf8]\n"
"fmla v22.4s, v4.4s, v2.s[1]\n"
+ "ldr x25, [x9, #0x8]\n"
"fmla v28.4s, v4.4s, v3.s[1]\n"
"ldr d4, [x15, #0xc0]\n"
"fmla v11.4s, v5.4s, v0.s[1]\n"
- "ldr x20, [x15, #0xf8]\n"
+ "mov v4.d[1], x23\n"
"fmla v17.4s, v5.4s, v1.s[1]\n"
- "sub x12, x12, #0x4\n"
+ "ldr x23, [x15, #0x108]\n"
"fmla v23.4s, v5.4s, v2.s[1]\n"
- "mov v4.d[1], x23\n"
+ "ldr x24, [x28, #0x8]\n"
"fmla v29.4s, v5.4s, v3.s[1]\n"
"ldr d5, [x15, #0xd0]\n"
"fmla v12.4s, v6.4s, v0.s[1]\n"
- "ldr x23, [x15, #0x108]\n"
+ "mov v5.d[1], x22\n"
"fmla v18.4s, v6.4s, v1.s[1]\n"
- "cmp x12, #0x8\n"
+ "ldr x22, [x15, #0x118]\n"
"fmla v24.4s, v6.4s, v2.s[1]\n"
- "mov v5.d[1], x22\n"
+ "sub x12, x12, #0x4\n"
"fmla v30.4s, v6.4s, v3.s[1]\n"
"ldr d6, [x15, #0xe0]\n"
"fmla v13.4s, v7.4s, v0.s[1]\n"
- "ldr x22, [x15, #0x118]\n"
+ "mov v6.d[1], x21\n"
"fmla v19.4s, v7.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x21, [x15, #0x128]\n"
"fmla v25.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x21\n"
+ "cmp x12, #0x8\n"
"fmla v31.4s, v7.4s, v3.s[1]\n"
"ldr d7, [x15, #0xf0]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v4.4s, v0.s[2]\n"
- "ldr x21, [x15, #0x128]\n"
"fmla v14.4s, v4.4s, v1.s[2]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x15, #0x138]\n"
"fmla v20.4s, v4.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v26.4s, v4.4s, v3.s[2]\n"
"ldr d4, [x15, #0x100]\n"
"fmla v9.4s, v5.4s, v0.s[2]\n"
- "ldr x20, [x15, #0x138]\n"
+ "mov v4.d[1], x23\n"
"fmla v15.4s, v5.4s, v1.s[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x23, [x15, #0x148]\n"
"fmla v21.4s, v5.4s, v2.s[2]\n"
- "mov v4.d[1], x23\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v27.4s, v5.4s, v3.s[2]\n"
"ldr d5, [x15, #0x110]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr x23, [x15, #0x148]\n"
+ "mov v5.d[1], x22\n"
"fmla v16.4s, v6.4s, v1.s[2]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x22, [x15, #0x158]\n"
"fmla v22.4s, v6.4s, v2.s[2]\n"
- "mov v5.d[1], x22\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v28.4s, v6.4s, v3.s[2]\n"
"ldr d6, [x15, #0x120]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr x22, [x15, #0x158]\n"
+ "mov v6.d[1], x21\n"
"fmla v17.4s, v7.4s, v1.s[2]\n"
+ "ldr x21, [x15, #0x168]\n"
"fmla v23.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x21\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v29.4s, v7.4s, v3.s[2]\n"
"ldr d7, [x15, #0x130]\n"
+ "mov v7.d[1], x20\n"
"fmla v12.4s, v4.4s, v0.s[2]\n"
- "ldr x21, [x15, #0x168]\n"
"fmla v18.4s, v4.4s, v1.s[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x15, #0x178]\n"
"fmla v24.4s, v4.4s, v2.s[2]\n"
"fmla v30.4s, v4.4s, v3.s[2]\n"
"ldr d4, [x15, #0x140]\n"
"fmla v13.4s, v5.4s, v0.s[2]\n"
- "ldr x20, [x15, #0x178]\n"
+ "mov v4.d[1], x23\n"
"fmla v19.4s, v5.4s, v1.s[2]\n"
"fmla v25.4s, v5.4s, v2.s[2]\n"
- "mov v4.d[1], x23\n"
"fmla v31.4s, v5.4s, v3.s[2]\n"
"ldr d5, [x15, #0x150]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
+ "mov v5.d[1], x22\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v20.4s, v6.4s, v2.s[3]\n"
- "mov v5.d[1], x22\n"
"fmla v26.4s, v6.4s, v3.s[3]\n"
"ldr d6, [x15, #0x160]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
"fmla v21.4s, v7.4s, v2.s[3]\n"
- "mov v6.d[1], x21\n"
"fmla v27.4s, v7.4s, v3.s[3]\n"
"ldr d7, [x15, #0x170]\n"
+ "mov v7.d[1], x20\n"
"add x15, x15, #0x180\n"
"fmla v10.4s, v4.4s, v0.s[3]\n"
- "fmla v16.4s, v4.4s, v1.s[3]\n"
"ldr x23, [x15, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "ldr x22, [x15, #0x18]\n"
"fmla v22.4s, v4.4s, v2.s[3]\n"
+ "ldr x21, [x15, #0x28]\n"
"fmla v28.4s, v4.4s, v3.s[3]\n"
"ldr d4, [x15, #0x0]\n"
"fmla v11.4s, v5.4s, v0.s[3]\n"
- "ldr x22, [x15, #0x18]\n"
+ "ldr x20, [x15, #0x38]\n"
"fmla v17.4s, v5.4s, v1.s[3]\n"
- "ldr x21, [x15, #0x28]\n"
+ "mov v4.d[1], x23\n"
"fmla v23.4s, v5.4s, v2.s[3]\n"
- "ldr x20, [x15, #0x38]\n"
"fmla v29.4s, v5.4s, v3.s[3]\n"
"ldr d5, [x15, #0x10]\n"
"fmla v12.4s, v6.4s, v0.s[3]\n"
- "mov v4.d[1], x23\n"
+ "mov v5.d[1], x22\n"
"fmla v18.4s, v6.4s, v1.s[3]\n"
"fmla v24.4s, v6.4s, v2.s[3]\n"
- "mov v5.d[1], x22\n"
"fmla v30.4s, v6.4s, v3.s[3]\n"
"ldr d6, [x15, #0x20]\n"
"fmla v13.4s, v7.4s, v0.s[3]\n"
@@ -2503,12 +2500,12 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"ldr s5, [x9], #0x4\n"
"ldr s4, [x28], #0x4\n"
"ldr q1, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "ldr q3, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
"fmla v8.4s, v1.4s, v7.s[0]\n"
+ "ldr q0, [x15, #0x10]\n"
"fmla v14.4s, v1.4s, v6.s[0]\n"
+ "ldr q3, [x15, #0x20]\n"
"fmla v20.4s, v1.4s, v5.s[0]\n"
+ "ldr q2, [x15, #0x30]\n"
"fmla v26.4s, v1.4s, v4.s[0]\n"
"ldr q1, [x15, #0x40]\n"
"fmla v9.4s, v0.4s, v7.s[0]\n"
@@ -2540,42 +2537,42 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"cmp x13, x20\n"
"bne 142b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x24, x14, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x14, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x14, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 150f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v23.4s, v23.4s, v1.4s\n"
- "fmin v24.4s, v24.4s, v1.4s\n"
- "fmin v25.4s, v25.4s, v1.4s\n"
- "fmin v26.4s, v26.4s, v1.4s\n"
- "fmin v27.4s, v27.4s, v1.4s\n"
- "fmin v28.4s, v28.4s, v1.4s\n"
- "fmin v29.4s, v29.4s, v1.4s\n"
- "fmin v30.4s, v30.4s, v1.4s\n"
- "fmin v31.4s, v31.4s, v1.4s\n"
"fmax v8.4s, v8.4s, v0.4s\n"
"fmax v9.4s, v9.4s, v0.4s\n"
"fmax v10.4s, v10.4s, v0.4s\n"
@@ -2608,153 +2605,153 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"st1 { v9.4s }, [x14], #0x10\n"
"st1 { v10.4s }, [x14], #0x10\n"
"st1 { v11.4s }, [x14], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x22], #0x10\n"
- "st1 { v27.4s }, [x22], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
- "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
"tbz x16, #2, 152f\n"
"st1 { v12.4s }, [x14], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v30.4s }, [x22], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
"tbz x16, #1, 151f\n"
"str d13, [x14], #0x8\n"
- "str d19, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x16, #0, 162f\n"
"st1 { v13.s }[2], [x14]\n"
- "st1 { v19.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x22]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
"b 162f\n"
"151:" // Height 4: Partial direct writeback: partial_1_20
"tbz x16, #0, 162f\n"
"str s13, [x14, #0x0]\n"
- "str s19, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s31, [x22, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
"b 162f\n"
"152:" // Height 4: Partial direct writeback: partial_2_16
"tbz x16, #1, 153f\n"
"str d12, [x14], #0x8\n"
- "str d18, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x16, #0, 162f\n"
"st1 { v12.s }[2], [x14]\n"
- "st1 { v18.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
"b 162f\n"
"153:" // Height 4: Partial direct writeback: partial_1_16
"tbz x16, #0, 162f\n"
"str s12, [x14, #0x0]\n"
- "str s18, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "str s30, [x22, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
"b 162f\n"
"154:" // Height 4: Partial direct writeback: partial_8_0
"tbz x16, #3, 158f\n"
"st1 { v8.4s }, [x14], #0x10\n"
"st1 { v9.4s }, [x14], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x22], #0x10\n"
- "st1 { v27.4s }, [x22], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
"tbz x16, #2, 156f\n"
"st1 { v10.4s }, [x14], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
"tbz x16, #1, 155f\n"
"str d11, [x14], #0x8\n"
- "str d17, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x16, #0, 162f\n"
"st1 { v11.s }[2], [x14]\n"
- "st1 { v17.s }[2], [x24]\n"
- "st1 { v23.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x22]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
"b 162f\n"
"155:" // Height 4: Partial direct writeback: partial_1_12
"tbz x16, #0, 162f\n"
"str s11, [x14, #0x0]\n"
- "str s17, [x24, #0x0]\n"
- "str s23, [x23, #0x0]\n"
- "str s29, [x22, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
"b 162f\n"
"156:" // Height 4: Partial direct writeback: partial_2_8
"tbz x16, #1, 157f\n"
"str d10, [x14], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d22, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x16, #0, 162f\n"
"st1 { v10.s }[2], [x14]\n"
- "st1 { v16.s }[2], [x24]\n"
- "st1 { v22.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x22]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
"b 162f\n"
"157:" // Height 4: Partial direct writeback: partial_1_8
"tbz x16, #0, 162f\n"
"str s10, [x14, #0x0]\n"
- "str s16, [x24, #0x0]\n"
- "str s22, [x23, #0x0]\n"
- "str s28, [x22, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
"b 162f\n"
"158:" // Height 4: Partial direct writeback: partial_4_0
"tbz x16, #2, 160f\n"
"st1 { v8.4s }, [x14], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
"tbz x16, #1, 159f\n"
"str d9, [x14], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d21, [x23], #0x8\n"
- "str d27, [x22], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
"tbz x16, #0, 162f\n"
"st1 { v9.s }[2], [x14]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v21.s }[2], [x23]\n"
- "st1 { v27.s }[2], [x22]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
"b 162f\n"
"159:" // Height 4: Partial direct writeback: partial_1_4
"tbz x16, #0, 162f\n"
"str s9, [x14, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s21, [x23, #0x0]\n"
- "str s27, [x22, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
"b 162f\n"
"160:" // Height 4: Partial direct writeback: partial_2_0
"tbz x16, #1, 161f\n"
"str d8, [x14], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d20, [x23], #0x8\n"
- "str d26, [x22], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
"tbz x16, #0, 162f\n"
"st1 { v8.s }[2], [x14]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v20.s }[2], [x23]\n"
- "st1 { v26.s }[2], [x22]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
"b 162f\n"
"161:" // Height 4: Partial direct writeback: partial_1_0
"str s8, [x14, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s20, [x23, #0x0]\n"
- "str s26, [x22, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
"162:" // Height 4: Partial direct writeback: Done
"b 164f\n"
"163:" // Height 4: Full writeback
@@ -2765,24 +2762,24 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"str q12, [x14, #0x40]\n"
"str q13, [x14, #0x50]\n"
"add x14, x14, #0x60\n"
- "str q14, [x24, #0x0]\n"
- "str q15, [x24, #0x10]\n"
- "str q16, [x24, #0x20]\n"
- "str q17, [x24, #0x30]\n"
- "str q18, [x24, #0x40]\n"
- "str q19, [x24, #0x50]\n"
- "str q20, [x23, #0x0]\n"
- "str q21, [x23, #0x10]\n"
- "str q22, [x23, #0x20]\n"
- "str q23, [x23, #0x30]\n"
- "str q24, [x23, #0x40]\n"
- "str q25, [x23, #0x50]\n"
- "str q26, [x22, #0x0]\n"
- "str q27, [x22, #0x10]\n"
- "str q28, [x22, #0x20]\n"
- "str q29, [x22, #0x30]\n"
- "str q30, [x22, #0x40]\n"
- "str q31, [x22, #0x50]\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
+ "str q26, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q28, [x21, #0x20]\n"
+ "str q29, [x21, #0x30]\n"
+ "str q30, [x21, #0x40]\n"
+ "str q31, [x21, #0x50]\n"
"164:" // Height 4: Writeback done
"subs x16, x16, #0x18\n"
"bgt 125b\n"
@@ -2798,8 +2795,8 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"166:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
index 38acdd5054..f5863ba348 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
@@ -47,19 +47,18 @@ void a64_hybrid_fp32_mla_4x24 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void a64_hybrid_fp32_mla_4x24 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -100,10 +98,10 @@ void a64_hybrid_fp32_mla_4x24 (
"cmp %x[M], #0x2\n"
"bgt 83f\n"
"beq 42f\n"
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x10, 3f\n"
"ldr q8, [x10, #0x0]\n"
@@ -223,8 +221,8 @@ void a64_hybrid_fp32_mla_4x24 (
"mov x26, #0x0\n"
"19:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -254,14 +252,10 @@ void a64_hybrid_fp32_mla_4x24 (
"ldr q17, [x28, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"ldr q16, [x28, #0x70]\n"
- "sub x25, x25, #0x4\n"
- "add x24, x24, #0x10\n"
"fmla v12.4s, v19.4s, v0.s[0]\n"
"ldr q19, [x28, #0x80]\n"
"fmla v13.4s, v18.4s, v0.s[0]\n"
"ldr q18, [x28, #0x90]\n"
- "cmp x25, #0x8\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v8.4s, v17.4s, v0.s[1]\n"
"ldr q17, [x28, #0xa0]\n"
"fmla v9.4s, v16.4s, v0.s[1]\n"
@@ -290,16 +284,20 @@ void a64_hybrid_fp32_mla_4x24 (
"ldr q17, [x28, #0x160]\n"
"fmla v9.4s, v16.4s, v0.s[3]\n"
"ldr q16, [x28, #0x170]\n"
- "add x28, x28, #0x180\n"
+ "sub x25, x25, #0x4\n"
+ "add x24, x24, #0x10\n"
"fmla v10.4s, v19.4s, v0.s[3]\n"
- "ldr q4, [x28, #0x0]\n"
"fmla v11.4s, v18.4s, v0.s[3]\n"
+ "cmp x25, #0x8\n"
+ "add x28, x28, #0x180\n"
+ "ldr q4, [x28, #0x0]\n"
"ldr q5, [x28, #0x10]\n"
"fmla v12.4s, v17.4s, v0.s[3]\n"
"ldr q6, [x28, #0x20]\n"
"fmla v13.4s, v16.4s, v0.s[3]\n"
"ldr q0, [x24, #0x0]\n"
"ldr q7, [x28, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"bge 22b\n"
"23:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v4.4s, v0.s[0]\n"
@@ -310,13 +308,10 @@ void a64_hybrid_fp32_mla_4x24 (
"ldr q17, [x28, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"ldr q16, [x28, #0x70]\n"
- "add x24, x24, #0x10\n"
- "sub x25, x25, #0x4\n"
"fmla v12.4s, v19.4s, v0.s[0]\n"
"ldr q19, [x28, #0x80]\n"
"fmla v13.4s, v18.4s, v0.s[0]\n"
"ldr q18, [x28, #0x90]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v8.4s, v17.4s, v0.s[1]\n"
"ldr q17, [x28, #0xa0]\n"
"fmla v9.4s, v16.4s, v0.s[1]\n"
@@ -345,29 +340,32 @@ void a64_hybrid_fp32_mla_4x24 (
"ldr q17, [x28, #0x160]\n"
"fmla v9.4s, v16.4s, v0.s[3]\n"
"ldr q16, [x28, #0x170]\n"
- "add x28, x28, #0x180\n"
+ "add x24, x24, #0x10\n"
+ "sub x25, x25, #0x4\n"
"fmla v10.4s, v19.4s, v0.s[3]\n"
"fmla v11.4s, v18.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x28, x28, #0x180\n"
"fmla v12.4s, v17.4s, v0.s[3]\n"
"fmla v13.4s, v16.4s, v0.s[3]\n"
"24:" // Height 1: Multiply loop: Main loop skip
"cbz x25, 26f\n"
"25:" // Height 1: Multiply loop: Odd block loop
- "ldr s20, [x24], #0x4\n"
- "ldr q17, [x28, #0x0]\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr q16, [x28, #0x0]\n"
+ "fmla v8.4s, v16.4s, v18.s[0]\n"
"sub x25, x25, #0x1\n"
- "ldr q16, [x28, #0x10]\n"
- "ldr q19, [x28, #0x20]\n"
- "ldr q18, [x28, #0x30]\n"
- "fmla v8.4s, v17.4s, v20.s[0]\n"
- "ldr q17, [x28, #0x40]\n"
- "fmla v9.4s, v16.4s, v20.s[0]\n"
+ "ldr q17, [x28, #0x10]\n"
+ "ldr q16, [x28, #0x20]\n"
+ "fmla v9.4s, v17.4s, v18.s[0]\n"
+ "fmla v10.4s, v16.4s, v18.s[0]\n"
+ "ldr q17, [x28, #0x30]\n"
+ "ldr q16, [x28, #0x40]\n"
+ "fmla v11.4s, v17.4s, v18.s[0]\n"
+ "fmla v12.4s, v16.4s, v18.s[0]\n"
"ldr q16, [x28, #0x50]\n"
+ "fmla v13.4s, v16.4s, v18.s[0]\n"
"add x28, x28, #0x60\n"
- "fmla v10.4s, v19.4s, v20.s[0]\n"
- "fmla v11.4s, v18.4s, v20.s[0]\n"
- "fmla v12.4s, v17.4s, v20.s[0]\n"
- "fmla v13.4s, v16.4s, v20.s[0]\n"
"cbnz x25, 25b\n"
"26:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -376,9 +374,9 @@ void a64_hybrid_fp32_mla_4x24 (
"bne 19b\n"
"prfm pstl1keep, [x27, #0x0]\n"
"tbz %x[flags], #1, 27f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v17.4s\n"
"fmin v9.4s, v9.4s, v17.4s\n"
@@ -481,141 +479,141 @@ void a64_hybrid_fp32_mla_4x24 (
"bgt 2b\n"
"b 166f\n"
"42:" // Height 2
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"43:" // Height 2: Column loop
"cbz x10, 44f\n"
"ldr q8, [x10, #0x0]\n"
"ldr q9, [x10, #0x10]\n"
- "ldr q10, [x10, #0x20]\n"
- "ldr q11, [x10, #0x30]\n"
- "ldr q12, [x10, #0x40]\n"
- "ldr q13, [x10, #0x50]\n"
- "add x10, x10, #0x60\n"
"mov v14.16b, v8.16b\n"
"mov v15.16b, v9.16b\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
"mov v16.16b, v10.16b\n"
"mov v17.16b, v11.16b\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
"mov v18.16b, v12.16b\n"
"mov v19.16b, v13.16b\n"
+ "add x10, x10, #0x60\n"
"b 59f\n"
"44:" // Height 2: no bias
"tbz %x[flags], #0, 58f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x9, #0x18\n"
- "add x24, x27, x20, LSL #2\n"
+ "add x23, x27, x20, LSL #2\n"
"bge 57f\n"
"tbz x9, #4, 48f\n"
"ld1 { v8.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v11.4s }, [x27], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
"tbz x9, #2, 46f\n"
"ld1 { v12.4s }, [x27], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
"tbz x9, #1, 45f\n"
"ldr d13, [x27], #0x8\n"
- "ldr d19, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"mov x20, #0x58\n"
"tbz x9, #0, 56f\n"
"ld1 { v13.s }[2], [x27]\n"
- "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
"b 56f\n"
"45:" // Height 2: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x9, #0, 56f\n"
"ldr s13, [x27, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
"b 56f\n"
"46:" // Height 2: Partial accumulate: partial_2_16
"tbz x9, #1, 47f\n"
"ldr d12, [x27], #0x8\n"
- "ldr d18, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"mov x20, #0x48\n"
"tbz x9, #0, 56f\n"
"ld1 { v12.s }[2], [x27]\n"
- "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
"b 56f\n"
"47:" // Height 2: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x9, #0, 56f\n"
"ldr s12, [x27, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
"b 56f\n"
"48:" // Height 2: Partial accumulate: partial_8_0
"tbz x9, #3, 52f\n"
"ld1 { v8.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
"tbz x9, #2, 50f\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
"tbz x9, #1, 49f\n"
"ldr d11, [x27], #0x8\n"
- "ldr d17, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
"mov x20, #0x38\n"
"tbz x9, #0, 56f\n"
"ld1 { v11.s }[2], [x27]\n"
- "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
"b 56f\n"
"49:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x9, #0, 56f\n"
"ldr s11, [x27, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
"b 56f\n"
"50:" // Height 2: Partial accumulate: partial_2_8
"tbz x9, #1, 51f\n"
"ldr d10, [x27], #0x8\n"
- "ldr d16, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"mov x20, #0x28\n"
"tbz x9, #0, 56f\n"
"ld1 { v10.s }[2], [x27]\n"
- "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
"b 56f\n"
"51:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x9, #0, 56f\n"
"ldr s10, [x27, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
"b 56f\n"
"52:" // Height 2: Partial accumulate: partial_4_0
"tbz x9, #2, 54f\n"
"ld1 { v8.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"tbz x9, #1, 53f\n"
"ldr d9, [x27], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
"mov x20, #0x18\n"
"tbz x9, #0, 56f\n"
"ld1 { v9.s }[2], [x27]\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x23]\n"
"b 56f\n"
"53:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x9, #0, 56f\n"
"ldr s9, [x27, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
"b 56f\n"
"54:" // Height 2: Partial accumulate: partial_2_0
"tbz x9, #1, 55f\n"
"ldr d8, [x27], #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
"mov x20, #0x8\n"
"tbz x9, #0, 56f\n"
"ld1 { v8.s }[2], [x27]\n"
- "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x23]\n"
"b 56f\n"
"55:" // Height 2: Partial accumulate: partial_1_0
"ldr s8, [x27, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
"mov x20, #0x0\n"
"56:" // Height 2: Partial accumulate: Done
"sub x27, x27, x20\n"
@@ -627,12 +625,12 @@ void a64_hybrid_fp32_mla_4x24 (
"ldr q11, [x27, #0x30]\n"
"ldr q12, [x27, #0x40]\n"
"ldr q13, [x27, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
"b 59f\n"
"58:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -651,8 +649,8 @@ void a64_hybrid_fp32_mla_4x24 (
"mov x26, #0x0\n"
"60:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -843,19 +841,19 @@ void a64_hybrid_fp32_mla_4x24 (
"sub x25, x25, #0x1\n"
"ldr q21, [x28, #0x0]\n"
"ldr q20, [x28, #0x10]\n"
- "ldr q23, [x28, #0x20]\n"
- "ldr q22, [x28, #0x30]\n"
"fmla v8.4s, v21.4s, v25.s[0]\n"
"fmla v14.4s, v21.4s, v24.s[0]\n"
- "ldr q21, [x28, #0x40]\n"
+ "ldr q23, [x28, #0x20]\n"
+ "ldr q22, [x28, #0x30]\n"
"fmla v9.4s, v20.4s, v25.s[0]\n"
"fmla v15.4s, v20.4s, v24.s[0]\n"
+ "ldr q21, [x28, #0x40]\n"
"ldr q20, [x28, #0x50]\n"
- "add x28, x28, #0x60\n"
"fmla v10.4s, v23.4s, v25.s[0]\n"
"fmla v16.4s, v23.4s, v24.s[0]\n"
"fmla v11.4s, v22.4s, v25.s[0]\n"
"fmla v17.4s, v22.4s, v24.s[0]\n"
+ "add x28, x28, #0x60\n"
"fmla v12.4s, v21.4s, v25.s[0]\n"
"fmla v18.4s, v21.4s, v24.s[0]\n"
"fmla v13.4s, v20.4s, v25.s[0]\n"
@@ -867,13 +865,13 @@ void a64_hybrid_fp32_mla_4x24 (
"cmp x26, x20\n"
"bne 60b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 68f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v21.4s }, [x21]\n"
"ld1r { v20.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v21.4s\n"
"fmin v9.4s, v9.4s, v21.4s\n"
@@ -907,99 +905,99 @@ void a64_hybrid_fp32_mla_4x24 (
"st1 { v9.4s }, [x27], #0x10\n"
"st1 { v10.4s }, [x27], #0x10\n"
"st1 { v11.4s }, [x27], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
"tbz x9, #2, 70f\n"
"st1 { v12.4s }, [x27], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
"tbz x9, #1, 69f\n"
"str d13, [x27], #0x8\n"
- "str d19, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
"tbz x9, #0, 80f\n"
"st1 { v13.s }[2], [x27]\n"
- "st1 { v19.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
"b 80f\n"
"69:" // Height 2: Partial direct writeback: partial_1_20
"tbz x9, #0, 80f\n"
"str s13, [x27, #0x0]\n"
- "str s19, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
"b 80f\n"
"70:" // Height 2: Partial direct writeback: partial_2_16
"tbz x9, #1, 71f\n"
"str d12, [x27], #0x8\n"
- "str d18, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
"tbz x9, #0, 80f\n"
"st1 { v12.s }[2], [x27]\n"
- "st1 { v18.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
"b 80f\n"
"71:" // Height 2: Partial direct writeback: partial_1_16
"tbz x9, #0, 80f\n"
"str s12, [x27, #0x0]\n"
- "str s18, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
"b 80f\n"
"72:" // Height 2: Partial direct writeback: partial_8_0
"tbz x9, #3, 76f\n"
"st1 { v8.4s }, [x27], #0x10\n"
"st1 { v9.4s }, [x27], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
"tbz x9, #2, 74f\n"
"st1 { v10.4s }, [x27], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
"tbz x9, #1, 73f\n"
"str d11, [x27], #0x8\n"
- "str d17, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
"tbz x9, #0, 80f\n"
"st1 { v11.s }[2], [x27]\n"
- "st1 { v17.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
"b 80f\n"
"73:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 80f\n"
"str s11, [x27, #0x0]\n"
- "str s17, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
"b 80f\n"
"74:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 75f\n"
"str d10, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
"tbz x9, #0, 80f\n"
"st1 { v10.s }[2], [x27]\n"
- "st1 { v16.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
"b 80f\n"
"75:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 80f\n"
"str s10, [x27, #0x0]\n"
- "str s16, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
"b 80f\n"
"76:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 78f\n"
"st1 { v8.4s }, [x27], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
"tbz x9, #1, 77f\n"
"str d9, [x27], #0x8\n"
- "str d15, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
"tbz x9, #0, 80f\n"
"st1 { v9.s }[2], [x27]\n"
- "st1 { v15.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
"b 80f\n"
"77:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 80f\n"
"str s9, [x27, #0x0]\n"
- "str s15, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
"b 80f\n"
"78:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 79f\n"
"str d8, [x27], #0x8\n"
- "str d14, [x24], #0x8\n"
+ "str d14, [x23], #0x8\n"
"tbz x9, #0, 80f\n"
"st1 { v8.s }[2], [x27]\n"
- "st1 { v14.s }[2], [x24]\n"
+ "st1 { v14.s }[2], [x23]\n"
"b 80f\n"
"79:" // Height 2: Partial direct writeback: partial_1_0
"str s8, [x27, #0x0]\n"
- "str s14, [x24, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
"80:" // Height 2: Partial direct writeback: Done
"b 82f\n"
"81:" // Height 2: Full writeback
@@ -1010,38 +1008,38 @@ void a64_hybrid_fp32_mla_4x24 (
"str q12, [x27, #0x40]\n"
"str q13, [x27, #0x50]\n"
"add x27, x27, #0x60\n"
- "str q14, [x24, #0x0]\n"
- "str q15, [x24, #0x10]\n"
- "str q16, [x24, #0x20]\n"
- "str q17, [x24, #0x30]\n"
- "str q18, [x24, #0x40]\n"
- "str q19, [x24, #0x50]\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
"82:" // Height 2: Writeback done
"subs x9, x9, #0x18\n"
"bgt 43b\n"
"b 166f\n"
"83:" // Height 3
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"84:" // Height 3: Column loop
"cbz x10, 85f\n"
"ldr q8, [x10, #0x0]\n"
"ldr q9, [x10, #0x10]\n"
- "ldr q10, [x10, #0x20]\n"
- "ldr q11, [x10, #0x30]\n"
- "ldr q12, [x10, #0x40]\n"
- "ldr q13, [x10, #0x50]\n"
- "add x10, x10, #0x60\n"
"mov v14.16b, v8.16b\n"
"mov v15.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
"mov v16.16b, v10.16b\n"
"mov v17.16b, v11.16b\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
"mov v18.16b, v12.16b\n"
"mov v19.16b, v13.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
+ "add x10, x10, #0x60\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
"mov v24.16b, v12.16b\n"
@@ -1050,147 +1048,147 @@ void a64_hybrid_fp32_mla_4x24 (
"85:" // Height 3: no bias
"tbz %x[flags], #0, 99f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
"cmp x9, #0x18\n"
- "add x24, x27, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 98f\n"
"tbz x9, #4, 89f\n"
"ld1 { v8.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
"ld1 { v11.4s }, [x27], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
"tbz x9, #2, 87f\n"
"ld1 { v12.4s }, [x27], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"tbz x9, #1, 86f\n"
"ldr d13, [x27], #0x8\n"
- "ldr d19, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"mov x20, #0x58\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x9, #0, 97f\n"
"ld1 { v13.s }[2], [x27]\n"
- "ld1 { v19.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 97f\n"
"86:" // Height 3: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x9, #0, 97f\n"
"ldr s13, [x27, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"b 97f\n"
"87:" // Height 3: Partial accumulate: partial_2_16
"tbz x9, #1, 88f\n"
"ldr d12, [x27], #0x8\n"
- "ldr d18, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"mov x20, #0x48\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x9, #0, 97f\n"
"ld1 { v12.s }[2], [x27]\n"
- "ld1 { v18.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
"b 97f\n"
"88:" // Height 3: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x9, #0, 97f\n"
"ldr s12, [x27, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
"b 97f\n"
"89:" // Height 3: Partial accumulate: partial_8_0
"tbz x9, #3, 93f\n"
"ld1 { v8.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
"tbz x9, #2, 91f\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
"tbz x9, #1, 90f\n"
"ldr d11, [x27], #0x8\n"
- "ldr d17, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
"mov x20, #0x38\n"
- "ldr d23, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
"tbz x9, #0, 97f\n"
"ld1 { v11.s }[2], [x27]\n"
- "ld1 { v17.s }[2], [x24]\n"
- "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
"b 97f\n"
"90:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x9, #0, 97f\n"
"ldr s11, [x27, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
- "ldr s23, [x23, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
"b 97f\n"
"91:" // Height 3: Partial accumulate: partial_2_8
"tbz x9, #1, 92f\n"
"ldr d10, [x27], #0x8\n"
- "ldr d16, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"mov x20, #0x28\n"
- "ldr d22, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
"tbz x9, #0, 97f\n"
"ld1 { v10.s }[2], [x27]\n"
- "ld1 { v16.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
"b 97f\n"
"92:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x9, #0, 97f\n"
"ldr s10, [x27, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
"b 97f\n"
"93:" // Height 3: Partial accumulate: partial_4_0
"tbz x9, #2, 95f\n"
"ld1 { v8.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
"tbz x9, #1, 94f\n"
"ldr d9, [x27], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
"mov x20, #0x18\n"
- "ldr d21, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
"tbz x9, #0, 97f\n"
"ld1 { v9.s }[2], [x27]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
"b 97f\n"
"94:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x9, #0, 97f\n"
"ldr s9, [x27, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s21, [x23, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
"b 97f\n"
"95:" // Height 3: Partial accumulate: partial_2_0
"tbz x9, #1, 96f\n"
"ldr d8, [x27], #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
"mov x20, #0x8\n"
- "ldr d20, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
"tbz x9, #0, 97f\n"
"ld1 { v8.s }[2], [x27]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
"b 97f\n"
"96:" // Height 3: Partial accumulate: partial_1_0
"ldr s8, [x27, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s20, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
"97:" // Height 3: Partial accumulate: Done
"sub x27, x27, x20\n"
"b 100f\n"
@@ -1201,18 +1199,18 @@ void a64_hybrid_fp32_mla_4x24 (
"ldr q11, [x27, #0x30]\n"
"ldr q12, [x27, #0x40]\n"
"ldr q13, [x27, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
- "ldr q20, [x23, #0x0]\n"
- "ldr q21, [x23, #0x10]\n"
- "ldr q22, [x23, #0x20]\n"
- "ldr q23, [x23, #0x30]\n"
- "ldr q24, [x23, #0x40]\n"
- "ldr q25, [x23, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x22, #0x40]\n"
+ "ldr q25, [x22, #0x50]\n"
"b 100f\n"
"99:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -1237,8 +1235,8 @@ void a64_hybrid_fp32_mla_4x24 (
"mov x26, #0x0\n"
"101:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 102f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1389,10 +1387,10 @@ void a64_hybrid_fp32_mla_4x24 (
"fmla v15.4s, v5.4s, v1.s[0]\n"
"fmla v21.4s, v5.4s, v2.s[0]\n"
"ldr q28, [x28, #0x50]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x25, x25, #0x4\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "sub x25, x25, #0x4\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"fmla v22.4s, v6.4s, v2.s[0]\n"
"ldr q27, [x28, #0x60]\n"
@@ -1486,20 +1484,20 @@ void a64_hybrid_fp32_mla_4x24 (
"sub x25, x25, #0x1\n"
"ldr s30, [x22], #0x4\n"
"ldr q27, [x28, #0x0]\n"
- "ldr q26, [x28, #0x10]\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
"fmla v8.4s, v27.4s, v0.s[0]\n"
"fmla v14.4s, v27.4s, v31.s[0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ "ldr q29, [x28, #0x20]\n"
"fmla v20.4s, v27.4s, v30.s[0]\n"
- "ldr q27, [x28, #0x40]\n"
"fmla v9.4s, v26.4s, v0.s[0]\n"
+ "ldr q28, [x28, #0x30]\n"
+ "ldr q27, [x28, #0x40]\n"
"fmla v15.4s, v26.4s, v31.s[0]\n"
"fmla v21.4s, v26.4s, v30.s[0]\n"
"ldr q26, [x28, #0x50]\n"
- "add x28, x28, #0x60\n"
"fmla v10.4s, v29.4s, v0.s[0]\n"
"fmla v16.4s, v29.4s, v31.s[0]\n"
+ "add x28, x28, #0x60\n"
"fmla v22.4s, v29.4s, v30.s[0]\n"
"fmla v11.4s, v28.4s, v0.s[0]\n"
"fmla v17.4s, v28.4s, v31.s[0]\n"
@@ -1517,15 +1515,15 @@ void a64_hybrid_fp32_mla_4x24 (
"cmp x26, x20\n"
"bne 101b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 109f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v27.4s }, [x21]\n"
"ld1r { v26.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v27.4s\n"
"fmin v9.4s, v9.4s, v27.4s\n"
@@ -1571,126 +1569,126 @@ void a64_hybrid_fp32_mla_4x24 (
"st1 { v9.4s }, [x27], #0x10\n"
"st1 { v10.4s }, [x27], #0x10\n"
"st1 { v11.4s }, [x27], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
"tbz x9, #2, 111f\n"
"st1 { v12.4s }, [x27], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x9, #1, 110f\n"
"str d13, [x27], #0x8\n"
- "str d19, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x9, #0, 121f\n"
"st1 { v13.s }[2], [x27]\n"
- "st1 { v19.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 121f\n"
"110:" // Height 3: Partial direct writeback: partial_1_20
"tbz x9, #0, 121f\n"
"str s13, [x27, #0x0]\n"
- "str s19, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 121f\n"
"111:" // Height 3: Partial direct writeback: partial_2_16
"tbz x9, #1, 112f\n"
"str d12, [x27], #0x8\n"
- "str d18, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #0, 121f\n"
"st1 { v12.s }[2], [x27]\n"
- "st1 { v18.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 121f\n"
"112:" // Height 3: Partial direct writeback: partial_1_16
"tbz x9, #0, 121f\n"
"str s12, [x27, #0x0]\n"
- "str s18, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"b 121f\n"
"113:" // Height 3: Partial direct writeback: partial_8_0
"tbz x9, #3, 117f\n"
"st1 { v8.4s }, [x27], #0x10\n"
"st1 { v9.4s }, [x27], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
"tbz x9, #2, 115f\n"
"st1 { v10.4s }, [x27], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
"tbz x9, #1, 114f\n"
"str d11, [x27], #0x8\n"
- "str d17, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
"tbz x9, #0, 121f\n"
"st1 { v11.s }[2], [x27]\n"
- "st1 { v17.s }[2], [x24]\n"
- "st1 { v23.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
"b 121f\n"
"114:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 121f\n"
"str s11, [x27, #0x0]\n"
- "str s17, [x24, #0x0]\n"
- "str s23, [x23, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
"b 121f\n"
"115:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 116f\n"
"str d10, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d22, [x23], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
"tbz x9, #0, 121f\n"
"st1 { v10.s }[2], [x27]\n"
- "st1 { v16.s }[2], [x24]\n"
- "st1 { v22.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
"b 121f\n"
"116:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 121f\n"
"str s10, [x27, #0x0]\n"
- "str s16, [x24, #0x0]\n"
- "str s22, [x23, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
"b 121f\n"
"117:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 119f\n"
"st1 { v8.4s }, [x27], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
"tbz x9, #1, 118f\n"
"str d9, [x27], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d21, [x23], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
"tbz x9, #0, 121f\n"
"st1 { v9.s }[2], [x27]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v21.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
"b 121f\n"
"118:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 121f\n"
"str s9, [x27, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s21, [x23, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
"b 121f\n"
"119:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 120f\n"
"str d8, [x27], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d20, [x23], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
"tbz x9, #0, 121f\n"
"st1 { v8.s }[2], [x27]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v20.s }[2], [x23]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
"b 121f\n"
"120:" // Height 3: Partial direct writeback: partial_1_0
"str s8, [x27, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s20, [x23, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
"121:" // Height 3: Partial direct writeback: Done
"b 123f\n"
"122:" // Height 3: Full writeback
@@ -1701,48 +1699,47 @@ void a64_hybrid_fp32_mla_4x24 (
"str q12, [x27, #0x40]\n"
"str q13, [x27, #0x50]\n"
"add x27, x27, #0x60\n"
- "str q14, [x24, #0x0]\n"
- "str q15, [x24, #0x10]\n"
- "str q16, [x24, #0x20]\n"
- "str q17, [x24, #0x30]\n"
- "str q18, [x24, #0x40]\n"
- "str q19, [x24, #0x50]\n"
- "str q20, [x23, #0x0]\n"
- "str q21, [x23, #0x10]\n"
- "str q22, [x23, #0x20]\n"
- "str q23, [x23, #0x30]\n"
- "str q24, [x23, #0x40]\n"
- "str q25, [x23, #0x50]\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
"123:" // Height 3: Writeback done
"subs x9, x9, #0x18\n"
"bgt 84b\n"
"b 166f\n"
"124:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x10\n"
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"125:" // Height 4: Column loop
"cbz x10, 126f\n"
"ldr q8, [x10, #0x0]\n"
"ldr q9, [x10, #0x10]\n"
- "ldr q10, [x10, #0x20]\n"
- "ldr q11, [x10, #0x30]\n"
- "ldr q12, [x10, #0x40]\n"
- "ldr q13, [x10, #0x50]\n"
- "add x10, x10, #0x60\n"
"mov v14.16b, v8.16b\n"
"mov v15.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
"mov v16.16b, v10.16b\n"
"mov v17.16b, v11.16b\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
"mov v18.16b, v12.16b\n"
"mov v19.16b, v13.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
+ "add x10, x10, #0x60\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
"mov v24.16b, v12.16b\n"
@@ -1757,175 +1754,175 @@ void a64_hybrid_fp32_mla_4x24 (
"126:" // Height 4: no bias
"tbz %x[flags], #0, 140f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x9, #0x18\n"
- "add x24, x27, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x27, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x9, #0x18\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 139f\n"
"tbz x9, #4, 130f\n"
"ld1 { v8.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
- "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v11.4s }, [x27], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x9, #2, 128f\n"
"ld1 { v12.4s }, [x27], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x9, #1, 127f\n"
"ldr d13, [x27], #0x8\n"
- "ldr d19, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"mov x20, #0x58\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x9, #0, 138f\n"
"ld1 { v13.s }[2], [x27]\n"
- "ld1 { v19.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 138f\n"
"127:" // Height 4: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x9, #0, 138f\n"
"ldr s13, [x27, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 138f\n"
"128:" // Height 4: Partial accumulate: partial_2_16
"tbz x9, #1, 129f\n"
"ldr d12, [x27], #0x8\n"
- "ldr d18, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"mov x20, #0x48\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x9, #0, 138f\n"
"ld1 { v12.s }[2], [x27]\n"
- "ld1 { v18.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 138f\n"
"129:" // Height 4: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x9, #0, 138f\n"
"ldr s12, [x27, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 138f\n"
"130:" // Height 4: Partial accumulate: partial_8_0
"tbz x9, #3, 134f\n"
"ld1 { v8.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
- "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
"tbz x9, #2, 132f\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x9, #1, 131f\n"
"ldr d11, [x27], #0x8\n"
- "ldr d17, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
"mov x20, #0x38\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x9, #0, 138f\n"
"ld1 { v11.s }[2], [x27]\n"
- "ld1 { v17.s }[2], [x24]\n"
- "ld1 { v23.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 138f\n"
"131:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x9, #0, 138f\n"
"ldr s11, [x27, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
- "ldr s23, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 138f\n"
"132:" // Height 4: Partial accumulate: partial_2_8
"tbz x9, #1, 133f\n"
"ldr d10, [x27], #0x8\n"
- "ldr d16, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"mov x20, #0x28\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x9, #0, 138f\n"
"ld1 { v10.s }[2], [x27]\n"
- "ld1 { v16.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 138f\n"
"133:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x9, #0, 138f\n"
"ldr s10, [x27, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"b 138f\n"
"134:" // Height 4: Partial accumulate: partial_4_0
"tbz x9, #2, 136f\n"
"ld1 { v8.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"tbz x9, #1, 135f\n"
"ldr d9, [x27], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
"mov x20, #0x18\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d27, [x22], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
"tbz x9, #0, 138f\n"
"ld1 { v9.s }[2], [x27]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v21.s }[2], [x23]\n"
- "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
"b 138f\n"
"135:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x9, #0, 138f\n"
"ldr s9, [x27, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s21, [x23, #0x0]\n"
- "ldr s27, [x22, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
"b 138f\n"
"136:" // Height 4: Partial accumulate: partial_2_0
"tbz x9, #1, 137f\n"
"ldr d8, [x27], #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
"mov x20, #0x8\n"
- "ldr d20, [x23], #0x8\n"
- "ldr d26, [x22], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
"tbz x9, #0, 138f\n"
"ld1 { v8.s }[2], [x27]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v20.s }[2], [x23]\n"
- "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
"b 138f\n"
"137:" // Height 4: Partial accumulate: partial_1_0
"ldr s8, [x27, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s20, [x23, #0x0]\n"
- "ldr s26, [x22, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
"138:" // Height 4: Partial accumulate: Done
"sub x27, x27, x20\n"
"b 141f\n"
@@ -1936,24 +1933,24 @@ void a64_hybrid_fp32_mla_4x24 (
"ldr q11, [x27, #0x30]\n"
"ldr q12, [x27, #0x40]\n"
"ldr q13, [x27, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
- "ldr q20, [x23, #0x0]\n"
- "ldr q21, [x23, #0x10]\n"
- "ldr q22, [x23, #0x20]\n"
- "ldr q23, [x23, #0x30]\n"
- "ldr q24, [x23, #0x40]\n"
- "ldr q25, [x23, #0x50]\n"
- "ldr q26, [x22, #0x0]\n"
- "ldr q27, [x22, #0x10]\n"
- "ldr q28, [x22, #0x20]\n"
- "ldr q29, [x22, #0x30]\n"
- "ldr q30, [x22, #0x40]\n"
- "ldr q31, [x22, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x22, #0x40]\n"
+ "ldr q25, [x22, #0x50]\n"
+ "ldr q26, [x21, #0x0]\n"
+ "ldr q27, [x21, #0x10]\n"
+ "ldr q28, [x21, #0x20]\n"
+ "ldr q29, [x21, #0x30]\n"
+ "ldr q30, [x21, #0x40]\n"
+ "ldr q31, [x21, #0x50]\n"
"b 141f\n"
"140:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -1984,8 +1981,8 @@ void a64_hybrid_fp32_mla_4x24 (
"mov x26, #0x0\n"
"142:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 143f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2292,10 +2289,10 @@ void a64_hybrid_fp32_mla_4x24 (
"ldr s4, [x21], #0x4\n"
"ldr q1, [x28, #0x0]\n"
"ldr q0, [x28, #0x10]\n"
- "ldr q3, [x28, #0x20]\n"
- "ldr q2, [x28, #0x30]\n"
"fmla v8.4s, v1.4s, v7.s[0]\n"
"fmla v14.4s, v1.4s, v6.s[0]\n"
+ "ldr q3, [x28, #0x20]\n"
+ "ldr q2, [x28, #0x30]\n"
"fmla v20.4s, v1.4s, v5.s[0]\n"
"fmla v26.4s, v1.4s, v4.s[0]\n"
"ldr q1, [x28, #0x40]\n"
@@ -2328,17 +2325,17 @@ void a64_hybrid_fp32_mla_4x24 (
"cmp x26, x20\n"
"bne 142b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x20, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 150f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v1.4s\n"
"fmin v9.4s, v9.4s, v1.4s\n"
@@ -2396,153 +2393,153 @@ void a64_hybrid_fp32_mla_4x24 (
"st1 { v9.4s }, [x27], #0x10\n"
"st1 { v10.4s }, [x27], #0x10\n"
"st1 { v11.4s }, [x27], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x22], #0x10\n"
- "st1 { v27.4s }, [x22], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
- "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
"tbz x9, #2, 152f\n"
"st1 { v12.4s }, [x27], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v30.4s }, [x22], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
"tbz x9, #1, 151f\n"
"str d13, [x27], #0x8\n"
- "str d19, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x9, #0, 162f\n"
"st1 { v13.s }[2], [x27]\n"
- "st1 { v19.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x22]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
"b 162f\n"
"151:" // Height 4: Partial direct writeback: partial_1_20
"tbz x9, #0, 162f\n"
"str s13, [x27, #0x0]\n"
- "str s19, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s31, [x22, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
"b 162f\n"
"152:" // Height 4: Partial direct writeback: partial_2_16
"tbz x9, #1, 153f\n"
"str d12, [x27], #0x8\n"
- "str d18, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x9, #0, 162f\n"
"st1 { v12.s }[2], [x27]\n"
- "st1 { v18.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
"b 162f\n"
"153:" // Height 4: Partial direct writeback: partial_1_16
"tbz x9, #0, 162f\n"
"str s12, [x27, #0x0]\n"
- "str s18, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "str s30, [x22, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
"b 162f\n"
"154:" // Height 4: Partial direct writeback: partial_8_0
"tbz x9, #3, 158f\n"
"st1 { v8.4s }, [x27], #0x10\n"
"st1 { v9.4s }, [x27], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x22], #0x10\n"
- "st1 { v27.4s }, [x22], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
"tbz x9, #2, 156f\n"
"st1 { v10.4s }, [x27], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
"tbz x9, #1, 155f\n"
"str d11, [x27], #0x8\n"
- "str d17, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x9, #0, 162f\n"
"st1 { v11.s }[2], [x27]\n"
- "st1 { v17.s }[2], [x24]\n"
- "st1 { v23.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x22]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
"b 162f\n"
"155:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 162f\n"
"str s11, [x27, #0x0]\n"
- "str s17, [x24, #0x0]\n"
- "str s23, [x23, #0x0]\n"
- "str s29, [x22, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
"b 162f\n"
"156:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 157f\n"
"str d10, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d22, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x9, #0, 162f\n"
"st1 { v10.s }[2], [x27]\n"
- "st1 { v16.s }[2], [x24]\n"
- "st1 { v22.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x22]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
"b 162f\n"
"157:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 162f\n"
"str s10, [x27, #0x0]\n"
- "str s16, [x24, #0x0]\n"
- "str s22, [x23, #0x0]\n"
- "str s28, [x22, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
"b 162f\n"
"158:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 160f\n"
"st1 { v8.4s }, [x27], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
"tbz x9, #1, 159f\n"
"str d9, [x27], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d21, [x23], #0x8\n"
- "str d27, [x22], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
"tbz x9, #0, 162f\n"
"st1 { v9.s }[2], [x27]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v21.s }[2], [x23]\n"
- "st1 { v27.s }[2], [x22]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
"b 162f\n"
"159:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 162f\n"
"str s9, [x27, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s21, [x23, #0x0]\n"
- "str s27, [x22, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
"b 162f\n"
"160:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 161f\n"
"str d8, [x27], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d20, [x23], #0x8\n"
- "str d26, [x22], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
"tbz x9, #0, 162f\n"
"st1 { v8.s }[2], [x27]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v20.s }[2], [x23]\n"
- "st1 { v26.s }[2], [x22]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
"b 162f\n"
"161:" // Height 4: Partial direct writeback: partial_1_0
"str s8, [x27, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s20, [x23, #0x0]\n"
- "str s26, [x22, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
"162:" // Height 4: Partial direct writeback: Done
"b 164f\n"
"163:" // Height 4: Full writeback
@@ -2553,24 +2550,24 @@ void a64_hybrid_fp32_mla_4x24 (
"str q12, [x27, #0x40]\n"
"str q13, [x27, #0x50]\n"
"add x27, x27, #0x60\n"
- "str q14, [x24, #0x0]\n"
- "str q15, [x24, #0x10]\n"
- "str q16, [x24, #0x20]\n"
- "str q17, [x24, #0x30]\n"
- "str q18, [x24, #0x40]\n"
- "str q19, [x24, #0x50]\n"
- "str q20, [x23, #0x0]\n"
- "str q21, [x23, #0x10]\n"
- "str q22, [x23, #0x20]\n"
- "str q23, [x23, #0x30]\n"
- "str q24, [x23, #0x40]\n"
- "str q25, [x23, #0x50]\n"
- "str q26, [x22, #0x0]\n"
- "str q27, [x22, #0x10]\n"
- "str q28, [x22, #0x20]\n"
- "str q29, [x22, #0x30]\n"
- "str q30, [x22, #0x40]\n"
- "str q31, [x22, #0x50]\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
+ "str q26, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q28, [x21, #0x20]\n"
+ "str q29, [x21, #0x30]\n"
+ "str q30, [x21, #0x40]\n"
+ "str q31, [x21, #0x50]\n"
"164:" // Height 4: Writeback done
"subs x9, x9, #0x18\n"
"bgt 125b\n"
@@ -2586,8 +2583,8 @@ void a64_hybrid_fp32_mla_4x24 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"166:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
index d47ab34e03..7f85d2dd42 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
@@ -71,8 +71,7 @@ public:
return true;
}
- StdTransformsFixedTRB<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 1> transforms = {};
-
+ StdTransformsFixedTRB<rhs_operand_type, result_type, 6, 16, 1> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
index a315a2fe4f..3428028ac8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
@@ -47,19 +47,18 @@ void a64_hybrid_fp32_mla_6x16_a55 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void a64_hybrid_fp32_mla_6x16_a55 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -103,10 +101,10 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"cmp %x[M], #0x2\n"
"bgt 67f\n"
"beq 34f\n"
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x7, 3f\n"
"ldr q8, [x7, #0x0]\n"
@@ -189,8 +187,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"mov x15, #0x0\n"
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -212,89 +210,86 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"18:" // Height 1: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
"ldr d17, [x17, #0x20]\n"
- "ldr x21, [x17, #0x28]\n"
+ "ldr x20, [x17, #0x28]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"ldr d16, [x17, #0x30]\n"
- "add x13, x13, #0x10\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x38]\n"
- "sub x14, x14, #0x4\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x48]\n"
- "ldr x22, [x13, #0x8]\n"
- "cmp x14, #0x8\n"
"mov v16.d[1], x20\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
"ldr d17, [x17, #0x40]\n"
+ "ldr x20, [x17, #0x48]\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
"ldr d16, [x17, #0x50]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x58]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x68]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
"mov v16.d[1], x20\n"
"fmla v8.4s, v17.4s, v0.s[1]\n"
"ldr d17, [x17, #0x60]\n"
+ "ldr x20, [x17, #0x68]\n"
"fmla v9.4s, v16.4s, v0.s[1]\n"
"ldr d16, [x17, #0x70]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x78]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x88]\n"
"mov v16.d[1], x20\n"
"fmla v10.4s, v17.4s, v0.s[1]\n"
"ldr d17, [x17, #0x80]\n"
+ "ldr x20, [x17, #0x88]\n"
"fmla v11.4s, v16.4s, v0.s[1]\n"
"ldr d16, [x17, #0x90]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0x98]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xa8]\n"
"mov v16.d[1], x20\n"
"fmla v8.4s, v17.4s, v0.s[2]\n"
"ldr d17, [x17, #0xa0]\n"
+ "ldr x20, [x17, #0xa8]\n"
"fmla v9.4s, v16.4s, v0.s[2]\n"
"ldr d16, [x17, #0xb0]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0xb8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xc8]\n"
"mov v16.d[1], x20\n"
"fmla v10.4s, v17.4s, v0.s[2]\n"
"ldr d17, [x17, #0xc0]\n"
+ "ldr x20, [x17, #0xc8]\n"
"fmla v11.4s, v16.4s, v0.s[2]\n"
"ldr d16, [x17, #0xd0]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0xd8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xe8]\n"
"mov v16.d[1], x20\n"
"fmla v8.4s, v17.4s, v0.s[3]\n"
"ldr d17, [x17, #0xe0]\n"
+ "ldr x20, [x17, #0xe8]\n"
"fmla v9.4s, v16.4s, v0.s[3]\n"
"ldr d16, [x17, #0xf0]\n"
+ "mov v17.d[1], x20\n"
"ldr x20, [x17, #0xf8]\n"
- "add x17, x17, #0x100\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x8]\n"
"mov v16.d[1], x20\n"
+ "add x13, x13, #0x10\n"
+ "add x17, x17, #0x100\n"
"fmla v10.4s, v17.4s, v0.s[3]\n"
"ldr d6, [x17, #0x0]\n"
+ "ldr x20, [x17, #0x8]\n"
"fmla v11.4s, v16.4s, v0.s[3]\n"
"ldr d0, [x13, #0x0]\n"
+ "sub x14, x14, #0x4\n"
"ldr d7, [x17, #0x10]\n"
+ "cmp x14, #0x8\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
"ldr x20, [x17, #0x18]\n"
- "mov v6.d[1], x21\n"
- "mov v0.d[1], x22\n"
+ "mov v0.d[1], x21\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
"ldr q17, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"ldr q16, [x17, #0x30]\n"
- "add x13, x13, #0x10\n"
- "sub x14, x14, #0x4\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
"ldr q17, [x17, #0x40]\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
"ldr q16, [x17, #0x50]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
"fmla v8.4s, v17.4s, v0.s[1]\n"
"ldr q17, [x17, #0x60]\n"
"fmla v9.4s, v16.4s, v0.s[1]\n"
@@ -315,23 +310,26 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr q17, [x17, #0xe0]\n"
"fmla v9.4s, v16.4s, v0.s[3]\n"
"ldr q16, [x17, #0xf0]\n"
- "add x17, x17, #0x100\n"
+ "add x13, x13, #0x10\n"
+ "sub x14, x14, #0x4\n"
"fmla v10.4s, v17.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v11.4s, v16.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n"
"20:" // Height 1: Multiply loop: Main loop skip
"cbz x14, 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
- "ldr s18, [x13], #0x4\n"
+ "ldr s17, [x13], #0x4\n"
"sub x14, x14, #0x1\n"
- "ldr q17, [x17, #0x0]\n"
+ "ldr q16, [x17, #0x0]\n"
+ "fmla v8.4s, v16.4s, v17.s[0]\n"
"ldr q16, [x17, #0x10]\n"
- "fmla v8.4s, v17.4s, v18.s[0]\n"
- "ldr q17, [x17, #0x20]\n"
- "fmla v9.4s, v16.4s, v18.s[0]\n"
+ "fmla v9.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x17, #0x20]\n"
+ "fmla v10.4s, v16.4s, v17.s[0]\n"
"ldr q16, [x17, #0x30]\n"
+ "fmla v11.4s, v16.4s, v17.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v10.4s, v17.4s, v18.s[0]\n"
- "fmla v11.4s, v16.4s, v18.s[0]\n"
"cbnz x14, 21b\n"
"22:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -340,14 +338,14 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"bne 15b\n"
"prfm pstl1keep, [x16, #0x0]\n"
"tbz %x[flags], #1, 23f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v16.4s\n"
+ "fmin v9.4s, v9.4s, v16.4s\n"
+ "fmin v10.4s, v10.4s, v16.4s\n"
+ "fmin v11.4s, v11.4s, v16.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v17.4s\n"
- "fmin v9.4s, v9.4s, v17.4s\n"
- "fmin v10.4s, v10.4s, v17.4s\n"
- "fmin v11.4s, v11.4s, v17.4s\n"
"fmax v8.4s, v8.4s, v16.4s\n"
"fmax v9.4s, v9.4s, v16.4s\n"
"fmax v10.4s, v10.4s, v16.4s\n"
@@ -412,96 +410,96 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"bgt 2b\n"
"b 200f\n"
"34:" // Height 2
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
"35:" // Height 2: Column loop
"cbz x7, 36f\n"
"ldr q8, [x7, #0x0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
"mov v12.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x7, x7, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
+ "add x7, x7, #0x40\n"
"b 47f\n"
"36:" // Height 2: no bias
"tbz %x[flags], #0, 46f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x8, #0x10\n"
- "add x26, x16, x20, LSL #2\n"
+ "add x25, x16, x20, LSL #2\n"
"bge 45f\n"
"tbz x8, #3, 40f\n"
"ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v9.4s }, [x16], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
"tbz x8, #2, 38f\n"
"ld1 { v10.4s }, [x16], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
"tbz x8, #1, 37f\n"
"ldr d11, [x16], #0x8\n"
"mov x20, #0x38\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"tbz x8, #0, 44f\n"
"ld1 { v11.s }[2], [x16]\n"
- "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x25]\n"
"b 44f\n"
"37:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x8, #0, 44f\n"
"ldr s11, [x16, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
"b 44f\n"
"38:" // Height 2: Partial accumulate: partial_2_8
"tbz x8, #1, 39f\n"
"ldr d10, [x16], #0x8\n"
"mov x20, #0x28\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"tbz x8, #0, 44f\n"
"ld1 { v10.s }[2], [x16]\n"
- "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x25]\n"
"b 44f\n"
"39:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x8, #0, 44f\n"
"ldr s10, [x16, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
"b 44f\n"
"40:" // Height 2: Partial accumulate: partial_4_0
"tbz x8, #2, 42f\n"
"ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"tbz x8, #1, 41f\n"
"ldr d9, [x16], #0x8\n"
"mov x20, #0x18\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"tbz x8, #0, 44f\n"
"ld1 { v9.s }[2], [x16]\n"
- "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v13.s }[2], [x25]\n"
"b 44f\n"
"41:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x8, #0, 44f\n"
"ldr s9, [x16, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
"b 44f\n"
"42:" // Height 2: Partial accumulate: partial_2_0
"tbz x8, #1, 43f\n"
"ldr d8, [x16], #0x8\n"
"mov x20, #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"tbz x8, #0, 44f\n"
"ld1 { v8.s }[2], [x16]\n"
- "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v12.s }[2], [x25]\n"
"b 44f\n"
"43:" // Height 2: Partial accumulate: partial_1_0
"ldr s8, [x16, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"44:" // Height 2: Partial accumulate: Done
"sub x16, x16, x20\n"
"b 47f\n"
@@ -510,10 +508,10 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"ldr q11, [x16, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"b 47f\n"
"46:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -528,8 +526,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"mov x15, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -554,98 +552,98 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr x20, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
"ldr d17, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr x21, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"ldr d16, [x17, #0x30]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x48]\n"
- "add x13, x13, #0x10\n"
- "add x12, x12, #0x10\n"
- "mov v16.d[1], x21\n"
+ "mov v17.d[1], x21\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
+ "mov v16.d[1], x20\n"
"fmla v14.4s, v17.4s, v1.s[0]\n"
"ldr d17, [x17, #0x40]\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
- "ldr x21, [x17, #0x58]\n"
+ "ldr x20, [x17, #0x48]\n"
"fmla v15.4s, v16.4s, v1.s[0]\n"
"ldr d16, [x17, #0x50]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x68]\n"
- "ldr x23, [x13, #0x8]\n"
- "sub x14, x14, #0x4\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x17, #0x58]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v12.4s, v17.4s, v1.s[1]\n"
"ldr d17, [x17, #0x60]\n"
"fmla v9.4s, v16.4s, v0.s[1]\n"
- "ldr x21, [x17, #0x78]\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v13.4s, v16.4s, v1.s[1]\n"
"ldr d16, [x17, #0x70]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x88]\n"
- "ldr x22, [x12, #0x8]\n"
- "cmp x14, #0x8\n"
- "mov v16.d[1], x21\n"
+ "mov v17.d[1], x21\n"
"fmla v10.4s, v17.4s, v0.s[1]\n"
+ "mov v16.d[1], x20\n"
"fmla v14.4s, v17.4s, v1.s[1]\n"
"ldr d17, [x17, #0x80]\n"
"fmla v11.4s, v16.4s, v0.s[1]\n"
- "ldr x21, [x17, #0x98]\n"
+ "ldr x20, [x17, #0x88]\n"
"fmla v15.4s, v16.4s, v1.s[1]\n"
"ldr d16, [x17, #0x90]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xa8]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x17, #0x98]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v12.4s, v17.4s, v1.s[2]\n"
"ldr d17, [x17, #0xa0]\n"
"fmla v9.4s, v16.4s, v0.s[2]\n"
- "ldr x21, [x17, #0xb8]\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v13.4s, v16.4s, v1.s[2]\n"
"ldr d16, [x17, #0xb0]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xc8]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v16.d[1], x21\n"
+ "mov v17.d[1], x21\n"
"fmla v10.4s, v17.4s, v0.s[2]\n"
+ "mov v16.d[1], x20\n"
"fmla v14.4s, v17.4s, v1.s[2]\n"
"ldr d17, [x17, #0xc0]\n"
"fmla v11.4s, v16.4s, v0.s[2]\n"
- "ldr x21, [x17, #0xd8]\n"
+ "ldr x20, [x17, #0xc8]\n"
"fmla v15.4s, v16.4s, v1.s[2]\n"
"ldr d16, [x17, #0xd0]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xe8]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "mov v16.d[1], x20\n"
"fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v12.4s, v17.4s, v1.s[3]\n"
"ldr d17, [x17, #0xe0]\n"
"fmla v9.4s, v16.4s, v0.s[3]\n"
- "ldr x21, [x17, #0xf8]\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v13.4s, v16.4s, v1.s[3]\n"
"ldr d16, [x17, #0xf0]\n"
- "mov v17.d[1], x20\n"
+ "mov v17.d[1], x21\n"
+ "add x13, x13, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x17, x17, #0x100\n"
- "ldr x20, [x17, #0x8]\n"
- "mov v16.d[1], x21\n"
"fmla v10.4s, v17.4s, v0.s[3]\n"
"fmla v14.4s, v17.4s, v1.s[3]\n"
"ldr d6, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
"fmla v11.4s, v16.4s, v0.s[3]\n"
"ldr d0, [x13, #0x0]\n"
"fmla v15.4s, v16.4s, v1.s[3]\n"
"ldr d1, [x12, #0x0]\n"
+ "sub x14, x14, #0x4\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x20\n"
+ "cmp x14, #0x8\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
"ldr x20, [x17, #0x18]\n"
- "mov v0.d[1], x23\n"
- "mov v1.d[1], x22\n"
+ "mov v1.d[1], x21\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
@@ -707,8 +705,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"sub x14, x14, #0x1\n"
"ldr s18, [x12], #0x4\n"
"ldr q17, [x17, #0x0]\n"
- "ldr q16, [x17, #0x10]\n"
"fmla v8.4s, v17.4s, v19.s[0]\n"
+ "ldr q16, [x17, #0x10]\n"
"fmla v12.4s, v17.4s, v18.s[0]\n"
"ldr q17, [x17, #0x20]\n"
"fmla v9.4s, v16.4s, v19.s[0]\n"
@@ -726,22 +724,22 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"cmp x15, x20\n"
"bne 48b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
"prfm pstl1keep, [x16, #0x0]\n"
- "add x26, x16, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 56f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v16.4s\n"
+ "fmin v9.4s, v9.4s, v16.4s\n"
+ "fmin v10.4s, v10.4s, v16.4s\n"
+ "fmin v11.4s, v11.4s, v16.4s\n"
+ "fmin v12.4s, v12.4s, v16.4s\n"
+ "fmin v13.4s, v13.4s, v16.4s\n"
+ "fmin v14.4s, v14.4s, v16.4s\n"
+ "fmin v15.4s, v15.4s, v16.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v17.4s\n"
- "fmin v9.4s, v9.4s, v17.4s\n"
- "fmin v10.4s, v10.4s, v17.4s\n"
- "fmin v11.4s, v11.4s, v17.4s\n"
- "fmin v12.4s, v12.4s, v17.4s\n"
- "fmin v13.4s, v13.4s, v17.4s\n"
- "fmin v14.4s, v14.4s, v17.4s\n"
- "fmin v15.4s, v15.4s, v17.4s\n"
"fmax v8.4s, v8.4s, v16.4s\n"
"fmax v9.4s, v9.4s, v16.4s\n"
"fmax v10.4s, v10.4s, v16.4s\n"
@@ -756,63 +754,63 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"tbz x8, #3, 60f\n"
"st1 { v8.4s }, [x16], #0x10\n"
"st1 { v9.4s }, [x16], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
"tbz x8, #2, 58f\n"
"st1 { v10.4s }, [x16], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
"tbz x8, #1, 57f\n"
"str d11, [x16], #0x8\n"
- "str d15, [x26], #0x8\n"
+ "str d15, [x25], #0x8\n"
"tbz x8, #0, 64f\n"
"st1 { v11.s }[2], [x16]\n"
- "st1 { v15.s }[2], [x26]\n"
+ "st1 { v15.s }[2], [x25]\n"
"b 64f\n"
"57:" // Height 2: Partial direct writeback: partial_1_12
"tbz x8, #0, 64f\n"
"str s11, [x16, #0x0]\n"
- "str s15, [x26, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
"b 64f\n"
"58:" // Height 2: Partial direct writeback: partial_2_8
"tbz x8, #1, 59f\n"
"str d10, [x16], #0x8\n"
- "str d14, [x26], #0x8\n"
+ "str d14, [x25], #0x8\n"
"tbz x8, #0, 64f\n"
"st1 { v10.s }[2], [x16]\n"
- "st1 { v14.s }[2], [x26]\n"
+ "st1 { v14.s }[2], [x25]\n"
"b 64f\n"
"59:" // Height 2: Partial direct writeback: partial_1_8
"tbz x8, #0, 64f\n"
"str s10, [x16, #0x0]\n"
- "str s14, [x26, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
"b 64f\n"
"60:" // Height 2: Partial direct writeback: partial_4_0
"tbz x8, #2, 62f\n"
"st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
"tbz x8, #1, 61f\n"
"str d9, [x16], #0x8\n"
- "str d13, [x26], #0x8\n"
+ "str d13, [x25], #0x8\n"
"tbz x8, #0, 64f\n"
"st1 { v9.s }[2], [x16]\n"
- "st1 { v13.s }[2], [x26]\n"
+ "st1 { v13.s }[2], [x25]\n"
"b 64f\n"
"61:" // Height 2: Partial direct writeback: partial_1_4
"tbz x8, #0, 64f\n"
"str s9, [x16, #0x0]\n"
- "str s13, [x26, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
"b 64f\n"
"62:" // Height 2: Partial direct writeback: partial_2_0
"tbz x8, #1, 63f\n"
"str d8, [x16], #0x8\n"
- "str d12, [x26], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x8, #0, 64f\n"
"st1 { v8.s }[2], [x16]\n"
- "st1 { v12.s }[2], [x26]\n"
+ "st1 { v12.s }[2], [x25]\n"
"b 64f\n"
"63:" // Height 2: Partial direct writeback: partial_1_0
"str s8, [x16, #0x0]\n"
- "str s12, [x26, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
"64:" // Height 2: Partial direct writeback: Done
"b 66f\n"
"65:" // Height 2: Full writeback
@@ -821,31 +819,31 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"str q10, [x16, #0x20]\n"
"str q11, [x16, #0x30]\n"
"add x16, x16, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
"66:" // Height 2: Writeback done
"subs x8, x8, #0x10\n"
"bgt 35b\n"
"b 200f\n"
"67:" // Height 3
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
"68:" // Height 3: Column loop
"cbz x7, 69f\n"
"ldr q8, [x7, #0x0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
"mov v12.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x7, x7, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
"mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
@@ -853,94 +851,94 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"69:" // Height 3: no bias
"tbz %x[flags], #0, 79f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
"cmp x8, #0x10\n"
- "add x26, x16, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 78f\n"
"tbz x8, #3, 73f\n"
"ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
"ld1 { v9.4s }, [x16], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"tbz x8, #2, 71f\n"
"ld1 { v10.4s }, [x16], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
"tbz x8, #1, 70f\n"
"ldr d11, [x16], #0x8\n"
"mov x20, #0x38\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x8, #0, 77f\n"
"ld1 { v11.s }[2], [x16]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
"b 77f\n"
"70:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x8, #0, 77f\n"
"ldr s11, [x16, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
"b 77f\n"
"71:" // Height 3: Partial accumulate: partial_2_8
"tbz x8, #1, 72f\n"
"ldr d10, [x16], #0x8\n"
"mov x20, #0x28\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x8, #0, 77f\n"
"ld1 { v10.s }[2], [x16]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
"b 77f\n"
"72:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x8, #0, 77f\n"
"ldr s10, [x16, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
"b 77f\n"
"73:" // Height 3: Partial accumulate: partial_4_0
"tbz x8, #2, 75f\n"
"ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
"tbz x8, #1, 74f\n"
"ldr d9, [x16], #0x8\n"
"mov x20, #0x18\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x8, #0, 77f\n"
"ld1 { v9.s }[2], [x16]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 77f\n"
"74:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x8, #0, 77f\n"
"ldr s9, [x16, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
"b 77f\n"
"75:" // Height 3: Partial accumulate: partial_2_0
"tbz x8, #1, 76f\n"
"ldr d8, [x16], #0x8\n"
"mov x20, #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
"tbz x8, #0, 77f\n"
"ld1 { v8.s }[2], [x16]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
"b 77f\n"
"76:" // Height 3: Partial accumulate: partial_1_0
"ldr s8, [x16, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s12, [x26, #0x0]\n"
- "ldr s16, [x25, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
"77:" // Height 3: Partial accumulate: Done
"sub x16, x16, x20\n"
"b 80f\n"
@@ -949,14 +947,14 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"ldr q11, [x16, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
"b 80f\n"
"79:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -975,8 +973,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"mov x15, #0x0\n"
"81:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 82f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1011,15 +1009,11 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v16.4s, v6.4s, v2.s[0]\n"
"ldr d21, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x13, x13, #0x10\n"
+ "mov v21.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x12, x12, #0x10\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"ldr d20, [x17, #0x30]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x48]\n"
- "add x11, x11, #0x10\n"
- "ldr x24, [x13, #0x8]\n"
"mov v20.d[1], x20\n"
"fmla v10.4s, v21.4s, v0.s[0]\n"
"fmla v14.4s, v21.4s, v1.s[0]\n"
@@ -1027,15 +1021,11 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v18.4s, v21.4s, v2.s[0]\n"
"ldr d21, [x17, #0x40]\n"
"fmla v11.4s, v20.4s, v0.s[0]\n"
- "ldr x23, [x12, #0x8]\n"
+ "mov v21.d[1], x21\n"
"fmla v15.4s, v20.4s, v1.s[0]\n"
- "ldr x22, [x11, #0x8]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.4s, v20.4s, v2.s[0]\n"
"ldr d20, [x17, #0x50]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x68]\n"
- "sub x14, x14, #0x4\n"
- "prfm pldl1keep, [x13, #0x80]\n"
"mov v20.d[1], x20\n"
"fmla v8.4s, v21.4s, v0.s[1]\n"
"fmla v12.4s, v21.4s, v1.s[1]\n"
@@ -1043,14 +1033,11 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v16.4s, v21.4s, v2.s[1]\n"
"ldr d21, [x17, #0x60]\n"
"fmla v9.4s, v20.4s, v0.s[1]\n"
- "cmp x14, #0x8\n"
+ "mov v21.d[1], x21\n"
"fmla v13.4s, v20.4s, v1.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.4s, v20.4s, v2.s[1]\n"
"ldr d20, [x17, #0x70]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x88]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
"mov v20.d[1], x20\n"
"fmla v10.4s, v21.4s, v0.s[1]\n"
"fmla v14.4s, v21.4s, v1.s[1]\n"
@@ -1058,11 +1045,11 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v18.4s, v21.4s, v2.s[1]\n"
"ldr d21, [x17, #0x80]\n"
"fmla v11.4s, v20.4s, v0.s[1]\n"
+ "mov v21.d[1], x21\n"
"fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.4s, v20.4s, v2.s[1]\n"
"ldr d20, [x17, #0x90]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xa8]\n"
"mov v20.d[1], x20\n"
"fmla v8.4s, v21.4s, v0.s[2]\n"
"fmla v12.4s, v21.4s, v1.s[2]\n"
@@ -1070,11 +1057,11 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v16.4s, v21.4s, v2.s[2]\n"
"ldr d21, [x17, #0xa0]\n"
"fmla v9.4s, v20.4s, v0.s[2]\n"
+ "mov v21.d[1], x21\n"
"fmla v13.4s, v20.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.4s, v20.4s, v2.s[2]\n"
"ldr d20, [x17, #0xb0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xc8]\n"
"mov v20.d[1], x20\n"
"fmla v10.4s, v21.4s, v0.s[2]\n"
"fmla v14.4s, v21.4s, v1.s[2]\n"
@@ -1082,11 +1069,11 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v18.4s, v21.4s, v2.s[2]\n"
"ldr d21, [x17, #0xc0]\n"
"fmla v11.4s, v20.4s, v0.s[2]\n"
+ "mov v21.d[1], x21\n"
"fmla v15.4s, v20.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.4s, v20.4s, v2.s[2]\n"
"ldr d20, [x17, #0xd0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xe8]\n"
"mov v20.d[1], x20\n"
"fmla v8.4s, v21.4s, v0.s[3]\n"
"fmla v12.4s, v21.4s, v1.s[3]\n"
@@ -1094,29 +1081,40 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v16.4s, v21.4s, v2.s[3]\n"
"ldr d21, [x17, #0xe0]\n"
"fmla v9.4s, v20.4s, v0.s[3]\n"
+ "mov v21.d[1], x21\n"
"fmla v13.4s, v20.4s, v1.s[3]\n"
+ "add x13, x13, #0x10\n"
"fmla v17.4s, v20.4s, v2.s[3]\n"
"ldr d20, [x17, #0xf0]\n"
- "mov v21.d[1], x21\n"
- "add x17, x17, #0x100\n"
- "ldr x21, [x17, #0x8]\n"
"mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x17, x17, #0x100\n"
"fmla v10.4s, v21.4s, v0.s[3]\n"
+ "ldr x20, [x17, #0x8]\n"
"fmla v14.4s, v21.4s, v1.s[3]\n"
- "ldr x20, [x17, #0x18]\n"
+ "ldr x23, [x13, #0x8]\n"
"fmla v18.4s, v21.4s, v2.s[3]\n"
"ldr d6, [x17, #0x0]\n"
"fmla v11.4s, v20.4s, v0.s[3]\n"
"ldr d0, [x13, #0x0]\n"
"fmla v15.4s, v20.4s, v1.s[3]\n"
"ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
"fmla v19.4s, v20.4s, v2.s[3]\n"
"ldr d2, [x11, #0x0]\n"
+ "sub x14, x14, #0x4\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x21\n"
- "mov v0.d[1], x24\n"
- "mov v1.d[1], x23\n"
- "mov v2.d[1], x22\n"
+ "cmp x14, #0x8\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v0.d[1], x23\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"mov v7.d[1], x20\n"
"bge 84b\n"
"85:" // Height 3: Multiply loop: Single iteration only
@@ -1198,8 +1196,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr s23, [x12], #0x4\n"
"ldr s22, [x11], #0x4\n"
"ldr q21, [x17, #0x0]\n"
- "ldr q20, [x17, #0x10]\n"
"fmla v8.4s, v21.4s, v24.s[0]\n"
+ "ldr q20, [x17, #0x10]\n"
"fmla v12.4s, v21.4s, v23.s[0]\n"
"fmla v16.4s, v21.4s, v22.s[0]\n"
"ldr q21, [x17, #0x20]\n"
@@ -1221,28 +1219,28 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"cmp x15, x20\n"
"bne 81b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x16, #0x0]\n"
- "add x26, x16, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 89f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v20.4s\n"
+ "fmin v9.4s, v9.4s, v20.4s\n"
+ "fmin v10.4s, v10.4s, v20.4s\n"
+ "fmin v11.4s, v11.4s, v20.4s\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
+ "fmin v15.4s, v15.4s, v20.4s\n"
+ "fmin v16.4s, v16.4s, v20.4s\n"
+ "fmin v17.4s, v17.4s, v20.4s\n"
+ "fmin v18.4s, v18.4s, v20.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v21.4s }, [x21]\n"
"ld1r { v20.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v21.4s\n"
- "fmin v9.4s, v9.4s, v21.4s\n"
- "fmin v10.4s, v10.4s, v21.4s\n"
- "fmin v11.4s, v11.4s, v21.4s\n"
- "fmin v12.4s, v12.4s, v21.4s\n"
- "fmin v13.4s, v13.4s, v21.4s\n"
- "fmin v14.4s, v14.4s, v21.4s\n"
- "fmin v15.4s, v15.4s, v21.4s\n"
- "fmin v16.4s, v16.4s, v21.4s\n"
- "fmin v17.4s, v17.4s, v21.4s\n"
- "fmin v18.4s, v18.4s, v21.4s\n"
- "fmin v19.4s, v19.4s, v21.4s\n"
"fmax v8.4s, v8.4s, v20.4s\n"
"fmax v9.4s, v9.4s, v20.4s\n"
"fmax v10.4s, v10.4s, v20.4s\n"
@@ -1261,79 +1259,79 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"tbz x8, #3, 93f\n"
"st1 { v8.4s }, [x16], #0x10\n"
"st1 { v9.4s }, [x16], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
"tbz x8, #2, 91f\n"
"st1 { v10.4s }, [x16], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
"tbz x8, #1, 90f\n"
"str d11, [x16], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x8, #0, 97f\n"
"st1 { v11.s }[2], [x16]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
"b 97f\n"
"90:" // Height 3: Partial direct writeback: partial_1_12
"tbz x8, #0, 97f\n"
"str s11, [x16, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
"b 97f\n"
"91:" // Height 3: Partial direct writeback: partial_2_8
"tbz x8, #1, 92f\n"
"str d10, [x16], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x8, #0, 97f\n"
"st1 { v10.s }[2], [x16]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
"b 97f\n"
"92:" // Height 3: Partial direct writeback: partial_1_8
"tbz x8, #0, 97f\n"
"str s10, [x16, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
"b 97f\n"
"93:" // Height 3: Partial direct writeback: partial_4_0
"tbz x8, #2, 95f\n"
"st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
"tbz x8, #1, 94f\n"
"str d9, [x16], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x8, #0, 97f\n"
"st1 { v9.s }[2], [x16]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
"b 97f\n"
"94:" // Height 3: Partial direct writeback: partial_1_4
"tbz x8, #0, 97f\n"
"str s9, [x16, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
"b 97f\n"
"95:" // Height 3: Partial direct writeback: partial_2_0
"tbz x8, #1, 96f\n"
"str d8, [x16], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x8, #0, 97f\n"
"st1 { v8.s }[2], [x16]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
"b 97f\n"
"96:" // Height 3: Partial direct writeback: partial_1_0
"str s8, [x16, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
"97:" // Height 3: Partial direct writeback: Done
"b 99f\n"
"98:" // Height 3: Full writeback
@@ -1342,35 +1340,35 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"str q10, [x16, #0x20]\n"
"str q11, [x16, #0x30]\n"
"add x16, x16, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"99:" // Height 3: Writeback done
"subs x8, x8, #0x10\n"
"bgt 68b\n"
"b 200f\n"
"100:" // Height 4
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
"101:" // Height 4: Column loop
"cbz x7, 102f\n"
"ldr q8, [x7, #0x0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
"mov v12.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x7, x7, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
"mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
@@ -1382,111 +1380,111 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"102:" // Height 4: no bias
"tbz %x[flags], #0, 112f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "add x26, x16, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x16, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 111f\n"
"tbz x8, #3, 106f\n"
"ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v9.4s }, [x16], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
"tbz x8, #2, 104f\n"
"ld1 { v10.4s }, [x16], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
"tbz x8, #1, 103f\n"
"ldr d11, [x16], #0x8\n"
"mov x20, #0x38\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x8, #0, 110f\n"
"ld1 { v11.s }[2], [x16]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
"b 110f\n"
"103:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x8, #0, 110f\n"
"ldr s11, [x16, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
"b 110f\n"
"104:" // Height 4: Partial accumulate: partial_2_8
"tbz x8, #1, 105f\n"
"ldr d10, [x16], #0x8\n"
"mov x20, #0x28\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x8, #0, 110f\n"
"ld1 { v10.s }[2], [x16]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
"b 110f\n"
"105:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x8, #0, 110f\n"
"ldr s10, [x16, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
"b 110f\n"
"106:" // Height 4: Partial accumulate: partial_4_0
"tbz x8, #2, 108f\n"
"ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"tbz x8, #1, 107f\n"
"ldr d9, [x16], #0x8\n"
"mov x20, #0x18\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x8, #0, 110f\n"
"ld1 { v9.s }[2], [x16]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
"b 110f\n"
"107:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x8, #0, 110f\n"
"ldr s9, [x16, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
"b 110f\n"
"108:" // Height 4: Partial accumulate: partial_2_0
"tbz x8, #1, 109f\n"
"ldr d8, [x16], #0x8\n"
"mov x20, #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x8, #0, 110f\n"
"ld1 { v8.s }[2], [x16]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
"b 110f\n"
"109:" // Height 4: Partial accumulate: partial_1_0
"ldr s8, [x16, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s12, [x26, #0x0]\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
"110:" // Height 4: Partial accumulate: Done
"sub x16, x16, x20\n"
"b 113f\n"
@@ -1495,18 +1493,18 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"ldr q11, [x16, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"b 113f\n"
"112:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -1529,8 +1527,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"mov x15, #0x0\n"
"114:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1563,121 +1561,122 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"blt 118f\n"
"117:" // Height 4: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr x20, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr x21, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"add x13, x13, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
"ldr d25, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x12, x12, #0x10\n"
+ "mov v25.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x11, x11, #0x10\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "mov v25.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"ldr d24, [x17, #0x30]\n"
+ "mov v24.d[1], x20\n"
"fmla v10.4s, v25.4s, v0.s[0]\n"
- "ldr x20, [x17, #0x48]\n"
"fmla v14.4s, v25.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x58]\n"
"fmla v18.4s, v25.4s, v2.s[0]\n"
+ "add x11, x11, #0x10\n"
"fmla v22.4s, v25.4s, v3.s[0]\n"
"ldr d25, [x17, #0x40]\n"
"fmla v11.4s, v24.4s, v0.s[0]\n"
- "ldr x21, [x17, #0x58]\n"
+ "mov v25.d[1], x21\n"
"fmla v15.4s, v24.4s, v1.s[0]\n"
- "ldr x25, [x13, #0x8]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.4s, v24.4s, v2.s[0]\n"
- "mov v25.d[1], x20\n"
+ "add x10, x10, #0x10\n"
"fmla v23.4s, v24.4s, v3.s[0]\n"
"ldr d24, [x17, #0x50]\n"
+ "mov v24.d[1], x20\n"
"fmla v8.4s, v25.4s, v0.s[1]\n"
- "ldr x20, [x17, #0x68]\n"
"fmla v12.4s, v25.4s, v1.s[1]\n"
- "ldr x24, [x12, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v16.4s, v25.4s, v2.s[1]\n"
+ "ldr x25, [x13, #0x8]\n"
"fmla v20.4s, v25.4s, v3.s[1]\n"
"ldr d25, [x17, #0x60]\n"
"fmla v9.4s, v24.4s, v0.s[1]\n"
- "ldr x21, [x17, #0x78]\n"
+ "mov v25.d[1], x21\n"
"fmla v13.4s, v24.4s, v1.s[1]\n"
- "ldr x23, [x11, #0x8]\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.4s, v24.4s, v2.s[1]\n"
- "mov v25.d[1], x20\n"
+ "ldr x24, [x12, #0x8]\n"
"fmla v21.4s, v24.4s, v3.s[1]\n"
"ldr d24, [x17, #0x70]\n"
+ "mov v24.d[1], x20\n"
"fmla v10.4s, v25.4s, v0.s[1]\n"
- "ldr x20, [x17, #0x88]\n"
"fmla v14.4s, v25.4s, v1.s[1]\n"
- "ldr x22, [x10, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x98]\n"
"fmla v18.4s, v25.4s, v2.s[1]\n"
+ "ldr x23, [x11, #0x8]\n"
"fmla v22.4s, v25.4s, v3.s[1]\n"
"ldr d25, [x17, #0x80]\n"
"fmla v11.4s, v24.4s, v0.s[1]\n"
- "ldr x21, [x17, #0x98]\n"
+ "mov v25.d[1], x21\n"
"fmla v15.4s, v24.4s, v1.s[1]\n"
- "sub x14, x14, #0x4\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.4s, v24.4s, v2.s[1]\n"
- "mov v25.d[1], x20\n"
+ "ldr x22, [x10, #0x8]\n"
"fmla v23.4s, v24.4s, v3.s[1]\n"
"ldr d24, [x17, #0x90]\n"
+ "mov v24.d[1], x20\n"
"fmla v8.4s, v25.4s, v0.s[2]\n"
- "ldr x20, [x17, #0xa8]\n"
"fmla v12.4s, v25.4s, v1.s[2]\n"
- "cmp x14, #0x8\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v16.4s, v25.4s, v2.s[2]\n"
+ "sub x14, x14, #0x4\n"
"fmla v20.4s, v25.4s, v3.s[2]\n"
"ldr d25, [x17, #0xa0]\n"
"fmla v9.4s, v24.4s, v0.s[2]\n"
- "ldr x21, [x17, #0xb8]\n"
+ "mov v25.d[1], x21\n"
"fmla v13.4s, v24.4s, v1.s[2]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.4s, v24.4s, v2.s[2]\n"
- "mov v25.d[1], x20\n"
+ "cmp x14, #0x8\n"
"fmla v21.4s, v24.4s, v3.s[2]\n"
"ldr d24, [x17, #0xb0]\n"
+ "mov v24.d[1], x20\n"
"fmla v10.4s, v25.4s, v0.s[2]\n"
- "ldr x20, [x17, #0xc8]\n"
"fmla v14.4s, v25.4s, v1.s[2]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0xd8]\n"
"fmla v18.4s, v25.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v22.4s, v25.4s, v3.s[2]\n"
"ldr d25, [x17, #0xc0]\n"
"fmla v11.4s, v24.4s, v0.s[2]\n"
- "ldr x21, [x17, #0xd8]\n"
+ "mov v25.d[1], x21\n"
"fmla v15.4s, v24.4s, v1.s[2]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.4s, v24.4s, v2.s[2]\n"
- "mov v25.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v23.4s, v24.4s, v3.s[2]\n"
"ldr d24, [x17, #0xd0]\n"
+ "mov v24.d[1], x20\n"
"fmla v8.4s, v25.4s, v0.s[3]\n"
- "ldr x20, [x17, #0xe8]\n"
"fmla v12.4s, v25.4s, v1.s[3]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v16.4s, v25.4s, v2.s[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v20.4s, v25.4s, v3.s[3]\n"
"ldr d25, [x17, #0xe0]\n"
"fmla v9.4s, v24.4s, v0.s[3]\n"
- "ldr x21, [x17, #0xf8]\n"
+ "mov v25.d[1], x21\n"
"fmla v13.4s, v24.4s, v1.s[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v17.4s, v24.4s, v2.s[3]\n"
- "mov v25.d[1], x20\n"
"fmla v21.4s, v24.4s, v3.s[3]\n"
"ldr d24, [x17, #0xf0]\n"
+ "mov v24.d[1], x20\n"
"add x17, x17, #0x100\n"
"fmla v10.4s, v25.4s, v0.s[3]\n"
+ "ldr x21, [x17, #0x8]\n"
"fmla v14.4s, v25.4s, v1.s[3]\n"
- "ldr x20, [x17, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x17, #0x18]\n"
"fmla v18.4s, v25.4s, v2.s[3]\n"
"fmla v22.4s, v25.4s, v3.s[3]\n"
"ldr d6, [x17, #0x0]\n"
@@ -1690,8 +1689,7 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v23.4s, v24.4s, v3.s[3]\n"
"ldr d3, [x10, #0x0]\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x20\n"
- "ldr x20, [x17, #0x18]\n"
+ "mov v6.d[1], x21\n"
"mov v0.d[1], x25\n"
"mov v1.d[1], x24\n"
"mov v2.d[1], x23\n"
@@ -1796,8 +1794,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr s27, [x11], #0x4\n"
"ldr s26, [x10], #0x4\n"
"ldr q25, [x17, #0x0]\n"
- "ldr q24, [x17, #0x10]\n"
"fmla v8.4s, v25.4s, v29.s[0]\n"
+ "ldr q24, [x17, #0x10]\n"
"fmla v12.4s, v25.4s, v28.s[0]\n"
"fmla v16.4s, v25.4s, v27.s[0]\n"
"fmla v20.4s, v25.4s, v26.s[0]\n"
@@ -1823,34 +1821,34 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"cmp x15, x20\n"
"bne 114b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x26, x16, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x16, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v24.4s\n"
+ "fmin v9.4s, v9.4s, v24.4s\n"
+ "fmin v10.4s, v10.4s, v24.4s\n"
+ "fmin v11.4s, v11.4s, v24.4s\n"
+ "fmin v12.4s, v12.4s, v24.4s\n"
+ "fmin v13.4s, v13.4s, v24.4s\n"
+ "fmin v14.4s, v14.4s, v24.4s\n"
+ "fmin v15.4s, v15.4s, v24.4s\n"
+ "fmin v16.4s, v16.4s, v24.4s\n"
+ "fmin v17.4s, v17.4s, v24.4s\n"
+ "fmin v18.4s, v18.4s, v24.4s\n"
+ "fmin v19.4s, v19.4s, v24.4s\n"
+ "fmin v20.4s, v20.4s, v24.4s\n"
+ "fmin v21.4s, v21.4s, v24.4s\n"
+ "fmin v22.4s, v22.4s, v24.4s\n"
+ "fmin v23.4s, v23.4s, v24.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v25.4s }, [x21]\n"
"ld1r { v24.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v25.4s\n"
- "fmin v9.4s, v9.4s, v25.4s\n"
- "fmin v10.4s, v10.4s, v25.4s\n"
- "fmin v11.4s, v11.4s, v25.4s\n"
- "fmin v12.4s, v12.4s, v25.4s\n"
- "fmin v13.4s, v13.4s, v25.4s\n"
- "fmin v14.4s, v14.4s, v25.4s\n"
- "fmin v15.4s, v15.4s, v25.4s\n"
- "fmin v16.4s, v16.4s, v25.4s\n"
- "fmin v17.4s, v17.4s, v25.4s\n"
- "fmin v18.4s, v18.4s, v25.4s\n"
- "fmin v19.4s, v19.4s, v25.4s\n"
- "fmin v20.4s, v20.4s, v25.4s\n"
- "fmin v21.4s, v21.4s, v25.4s\n"
- "fmin v22.4s, v22.4s, v25.4s\n"
- "fmin v23.4s, v23.4s, v25.4s\n"
"fmax v8.4s, v8.4s, v24.4s\n"
"fmax v9.4s, v9.4s, v24.4s\n"
"fmax v10.4s, v10.4s, v24.4s\n"
@@ -1873,95 +1871,95 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"tbz x8, #3, 126f\n"
"st1 { v8.4s }, [x16], #0x10\n"
"st1 { v9.4s }, [x16], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
"tbz x8, #2, 124f\n"
"st1 { v10.4s }, [x16], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
"tbz x8, #1, 123f\n"
"str d11, [x16], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
"tbz x8, #0, 130f\n"
"st1 { v11.s }[2], [x16]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
"b 130f\n"
"123:" // Height 4: Partial direct writeback: partial_1_12
"tbz x8, #0, 130f\n"
"str s11, [x16, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
"b 130f\n"
"124:" // Height 4: Partial direct writeback: partial_2_8
"tbz x8, #1, 125f\n"
"str d10, [x16], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
"tbz x8, #0, 130f\n"
"st1 { v10.s }[2], [x16]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
"b 130f\n"
"125:" // Height 4: Partial direct writeback: partial_1_8
"tbz x8, #0, 130f\n"
"str s10, [x16, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
"b 130f\n"
"126:" // Height 4: Partial direct writeback: partial_4_0
"tbz x8, #2, 128f\n"
"st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
"tbz x8, #1, 127f\n"
"str d9, [x16], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
"tbz x8, #0, 130f\n"
"st1 { v9.s }[2], [x16]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
"b 130f\n"
"127:" // Height 4: Partial direct writeback: partial_1_4
"tbz x8, #0, 130f\n"
"str s9, [x16, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
"b 130f\n"
"128:" // Height 4: Partial direct writeback: partial_2_0
"tbz x8, #1, 129f\n"
"str d8, [x16], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x8, #0, 130f\n"
"st1 { v8.s }[2], [x16]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
"b 130f\n"
"129:" // Height 4: Partial direct writeback: partial_1_0
"str s8, [x16, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
"130:" // Height 4: Partial direct writeback: Done
"b 132f\n"
"131:" // Height 4: Full writeback
@@ -1970,39 +1968,39 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"str q10, [x16, #0x20]\n"
"str q11, [x16, #0x30]\n"
"add x16, x16, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
"132:" // Height 4: Writeback done
"subs x8, x8, #0x10\n"
"bgt 101b\n"
"b 200f\n"
"133:" // Height 5
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
"134:" // Height 5: Column loop
"cbz x7, 135f\n"
"ldr q8, [x7, #0x0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
"mov v12.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x7, x7, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
"mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
@@ -2018,128 +2016,128 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"135:" // Height 5: no bias
"tbz %x[flags], #0, 145f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "add x26, x16, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x16, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 144f\n"
"tbz x8, #3, 139f\n"
"ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"ld1 { v9.4s }, [x16], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x8, #2, 137f\n"
"ld1 { v10.4s }, [x16], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
"tbz x8, #1, 136f\n"
"ldr d11, [x16], #0x8\n"
"mov x20, #0x38\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x8, #0, 143f\n"
"ld1 { v11.s }[2], [x16]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
"b 143f\n"
"136:" // Height 5: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x8, #0, 143f\n"
"ldr s11, [x16, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
"b 143f\n"
"137:" // Height 5: Partial accumulate: partial_2_8
"tbz x8, #1, 138f\n"
"ldr d10, [x16], #0x8\n"
"mov x20, #0x28\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x8, #0, 143f\n"
"ld1 { v10.s }[2], [x16]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
"b 143f\n"
"138:" // Height 5: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x8, #0, 143f\n"
"ldr s10, [x16, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
"b 143f\n"
"139:" // Height 5: Partial accumulate: partial_4_0
"tbz x8, #2, 141f\n"
"ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"tbz x8, #1, 140f\n"
"ldr d9, [x16], #0x8\n"
"mov x20, #0x18\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x8, #0, 143f\n"
"ld1 { v9.s }[2], [x16]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 143f\n"
"140:" // Height 5: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x8, #0, 143f\n"
"ldr s9, [x16, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"b 143f\n"
"141:" // Height 5: Partial accumulate: partial_2_0
"tbz x8, #1, 142f\n"
"ldr d8, [x16], #0x8\n"
"mov x20, #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x8, #0, 143f\n"
"ld1 { v8.s }[2], [x16]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
"b 143f\n"
"142:" // Height 5: Partial accumulate: partial_1_0
"ldr s8, [x16, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s12, [x26, #0x0]\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
"143:" // Height 5: Partial accumulate: Done
"sub x16, x16, x20\n"
"b 146f\n"
@@ -2148,22 +2146,22 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"ldr q11, [x16, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
"b 146f\n"
"145:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
@@ -2190,8 +2188,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"mov x15, #0x0\n"
"147:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 148f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2238,131 +2236,131 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v24.4s, v6.4s, v4.s[0]\n"
"ldr d29, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x11, x11, #0x10\n"
+ "mov v29.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "mov v29.d[1], x21\n"
+ "add x11, x11, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr x21, [x17, #0x48]\n"
+ "add x10, x10, #0x10\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
"ldr d28, [x17, #0x30]\n"
+ "mov v28.d[1], x20\n"
"fmla v10.4s, v29.4s, v0.s[0]\n"
- "add x9, x9, #0x10\n"
"fmla v14.4s, v29.4s, v1.s[0]\n"
- "ldr x26, [x13, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
"fmla v18.4s, v29.4s, v2.s[0]\n"
+ "add x9, x9, #0x10\n"
"fmla v22.4s, v29.4s, v3.s[0]\n"
- "ldr x20, [x17, #0x58]\n"
+ "ldr x26, [x13, #0x8]\n"
"fmla v26.4s, v29.4s, v4.s[0]\n"
"ldr d29, [x17, #0x40]\n"
"fmla v11.4s, v28.4s, v0.s[0]\n"
- "ldr x25, [x12, #0x8]\n"
+ "mov v29.d[1], x21\n"
"fmla v15.4s, v28.4s, v1.s[0]\n"
- "ldr x24, [x11, #0x8]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.4s, v28.4s, v2.s[0]\n"
- "mov v29.d[1], x21\n"
+ "ldr x25, [x12, #0x8]\n"
"fmla v23.4s, v28.4s, v3.s[0]\n"
- "ldr x21, [x17, #0x68]\n"
+ "ldr x24, [x11, #0x8]\n"
"fmla v27.4s, v28.4s, v4.s[0]\n"
"ldr d28, [x17, #0x50]\n"
+ "mov v28.d[1], x20\n"
"fmla v8.4s, v29.4s, v0.s[1]\n"
- "ldr x23, [x10, #0x8]\n"
"fmla v12.4s, v29.4s, v1.s[1]\n"
- "ldr x22, [x9, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v16.4s, v29.4s, v2.s[1]\n"
+ "ldr x23, [x10, #0x8]\n"
"fmla v20.4s, v29.4s, v3.s[1]\n"
- "ldr x20, [x17, #0x78]\n"
+ "ldr x22, [x9, #0x8]\n"
"fmla v24.4s, v29.4s, v4.s[1]\n"
"ldr d29, [x17, #0x60]\n"
"fmla v9.4s, v28.4s, v0.s[1]\n"
- "sub x14, x14, #0x4\n"
+ "mov v29.d[1], x21\n"
"fmla v13.4s, v28.4s, v1.s[1]\n"
- "cmp x14, #0x8\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.4s, v28.4s, v2.s[1]\n"
- "mov v29.d[1], x21\n"
+ "sub x14, x14, #0x4\n"
"fmla v21.4s, v28.4s, v3.s[1]\n"
- "ldr x21, [x17, #0x88]\n"
+ "cmp x14, #0x8\n"
"fmla v25.4s, v28.4s, v4.s[1]\n"
"ldr d28, [x17, #0x70]\n"
+ "mov v28.d[1], x20\n"
"fmla v10.4s, v29.4s, v0.s[1]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
"fmla v14.4s, v29.4s, v1.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
"fmla v18.4s, v29.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v22.4s, v29.4s, v3.s[1]\n"
- "ldr x20, [x17, #0x98]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v26.4s, v29.4s, v4.s[1]\n"
"ldr d29, [x17, #0x80]\n"
"fmla v11.4s, v28.4s, v0.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v29.d[1], x21\n"
"fmla v15.4s, v28.4s, v1.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.4s, v28.4s, v2.s[1]\n"
- "mov v29.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v23.4s, v28.4s, v3.s[1]\n"
- "ldr x21, [x17, #0xa8]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v27.4s, v28.4s, v4.s[1]\n"
"ldr d28, [x17, #0x90]\n"
+ "mov v28.d[1], x20\n"
"fmla v8.4s, v29.4s, v0.s[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
"fmla v12.4s, v29.4s, v1.s[2]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v16.4s, v29.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v20.4s, v29.4s, v3.s[2]\n"
- "ldr x20, [x17, #0xb8]\n"
"fmla v24.4s, v29.4s, v4.s[2]\n"
"ldr d29, [x17, #0xa0]\n"
"fmla v9.4s, v28.4s, v0.s[2]\n"
+ "mov v29.d[1], x21\n"
"fmla v13.4s, v28.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.4s, v28.4s, v2.s[2]\n"
- "mov v29.d[1], x21\n"
"fmla v21.4s, v28.4s, v3.s[2]\n"
- "ldr x21, [x17, #0xc8]\n"
"fmla v25.4s, v28.4s, v4.s[2]\n"
"ldr d28, [x17, #0xb0]\n"
+ "mov v28.d[1], x20\n"
"fmla v10.4s, v29.4s, v0.s[2]\n"
"fmla v14.4s, v29.4s, v1.s[2]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
"fmla v18.4s, v29.4s, v2.s[2]\n"
"fmla v22.4s, v29.4s, v3.s[2]\n"
- "ldr x20, [x17, #0xd8]\n"
"fmla v26.4s, v29.4s, v4.s[2]\n"
"ldr d29, [x17, #0xc0]\n"
"fmla v11.4s, v28.4s, v0.s[2]\n"
+ "mov v29.d[1], x21\n"
"fmla v15.4s, v28.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.4s, v28.4s, v2.s[2]\n"
- "mov v29.d[1], x21\n"
"fmla v23.4s, v28.4s, v3.s[2]\n"
- "ldr x21, [x17, #0xe8]\n"
"fmla v27.4s, v28.4s, v4.s[2]\n"
"ldr d28, [x17, #0xd0]\n"
+ "mov v28.d[1], x20\n"
"fmla v8.4s, v29.4s, v0.s[3]\n"
"fmla v12.4s, v29.4s, v1.s[3]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v16.4s, v29.4s, v2.s[3]\n"
"fmla v20.4s, v29.4s, v3.s[3]\n"
- "ldr x20, [x17, #0xf8]\n"
"fmla v24.4s, v29.4s, v4.s[3]\n"
"ldr d29, [x17, #0xe0]\n"
"fmla v9.4s, v28.4s, v0.s[3]\n"
+ "mov v29.d[1], x21\n"
"fmla v13.4s, v28.4s, v1.s[3]\n"
"fmla v17.4s, v28.4s, v2.s[3]\n"
- "mov v29.d[1], x21\n"
"fmla v21.4s, v28.4s, v3.s[3]\n"
"fmla v25.4s, v28.4s, v4.s[3]\n"
"ldr d28, [x17, #0xf0]\n"
+ "mov v28.d[1], x20\n"
"add x17, x17, #0x100\n"
"fmla v10.4s, v29.4s, v0.s[3]\n"
- "fmla v14.4s, v29.4s, v1.s[3]\n"
"ldr x21, [x17, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0x18]\n"
"fmla v18.4s, v29.4s, v2.s[3]\n"
"fmla v22.4s, v29.4s, v3.s[3]\n"
- "ldr x20, [x17, #0x18]\n"
"fmla v26.4s, v29.4s, v4.s[3]\n"
"ldr d6, [x17, #0x0]\n"
"fmla v11.4s, v28.4s, v0.s[3]\n"
@@ -2501,8 +2499,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr s31, [x10], #0x4\n"
"ldr s30, [x9], #0x4\n"
"ldr q29, [x17, #0x0]\n"
- "ldr q28, [x17, #0x10]\n"
"fmla v8.4s, v29.4s, v2.s[0]\n"
+ "ldr q28, [x17, #0x10]\n"
"fmla v12.4s, v29.4s, v1.s[0]\n"
"fmla v16.4s, v29.4s, v0.s[0]\n"
"fmla v20.4s, v29.4s, v31.s[0]\n"
@@ -2532,40 +2530,40 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"cmp x15, x20\n"
"bne 147b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x26, x16, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x16, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 155f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v28.4s\n"
+ "fmin v9.4s, v9.4s, v28.4s\n"
+ "fmin v10.4s, v10.4s, v28.4s\n"
+ "fmin v11.4s, v11.4s, v28.4s\n"
+ "fmin v12.4s, v12.4s, v28.4s\n"
+ "fmin v13.4s, v13.4s, v28.4s\n"
+ "fmin v14.4s, v14.4s, v28.4s\n"
+ "fmin v15.4s, v15.4s, v28.4s\n"
+ "fmin v16.4s, v16.4s, v28.4s\n"
+ "fmin v17.4s, v17.4s, v28.4s\n"
+ "fmin v18.4s, v18.4s, v28.4s\n"
+ "fmin v19.4s, v19.4s, v28.4s\n"
+ "fmin v20.4s, v20.4s, v28.4s\n"
+ "fmin v21.4s, v21.4s, v28.4s\n"
+ "fmin v22.4s, v22.4s, v28.4s\n"
+ "fmin v23.4s, v23.4s, v28.4s\n"
+ "fmin v24.4s, v24.4s, v28.4s\n"
+ "fmin v25.4s, v25.4s, v28.4s\n"
+ "fmin v26.4s, v26.4s, v28.4s\n"
+ "fmin v27.4s, v27.4s, v28.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v29.4s }, [x21]\n"
"ld1r { v28.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v29.4s\n"
- "fmin v9.4s, v9.4s, v29.4s\n"
- "fmin v10.4s, v10.4s, v29.4s\n"
- "fmin v11.4s, v11.4s, v29.4s\n"
- "fmin v12.4s, v12.4s, v29.4s\n"
- "fmin v13.4s, v13.4s, v29.4s\n"
- "fmin v14.4s, v14.4s, v29.4s\n"
- "fmin v15.4s, v15.4s, v29.4s\n"
- "fmin v16.4s, v16.4s, v29.4s\n"
- "fmin v17.4s, v17.4s, v29.4s\n"
- "fmin v18.4s, v18.4s, v29.4s\n"
- "fmin v19.4s, v19.4s, v29.4s\n"
- "fmin v20.4s, v20.4s, v29.4s\n"
- "fmin v21.4s, v21.4s, v29.4s\n"
- "fmin v22.4s, v22.4s, v29.4s\n"
- "fmin v23.4s, v23.4s, v29.4s\n"
- "fmin v24.4s, v24.4s, v29.4s\n"
- "fmin v25.4s, v25.4s, v29.4s\n"
- "fmin v26.4s, v26.4s, v29.4s\n"
- "fmin v27.4s, v27.4s, v29.4s\n"
"fmax v8.4s, v8.4s, v28.4s\n"
"fmax v9.4s, v9.4s, v28.4s\n"
"fmax v10.4s, v10.4s, v28.4s\n"
@@ -2592,111 +2590,111 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"tbz x8, #3, 159f\n"
"st1 { v8.4s }, [x16], #0x10\n"
"st1 { v9.4s }, [x16], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
"tbz x8, #2, 157f\n"
"st1 { v10.4s }, [x16], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
"tbz x8, #1, 156f\n"
"str d11, [x16], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x8, #0, 163f\n"
"st1 { v11.s }[2], [x16]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
"b 163f\n"
"156:" // Height 5: Partial direct writeback: partial_1_12
"tbz x8, #0, 163f\n"
"str s11, [x16, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
"b 163f\n"
"157:" // Height 5: Partial direct writeback: partial_2_8
"tbz x8, #1, 158f\n"
"str d10, [x16], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x8, #0, 163f\n"
"st1 { v10.s }[2], [x16]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
"b 163f\n"
"158:" // Height 5: Partial direct writeback: partial_1_8
"tbz x8, #0, 163f\n"
"str s10, [x16, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
"b 163f\n"
"159:" // Height 5: Partial direct writeback: partial_4_0
"tbz x8, #2, 161f\n"
"st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x8, #1, 160f\n"
"str d9, [x16], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x8, #0, 163f\n"
"st1 { v9.s }[2], [x16]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 163f\n"
"160:" // Height 5: Partial direct writeback: partial_1_4
"tbz x8, #0, 163f\n"
"str s9, [x16, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 163f\n"
"161:" // Height 5: Partial direct writeback: partial_2_0
"tbz x8, #1, 162f\n"
"str d8, [x16], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x8, #0, 163f\n"
"st1 { v8.s }[2], [x16]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 163f\n"
"162:" // Height 5: Partial direct writeback: partial_1_0
"str s8, [x16, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"163:" // Height 5: Partial direct writeback: Done
"b 165f\n"
"164:" // Height 5: Full writeback
@@ -2705,22 +2703,22 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"str q10, [x16, #0x20]\n"
"str q11, [x16, #0x30]\n"
"add x16, x16, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"165:" // Height 5: Writeback done
"subs x8, x8, #0x10\n"
"bgt 134b\n"
@@ -2728,24 +2726,23 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"166:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x18\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "madd x20, x21, x20, x16\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x16, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"167:" // Height 6: Column loop
"cbz x7, 168f\n"
"ldr q8, [x7, #0x0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
"mov v12.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x7, x7, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
"mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
@@ -2765,145 +2762,145 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"168:" // Height 6: no bias
"tbz %x[flags], #0, 178f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "add x26, x16, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x16, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 177f\n"
"tbz x8, #3, 172f\n"
"ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v9.4s }, [x16], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x8, #2, 170f\n"
"ld1 { v10.4s }, [x16], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x8, #1, 169f\n"
"ldr d11, [x16], #0x8\n"
"mov x20, #0x38\n"
- "ldr d15, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x8, #0, 176f\n"
"ld1 { v11.s }[2], [x16]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 176f\n"
"169:" // Height 6: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x8, #0, 176f\n"
"ldr s11, [x16, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 176f\n"
"170:" // Height 6: Partial accumulate: partial_2_8
"tbz x8, #1, 171f\n"
"ldr d10, [x16], #0x8\n"
"mov x20, #0x28\n"
- "ldr d14, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x8, #0, 176f\n"
"ld1 { v10.s }[2], [x16]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 176f\n"
"171:" // Height 6: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x8, #0, 176f\n"
"ldr s10, [x16, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 176f\n"
"172:" // Height 6: Partial accumulate: partial_4_0
"tbz x8, #2, 174f\n"
"ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x8, #1, 173f\n"
"ldr d9, [x16], #0x8\n"
"mov x20, #0x18\n"
- "ldr d13, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x8, #0, 176f\n"
"ld1 { v9.s }[2], [x16]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 176f\n"
"173:" // Height 6: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x8, #0, 176f\n"
"ldr s9, [x16, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 176f\n"
"174:" // Height 6: Partial accumulate: partial_2_0
"tbz x8, #1, 175f\n"
"ldr d8, [x16], #0x8\n"
"mov x20, #0x8\n"
- "ldr d12, [x26], #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x8, #0, 176f\n"
"ld1 { v8.s }[2], [x16]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 176f\n"
"175:" // Height 6: Partial accumulate: partial_1_0
"ldr s8, [x16, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s12, [x26, #0x0]\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"176:" // Height 6: Partial accumulate: Done
"sub x16, x16, x20\n"
"b 179f\n"
@@ -2912,26 +2909,26 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"ldr q11, [x16, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x22, #0x0]\n"
- "ldr q29, [x22, #0x10]\n"
- "ldr q30, [x22, #0x20]\n"
- "ldr q31, [x22, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"b 179f\n"
"178:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
@@ -2962,8 +2959,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"mov x15, #0x0\n"
"180:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 181f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -3016,146 +3013,146 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v28.4s, v6.4s, v5.s[0]\n"
"ldr d6, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x10, x10, #0x10\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x9, x9, #0x10\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "mov v6.d[1], x21\n"
+ "add x10, x10, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr x21, [x17, #0x48]\n"
+ "add x9, x9, #0x10\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
"add x28, x28, #0x10\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
"ldr d7, [x17, #0x30]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr x27, [x13, #0x8]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr x26, [x12, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
+ "ldr x27, [x13, #0x8]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr x20, [x17, #0x58]\n"
+ "ldr x26, [x12, #0x8]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
"ldr x25, [x11, #0x8]\n"
"fmla v30.4s, v6.4s, v5.s[0]\n"
"ldr d6, [x17, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr x24, [x10, #0x8]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr x23, [x9, #0x8]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
- "mov v6.d[1], x21\n"
+ "ldr x24, [x10, #0x8]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr x21, [x17, #0x68]\n"
+ "ldr x23, [x9, #0x8]\n"
"fmla v27.4s, v7.4s, v4.s[0]\n"
"ldr x22, [x28, #0x8]\n"
"fmla v31.4s, v7.4s, v5.s[0]\n"
"ldr d7, [x17, #0x50]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
- "sub x14, x14, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
- "cmp x14, #0x8\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
+ "sub x14, x14, #0x4\n"
"fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr x20, [x17, #0x78]\n"
+ "cmp x14, #0x8\n"
"fmla v24.4s, v6.4s, v4.s[1]\n"
"prfm pldl1keep, [x13, #0x80]\n"
"fmla v28.4s, v6.4s, v5.s[1]\n"
"ldr d6, [x17, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x21\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr x21, [x17, #0x88]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v25.4s, v7.4s, v4.s[1]\n"
"prfm pldl1keep, [x10, #0x80]\n"
"fmla v29.4s, v7.4s, v5.s[1]\n"
"ldr d7, [x17, #0x70]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr x20, [x17, #0x98]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.4s, v6.4s, v4.s[1]\n"
"fmla v30.4s, v6.4s, v5.s[1]\n"
"ldr d6, [x17, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x21\n"
"fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr x21, [x17, #0xa8]\n"
"fmla v27.4s, v7.4s, v4.s[1]\n"
"fmla v31.4s, v7.4s, v5.s[1]\n"
"ldr d7, [x17, #0x90]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
"fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr x20, [x17, #0xb8]\n"
"fmla v24.4s, v6.4s, v4.s[2]\n"
"fmla v28.4s, v6.4s, v5.s[2]\n"
"ldr d6, [x17, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x21\n"
"fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr x21, [x17, #0xc8]\n"
"fmla v25.4s, v7.4s, v4.s[2]\n"
"fmla v29.4s, v7.4s, v5.s[2]\n"
"ldr d7, [x17, #0xb0]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
"fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr x20, [x17, #0xd8]\n"
"fmla v26.4s, v6.4s, v4.s[2]\n"
"fmla v30.4s, v6.4s, v5.s[2]\n"
"ldr d6, [x17, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x21\n"
"fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr x21, [x17, #0xe8]\n"
"fmla v27.4s, v7.4s, v4.s[2]\n"
"fmla v31.4s, v7.4s, v5.s[2]\n"
"ldr d7, [x17, #0xd0]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
"fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr x20, [x17, #0xf8]\n"
"fmla v24.4s, v6.4s, v4.s[3]\n"
"fmla v28.4s, v6.4s, v5.s[3]\n"
"ldr d6, [x17, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
- "mov v6.d[1], x21\n"
"fmla v21.4s, v7.4s, v3.s[3]\n"
"fmla v25.4s, v7.4s, v4.s[3]\n"
"fmla v29.4s, v7.4s, v5.s[3]\n"
"ldr d7, [x17, #0xf0]\n"
+ "mov v7.d[1], x20\n"
"add x17, x17, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
"ldr x21, [x17, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0x18]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
"fmla v22.4s, v6.4s, v3.s[3]\n"
- "ldr x20, [x17, #0x18]\n"
"fmla v26.4s, v6.4s, v4.s[3]\n"
"fmla v30.4s, v6.4s, v5.s[3]\n"
"ldr d6, [x17, #0x0]\n"
@@ -3317,8 +3314,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr s3, [x9], #0x4\n"
"ldr s2, [x28], #0x4\n"
"ldr q1, [x17, #0x0]\n"
- "ldr q0, [x17, #0x10]\n"
"fmla v8.4s, v1.4s, v7.s[0]\n"
+ "ldr q0, [x17, #0x10]\n"
"fmla v12.4s, v1.4s, v6.s[0]\n"
"fmla v16.4s, v1.4s, v5.s[0]\n"
"fmla v20.4s, v1.4s, v4.s[0]\n"
@@ -3352,46 +3349,46 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"cmp x15, x20\n"
"bne 180b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x26, x16, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x16, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 188f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v23.4s, v23.4s, v1.4s\n"
- "fmin v24.4s, v24.4s, v1.4s\n"
- "fmin v25.4s, v25.4s, v1.4s\n"
- "fmin v26.4s, v26.4s, v1.4s\n"
- "fmin v27.4s, v27.4s, v1.4s\n"
- "fmin v28.4s, v28.4s, v1.4s\n"
- "fmin v29.4s, v29.4s, v1.4s\n"
- "fmin v30.4s, v30.4s, v1.4s\n"
- "fmin v31.4s, v31.4s, v1.4s\n"
"fmax v8.4s, v8.4s, v0.4s\n"
"fmax v9.4s, v9.4s, v0.4s\n"
"fmax v10.4s, v10.4s, v0.4s\n"
@@ -3422,127 +3419,127 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"tbz x8, #3, 192f\n"
"st1 { v8.4s }, [x16], #0x10\n"
"st1 { v9.4s }, [x16], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
- "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
"tbz x8, #2, 190f\n"
"st1 { v10.4s }, [x16], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v30.4s }, [x22], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
"tbz x8, #1, 189f\n"
"str d11, [x16], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x8, #0, 196f\n"
"st1 { v11.s }[2], [x16]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x22]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
"b 196f\n"
"189:" // Height 6: Partial direct writeback: partial_1_12
"tbz x8, #0, 196f\n"
"str s11, [x16, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "str s31, [x22, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
"b 196f\n"
"190:" // Height 6: Partial direct writeback: partial_2_8
"tbz x8, #1, 191f\n"
"str d10, [x16], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x8, #0, 196f\n"
"st1 { v10.s }[2], [x16]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x22]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
"b 196f\n"
"191:" // Height 6: Partial direct writeback: partial_1_8
"tbz x8, #0, 196f\n"
"str s10, [x16, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "str s30, [x22, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
"b 196f\n"
"192:" // Height 6: Partial direct writeback: partial_4_0
"tbz x8, #2, 194f\n"
"st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
"tbz x8, #1, 193f\n"
"str d9, [x16], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x8, #0, 196f\n"
"st1 { v9.s }[2], [x16]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x22]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
"b 196f\n"
"193:" // Height 6: Partial direct writeback: partial_1_4
"tbz x8, #0, 196f\n"
"str s9, [x16, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s29, [x22, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
"b 196f\n"
"194:" // Height 6: Partial direct writeback: partial_2_0
"tbz x8, #1, 195f\n"
"str d8, [x16], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x8, #0, 196f\n"
"st1 { v8.s }[2], [x16]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x22]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
"b 196f\n"
"195:" // Height 6: Partial direct writeback: partial_1_0
"str s8, [x16, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "str s28, [x22, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
"196:" // Height 6: Partial direct writeback: Done
"b 198f\n"
"197:" // Height 6: Full writeback
@@ -3551,26 +3548,26 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"str q10, [x16, #0x20]\n"
"str q11, [x16, #0x30]\n"
"add x16, x16, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x22, #0x0]\n"
- "str q29, [x22, #0x10]\n"
- "str q30, [x22, #0x20]\n"
- "str q31, [x22, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
"198:" // Height 6: Writeback done
"subs x8, x8, #0x10\n"
"bgt 167b\n"
@@ -3586,8 +3583,8 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"200:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
index e411da6874..6c51c0ff3c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
@@ -47,19 +47,18 @@ void a64_hybrid_fp32_mla_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void a64_hybrid_fp32_mla_6x16 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -103,10 +101,10 @@ void a64_hybrid_fp32_mla_6x16 (
"cmp %x[M], #0x2\n"
"bgt 67f\n"
"beq 34f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x12, 3f\n"
"ldr q8, [x12, #0x0]\n"
@@ -189,8 +187,8 @@ void a64_hybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -214,10 +212,6 @@ void a64_hybrid_fp32_mla_6x16 (
"ldr q17, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"ldr q16, [x10, #0x30]\n"
- "sub x27, x27, #0x4\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
"ldr q17, [x10, #0x40]\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
@@ -242,21 +236,22 @@ void a64_hybrid_fp32_mla_6x16 (
"ldr q17, [x10, #0xe0]\n"
"fmla v9.4s, v16.4s, v0.s[3]\n"
"ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "sub x27, x27, #0x4\n"
+ "add x26, x26, #0x10\n"
"fmla v10.4s, v17.4s, v0.s[3]\n"
- "ldr q6, [x10, #0x0]\n"
"fmla v11.4s, v16.4s, v0.s[3]\n"
"ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "add x10, x10, #0x100\n"
+ "ldr q6, [x10, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
"ldr q17, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
"ldr q17, [x10, #0x40]\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
@@ -281,23 +276,26 @@ void a64_hybrid_fp32_mla_6x16 (
"ldr q17, [x10, #0xe0]\n"
"fmla v9.4s, v16.4s, v0.s[3]\n"
"ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x4\n"
"fmla v10.4s, v17.4s, v0.s[3]\n"
"fmla v11.4s, v16.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
"20:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
"ldr s18, [x26], #0x4\n"
- "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v8.4s, v16.4s, v18.s[0]\n"
"sub x27, x27, #0x1\n"
- "ldr q16, [x10, #0x10]\n"
- "fmla v8.4s, v17.4s, v18.s[0]\n"
- "ldr q17, [x10, #0x20]\n"
- "fmla v9.4s, v16.4s, v18.s[0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "ldr q16, [x10, #0x20]\n"
+ "fmla v9.4s, v17.4s, v18.s[0]\n"
+ "fmla v10.4s, v16.4s, v18.s[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- "fmla v10.4s, v17.4s, v18.s[0]\n"
"fmla v11.4s, v16.4s, v18.s[0]\n"
+ "add x10, x10, #0x40\n"
"cbnz x27, 21b\n"
"22:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -306,9 +304,9 @@ void a64_hybrid_fp32_mla_6x16 (
"bne 15b\n"
"prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 23f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v17.4s\n"
"fmin v9.4s, v9.4s, v17.4s\n"
@@ -378,95 +376,95 @@ void a64_hybrid_fp32_mla_6x16 (
"bgt 2b\n"
"b 200f\n"
"34:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"35:" // Height 2: Column loop
"cbz x12, 36f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "add x12, x12, #0x40\n"
"b 47f\n"
"36:" // Height 2: no bias
"tbz %x[flags], #0, 46f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"bge 45f\n"
"tbz x11, #3, 40f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
"tbz x11, #2, 38f\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
"tbz x11, #1, 37f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
"tbz x11, #0, 44f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x25]\n"
"b 44f\n"
"37:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 44f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
"b 44f\n"
"38:" // Height 2: Partial accumulate: partial_2_8
"tbz x11, #1, 39f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
"tbz x11, #0, 44f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x25]\n"
"b 44f\n"
"39:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 44f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
"b 44f\n"
"40:" // Height 2: Partial accumulate: partial_4_0
"tbz x11, #2, 42f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"tbz x11, #1, 41f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
"tbz x11, #0, 44f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v13.s }[2], [x25]\n"
"b 44f\n"
"41:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 44f\n"
"ldr s9, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
"b 44f\n"
"42:" // Height 2: Partial accumulate: partial_2_0
"tbz x11, #1, 43f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
"tbz x11, #0, 44f\n"
"ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v12.s }[2], [x25]\n"
"b 44f\n"
"43:" // Height 2: Partial accumulate: partial_1_0
"ldr s8, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
"44:" // Height 2: Partial accumulate: Done
"sub x9, x9, x20\n"
@@ -476,10 +474,10 @@ void a64_hybrid_fp32_mla_6x16 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"b 47f\n"
"46:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -494,8 +492,8 @@ void a64_hybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -527,22 +525,22 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v13.4s, v7.4s, v1.s[0]\n"
"ldr q16, [x10, #0x30]\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
"fmla v14.4s, v17.4s, v1.s[0]\n"
"ldr q17, [x10, #0x40]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
"fmla v15.4s, v16.4s, v1.s[0]\n"
"ldr q16, [x10, #0x50]\n"
+ "cmp x27, #0x8\n"
"fmla v8.4s, v17.4s, v0.s[1]\n"
"fmla v12.4s, v17.4s, v1.s[1]\n"
"ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v9.4s, v16.4s, v0.s[1]\n"
"fmla v13.4s, v16.4s, v1.s[1]\n"
"ldr q16, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.4s, v17.4s, v0.s[1]\n"
"fmla v14.4s, v17.4s, v1.s[1]\n"
"ldr q17, [x10, #0x80]\n"
@@ -586,18 +584,18 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v13.4s, v7.4s, v1.s[0]\n"
"ldr q16, [x10, #0x30]\n"
"add x25, x25, #0x10\n"
- "sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.4s, v17.4s, v0.s[0]\n"
"fmla v14.4s, v17.4s, v1.s[0]\n"
"ldr q17, [x10, #0x40]\n"
+ "sub x27, x27, #0x4\n"
"fmla v11.4s, v16.4s, v0.s[0]\n"
"fmla v15.4s, v16.4s, v1.s[0]\n"
"ldr q16, [x10, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v8.4s, v17.4s, v0.s[1]\n"
"fmla v12.4s, v17.4s, v1.s[1]\n"
"ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v9.4s, v16.4s, v0.s[1]\n"
"fmla v13.4s, v16.4s, v1.s[1]\n"
"ldr q16, [x10, #0x70]\n"
@@ -644,9 +642,9 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v9.4s, v16.4s, v19.s[0]\n"
"fmla v13.4s, v16.4s, v18.s[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
"fmla v10.4s, v17.4s, v19.s[0]\n"
"fmla v14.4s, v17.4s, v18.s[0]\n"
+ "add x10, x10, #0x40\n"
"fmla v11.4s, v16.4s, v19.s[0]\n"
"fmla v15.4s, v16.4s, v18.s[0]\n"
"cbnz x27, 54b\n"
@@ -656,13 +654,13 @@ void a64_hybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 48b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 56f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v17.4s\n"
"fmin v9.4s, v9.4s, v17.4s\n"
@@ -686,63 +684,63 @@ void a64_hybrid_fp32_mla_6x16 (
"tbz x11, #3, 60f\n"
"st1 { v8.4s }, [x9], #0x10\n"
"st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
"tbz x11, #2, 58f\n"
"st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
"tbz x11, #1, 57f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
+ "str d15, [x25], #0x8\n"
"tbz x11, #0, 64f\n"
"st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x26]\n"
+ "st1 { v15.s }[2], [x25]\n"
"b 64f\n"
"57:" // Height 2: Partial direct writeback: partial_1_12
"tbz x11, #0, 64f\n"
"str s11, [x9, #0x0]\n"
- "str s15, [x26, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
"b 64f\n"
"58:" // Height 2: Partial direct writeback: partial_2_8
"tbz x11, #1, 59f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
+ "str d14, [x25], #0x8\n"
"tbz x11, #0, 64f\n"
"st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x26]\n"
+ "st1 { v14.s }[2], [x25]\n"
"b 64f\n"
"59:" // Height 2: Partial direct writeback: partial_1_8
"tbz x11, #0, 64f\n"
"str s10, [x9, #0x0]\n"
- "str s14, [x26, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
"b 64f\n"
"60:" // Height 2: Partial direct writeback: partial_4_0
"tbz x11, #2, 62f\n"
"st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
"tbz x11, #1, 61f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
+ "str d13, [x25], #0x8\n"
"tbz x11, #0, 64f\n"
"st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x26]\n"
+ "st1 { v13.s }[2], [x25]\n"
"b 64f\n"
"61:" // Height 2: Partial direct writeback: partial_1_4
"tbz x11, #0, 64f\n"
"str s9, [x9, #0x0]\n"
- "str s13, [x26, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
"b 64f\n"
"62:" // Height 2: Partial direct writeback: partial_2_0
"tbz x11, #1, 63f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x11, #0, 64f\n"
"st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x26]\n"
+ "st1 { v12.s }[2], [x25]\n"
"b 64f\n"
"63:" // Height 2: Partial direct writeback: partial_1_0
"str s8, [x9, #0x0]\n"
- "str s12, [x26, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
"64:" // Height 2: Partial direct writeback: Done
"b 66f\n"
"65:" // Height 2: Full writeback
@@ -751,126 +749,126 @@ void a64_hybrid_fp32_mla_6x16 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
"66:" // Height 2: Writeback done
"subs x11, x11, #0x10\n"
"bgt 35b\n"
"b 200f\n"
"67:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"68:" // Height 3: Column loop
"cbz x12, 69f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 80f\n"
"69:" // Height 3: no bias
"tbz %x[flags], #0, 79f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 78f\n"
"tbz x11, #3, 73f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #2, 71f\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #1, 70f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x11, #0, 77f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
"b 77f\n"
"70:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 77f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
"b 77f\n"
"71:" // Height 3: Partial accumulate: partial_2_8
"tbz x11, #1, 72f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x11, #0, 77f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
"b 77f\n"
"72:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 77f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
"b 77f\n"
"73:" // Height 3: Partial accumulate: partial_4_0
"tbz x11, #2, 75f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
"tbz x11, #1, 74f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x11, #0, 77f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 77f\n"
"74:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 77f\n"
"ldr s9, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
"b 77f\n"
"75:" // Height 3: Partial accumulate: partial_2_0
"tbz x11, #1, 76f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
"tbz x11, #0, 77f\n"
"ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
"b 77f\n"
"76:" // Height 3: Partial accumulate: partial_1_0
"ldr s8, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
"77:" // Height 3: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 80f\n"
@@ -879,14 +877,14 @@ void a64_hybrid_fp32_mla_6x16 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
"b 80f\n"
"79:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -905,8 +903,8 @@ void a64_hybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"81:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 82f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -946,18 +944,18 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v17.4s, v7.4s, v2.s[0]\n"
"ldr q20, [x10, #0x30]\n"
"add x24, x24, #0x10\n"
- "cmp x27, #0x8\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.4s, v21.4s, v0.s[0]\n"
"fmla v14.4s, v21.4s, v1.s[0]\n"
+ "cmp x27, #0x8\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v18.4s, v21.4s, v2.s[0]\n"
"ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v11.4s, v20.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v15.4s, v20.4s, v1.s[0]\n"
"fmla v19.4s, v20.4s, v2.s[0]\n"
"ldr q20, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v8.4s, v21.4s, v0.s[1]\n"
"fmla v12.4s, v21.4s, v1.s[1]\n"
"fmla v16.4s, v21.4s, v2.s[1]\n"
@@ -1024,14 +1022,14 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v17.4s, v7.4s, v2.s[0]\n"
"ldr q20, [x10, #0x30]\n"
"sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.4s, v21.4s, v0.s[0]\n"
"fmla v14.4s, v21.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v18.4s, v21.4s, v2.s[0]\n"
"ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v11.4s, v20.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v15.4s, v20.4s, v1.s[0]\n"
"fmla v19.4s, v20.4s, v2.s[0]\n"
"ldr q20, [x10, #0x50]\n"
@@ -1090,9 +1088,9 @@ void a64_hybrid_fp32_mla_6x16 (
"sub x27, x27, #0x1\n"
"ldr s22, [x24], #0x4\n"
"ldr q21, [x10, #0x0]\n"
- "ldr q20, [x10, #0x10]\n"
"fmla v8.4s, v21.4s, v24.s[0]\n"
"fmla v12.4s, v21.4s, v23.s[0]\n"
+ "ldr q20, [x10, #0x10]\n"
"fmla v16.4s, v21.4s, v22.s[0]\n"
"ldr q21, [x10, #0x20]\n"
"fmla v9.4s, v20.4s, v24.s[0]\n"
@@ -1113,15 +1111,15 @@ void a64_hybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 81b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 89f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v21.4s }, [x21]\n"
"ld1r { v20.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v21.4s\n"
"fmin v9.4s, v9.4s, v21.4s\n"
@@ -1153,79 +1151,79 @@ void a64_hybrid_fp32_mla_6x16 (
"tbz x11, #3, 93f\n"
"st1 { v8.4s }, [x9], #0x10\n"
"st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #2, 91f\n"
"st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #1, 90f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x11, #0, 97f\n"
"st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
"b 97f\n"
"90:" // Height 3: Partial direct writeback: partial_1_12
"tbz x11, #0, 97f\n"
"str s11, [x9, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
"b 97f\n"
"91:" // Height 3: Partial direct writeback: partial_2_8
"tbz x11, #1, 92f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x11, #0, 97f\n"
"st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
"b 97f\n"
"92:" // Height 3: Partial direct writeback: partial_1_8
"tbz x11, #0, 97f\n"
"str s10, [x9, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
"b 97f\n"
"93:" // Height 3: Partial direct writeback: partial_4_0
"tbz x11, #2, 95f\n"
"st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
"tbz x11, #1, 94f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x11, #0, 97f\n"
"st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
"b 97f\n"
"94:" // Height 3: Partial direct writeback: partial_1_4
"tbz x11, #0, 97f\n"
"str s9, [x9, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
"b 97f\n"
"95:" // Height 3: Partial direct writeback: partial_2_0
"tbz x11, #1, 96f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x11, #0, 97f\n"
"st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
"b 97f\n"
"96:" // Height 3: Partial direct writeback: partial_1_0
"str s8, [x9, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
"97:" // Height 3: Partial direct writeback: Done
"b 99f\n"
"98:" // Height 3: Full writeback
@@ -1234,39 +1232,39 @@ void a64_hybrid_fp32_mla_6x16 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"99:" // Height 3: Writeback done
"subs x11, x11, #0x10\n"
"bgt 68b\n"
"b 200f\n"
"100:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"101:" // Height 4: Column loop
"cbz x12, 102f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -1274,111 +1272,111 @@ void a64_hybrid_fp32_mla_6x16 (
"102:" // Height 4: no bias
"tbz %x[flags], #0, 112f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 111f\n"
"tbz x11, #3, 106f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
"tbz x11, #2, 104f\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
"tbz x11, #1, 103f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x11, #0, 110f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
"b 110f\n"
"103:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 110f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
"b 110f\n"
"104:" // Height 4: Partial accumulate: partial_2_8
"tbz x11, #1, 105f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x11, #0, 110f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
"b 110f\n"
"105:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 110f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
"b 110f\n"
"106:" // Height 4: Partial accumulate: partial_4_0
"tbz x11, #2, 108f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"tbz x11, #1, 107f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x11, #0, 110f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
"b 110f\n"
"107:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 110f\n"
"ldr s9, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
"b 110f\n"
"108:" // Height 4: Partial accumulate: partial_2_0
"tbz x11, #1, 109f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x11, #0, 110f\n"
"ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
"b 110f\n"
"109:" // Height 4: Partial accumulate: partial_1_0
"ldr s8, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
"110:" // Height 4: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 113f\n"
@@ -1387,18 +1385,18 @@ void a64_hybrid_fp32_mla_6x16 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"b 113f\n"
"112:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -1421,8 +1419,8 @@ void a64_hybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"114:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1562,14 +1560,14 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"ldr q24, [x10, #0x30]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v10.4s, v25.4s, v0.s[0]\n"
"fmla v14.4s, v25.4s, v1.s[0]\n"
- "sub x27, x27, #0x4\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"fmla v18.4s, v25.4s, v2.s[0]\n"
"fmla v22.4s, v25.4s, v3.s[0]\n"
@@ -1675,17 +1673,17 @@ void a64_hybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 114b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v25.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v25.4s }, [x21]\n"
"ld1r { v24.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v25.4s\n"
"fmin v9.4s, v9.4s, v25.4s\n"
@@ -1725,95 +1723,95 @@ void a64_hybrid_fp32_mla_6x16 (
"tbz x11, #3, 126f\n"
"st1 { v8.4s }, [x9], #0x10\n"
"st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
"tbz x11, #2, 124f\n"
"st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
"tbz x11, #1, 123f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
"tbz x11, #0, 130f\n"
"st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
"b 130f\n"
"123:" // Height 4: Partial direct writeback: partial_1_12
"tbz x11, #0, 130f\n"
"str s11, [x9, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
"b 130f\n"
"124:" // Height 4: Partial direct writeback: partial_2_8
"tbz x11, #1, 125f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
"tbz x11, #0, 130f\n"
"st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
"b 130f\n"
"125:" // Height 4: Partial direct writeback: partial_1_8
"tbz x11, #0, 130f\n"
"str s10, [x9, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
"b 130f\n"
"126:" // Height 4: Partial direct writeback: partial_4_0
"tbz x11, #2, 128f\n"
"st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
"tbz x11, #1, 127f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
"tbz x11, #0, 130f\n"
"st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
"b 130f\n"
"127:" // Height 4: Partial direct writeback: partial_1_4
"tbz x11, #0, 130f\n"
"str s9, [x9, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
"b 130f\n"
"128:" // Height 4: Partial direct writeback: partial_2_0
"tbz x11, #1, 129f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x11, #0, 130f\n"
"st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
"b 130f\n"
"129:" // Height 4: Partial direct writeback: partial_1_0
"str s8, [x9, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
"130:" // Height 4: Partial direct writeback: Done
"b 132f\n"
"131:" // Height 4: Full writeback
@@ -1822,43 +1820,43 @@ void a64_hybrid_fp32_mla_6x16 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
"132:" // Height 4: Writeback done
"subs x11, x11, #0x10\n"
"bgt 101b\n"
"b 200f\n"
"133:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"134:" // Height 5: Column loop
"cbz x12, 135f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -1870,128 +1868,128 @@ void a64_hybrid_fp32_mla_6x16 (
"135:" // Height 5: no bias
"tbz %x[flags], #0, 145f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 144f\n"
"tbz x11, #3, 139f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #2, 137f\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #1, 136f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x11, #0, 143f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
"b 143f\n"
"136:" // Height 5: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 143f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
"b 143f\n"
"137:" // Height 5: Partial accumulate: partial_2_8
"tbz x11, #1, 138f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x11, #0, 143f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
"b 143f\n"
"138:" // Height 5: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 143f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
"b 143f\n"
"139:" // Height 5: Partial accumulate: partial_4_0
"tbz x11, #2, 141f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"tbz x11, #1, 140f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x11, #0, 143f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 143f\n"
"140:" // Height 5: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 143f\n"
"ldr s9, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"b 143f\n"
"141:" // Height 5: Partial accumulate: partial_2_0
"tbz x11, #1, 142f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x11, #0, 143f\n"
"ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
"b 143f\n"
"142:" // Height 5: Partial accumulate: partial_1_0
"ldr s8, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
"143:" // Height 5: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 146f\n"
@@ -2000,22 +1998,22 @@ void a64_hybrid_fp32_mla_6x16 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
"b 146f\n"
"145:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
@@ -2042,8 +2040,8 @@ void a64_hybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"147:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 148f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2209,12 +2207,12 @@ void a64_hybrid_fp32_mla_6x16 (
"add x22, x22, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
+ "sub x27, x27, #0x4\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
"ldr q28, [x10, #0x30]\n"
- "sub x27, x27, #0x4\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.4s, v29.4s, v0.s[0]\n"
"fmla v14.4s, v29.4s, v1.s[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
@@ -2311,9 +2309,9 @@ void a64_hybrid_fp32_mla_6x16 (
"ldr s31, [x23], #0x4\n"
"ldr s30, [x22], #0x4\n"
"ldr q29, [x10, #0x0]\n"
- "ldr q28, [x10, #0x10]\n"
"fmla v8.4s, v29.4s, v2.s[0]\n"
"fmla v12.4s, v29.4s, v1.s[0]\n"
+ "ldr q28, [x10, #0x10]\n"
"fmla v16.4s, v29.4s, v0.s[0]\n"
"fmla v20.4s, v29.4s, v31.s[0]\n"
"fmla v24.4s, v29.4s, v30.s[0]\n"
@@ -2342,19 +2340,19 @@ void a64_hybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 147b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 155f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v29.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v29.4s }, [x21]\n"
"ld1r { v28.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v29.4s\n"
"fmin v9.4s, v9.4s, v29.4s\n"
@@ -2402,111 +2400,111 @@ void a64_hybrid_fp32_mla_6x16 (
"tbz x11, #3, 159f\n"
"st1 { v8.4s }, [x9], #0x10\n"
"st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #2, 157f\n"
"st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #1, 156f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x11, #0, 163f\n"
"st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
"b 163f\n"
"156:" // Height 5: Partial direct writeback: partial_1_12
"tbz x11, #0, 163f\n"
"str s11, [x9, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
"b 163f\n"
"157:" // Height 5: Partial direct writeback: partial_2_8
"tbz x11, #1, 158f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x11, #0, 163f\n"
"st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
"b 163f\n"
"158:" // Height 5: Partial direct writeback: partial_1_8
"tbz x11, #0, 163f\n"
"str s10, [x9, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
"b 163f\n"
"159:" // Height 5: Partial direct writeback: partial_4_0
"tbz x11, #2, 161f\n"
"st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x11, #1, 160f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x11, #0, 163f\n"
"st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 163f\n"
"160:" // Height 5: Partial direct writeback: partial_1_4
"tbz x11, #0, 163f\n"
"str s9, [x9, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 163f\n"
"161:" // Height 5: Partial direct writeback: partial_2_0
"tbz x11, #1, 162f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x11, #0, 163f\n"
"st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 163f\n"
"162:" // Height 5: Partial direct writeback: partial_1_0
"str s8, [x9, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"163:" // Height 5: Partial direct writeback: Done
"b 165f\n"
"164:" // Height 5: Full writeback
@@ -2515,51 +2513,50 @@ void a64_hybrid_fp32_mla_6x16 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"165:" // Height 5: Writeback done
"subs x11, x11, #0x10\n"
"bgt 134b\n"
"b 200f\n"
"166:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"167:" // Height 6: Column loop
"cbz x12, 168f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"mov v12.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v20.16b, v8.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
@@ -2575,145 +2572,145 @@ void a64_hybrid_fp32_mla_6x16 (
"168:" // Height 6: no bias
"tbz %x[flags], #0, 178f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 177f\n"
"tbz x11, #3, 172f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x11, #2, 170f\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x11, #1, 169f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x11, #0, 176f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 176f\n"
"169:" // Height 6: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 176f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 176f\n"
"170:" // Height 6: Partial accumulate: partial_2_8
"tbz x11, #1, 171f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x11, #0, 176f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 176f\n"
"171:" // Height 6: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 176f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 176f\n"
"172:" // Height 6: Partial accumulate: partial_4_0
"tbz x11, #2, 174f\n"
"ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v16.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x11, #1, 173f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x11, #0, 176f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 176f\n"
"173:" // Height 6: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 176f\n"
"ldr s9, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 176f\n"
"174:" // Height 6: Partial accumulate: partial_2_0
"tbz x11, #1, 175f\n"
"ldr d8, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d16, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x11, #0, 176f\n"
"ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v16.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 176f\n"
"175:" // Height 6: Partial accumulate: partial_1_0
"ldr s8, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s16, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"176:" // Height 6: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 179f\n"
@@ -2722,26 +2719,26 @@ void a64_hybrid_fp32_mla_6x16 (
"ldr q9, [x9, #0x10]\n"
"ldr q10, [x9, #0x20]\n"
"ldr q11, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q16, [x25, #0x0]\n"
- "ldr q17, [x25, #0x10]\n"
- "ldr q18, [x25, #0x20]\n"
- "ldr q19, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x22, #0x0]\n"
- "ldr q29, [x22, #0x10]\n"
- "ldr q30, [x22, #0x20]\n"
- "ldr q31, [x22, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"b 179f\n"
"178:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
@@ -2772,8 +2769,8 @@ void a64_hybrid_fp32_mla_6x16 (
"mov x28, #0x0\n"
"180:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 181f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2963,18 +2960,18 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
"ldr q7, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
- "sub x27, x27, #0x4\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
@@ -3118,21 +3115,21 @@ void a64_hybrid_fp32_mla_6x16 (
"cmp x28, x20\n"
"bne 180b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x26, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x21, x22, x20, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 188f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v1.4s\n"
"fmin v9.4s, v9.4s, v1.4s\n"
@@ -3188,127 +3185,127 @@ void a64_hybrid_fp32_mla_6x16 (
"tbz x11, #3, 192f\n"
"st1 { v8.4s }, [x9], #0x10\n"
"st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v13.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
- "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
"tbz x11, #2, 190f\n"
"st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
- "st1 { v22.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v30.4s }, [x22], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
"tbz x11, #1, 189f\n"
"str d11, [x9], #0x8\n"
- "str d15, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
- "str d23, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
"st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
- "st1 { v23.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x22]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
"b 196f\n"
"189:" // Height 6: Partial direct writeback: partial_1_12
"tbz x11, #0, 196f\n"
"str s11, [x9, #0x0]\n"
- "str s15, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
- "str s23, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "str s31, [x22, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
"b 196f\n"
"190:" // Height 6: Partial direct writeback: partial_2_8
"tbz x11, #1, 191f\n"
"str d10, [x9], #0x8\n"
- "str d14, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
- "str d22, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
"st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
- "st1 { v22.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x22]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
"b 196f\n"
"191:" // Height 6: Partial direct writeback: partial_1_8
"tbz x11, #0, 196f\n"
"str s10, [x9, #0x0]\n"
- "str s14, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "str s30, [x22, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
"b 196f\n"
"192:" // Height 6: Partial direct writeback: partial_4_0
"tbz x11, #2, 194f\n"
"st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
"tbz x11, #1, 193f\n"
"str d9, [x9], #0x8\n"
- "str d13, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
- "str d21, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
"st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x22]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
"b 196f\n"
"193:" // Height 6: Partial direct writeback: partial_1_4
"tbz x11, #0, 196f\n"
"str s9, [x9, #0x0]\n"
- "str s13, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
- "str s21, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s29, [x22, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
"b 196f\n"
"194:" // Height 6: Partial direct writeback: partial_2_0
"tbz x11, #1, 195f\n"
"str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
"st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
- "st1 { v20.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x22]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
"b 196f\n"
"195:" // Height 6: Partial direct writeback: partial_1_0
"str s8, [x9, #0x0]\n"
- "str s12, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
- "str s20, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "str s28, [x22, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
"196:" // Height 6: Partial direct writeback: Done
"b 198f\n"
"197:" // Height 6: Full writeback
@@ -3317,26 +3314,26 @@ void a64_hybrid_fp32_mla_6x16 (
"str q10, [x9, #0x20]\n"
"str q11, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q12, [x26, #0x0]\n"
- "str q13, [x26, #0x10]\n"
- "str q14, [x26, #0x20]\n"
- "str q15, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
- "str q20, [x24, #0x0]\n"
- "str q21, [x24, #0x10]\n"
- "str q22, [x24, #0x20]\n"
- "str q23, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x22, #0x0]\n"
- "str q29, [x22, #0x10]\n"
- "str q30, [x22, #0x20]\n"
- "str q31, [x22, #0x30]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
"198:" // Height 6: Writeback done
"subs x11, x11, #0x10\n"
"bgt 167b\n"
@@ -3352,8 +3349,8 @@ void a64_hybrid_fp32_mla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"200:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
index e8f7cdf329..97b6c3f8e0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
@@ -70,7 +70,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 4, 1> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 8, 4, 1> transforms = {};
// Default to the generic kernel
kern_type kernel=a64_hybrid_fp32_mla_8x4;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
index f8b117c546..cf1d3070a8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
@@ -47,19 +47,18 @@ void a64_hybrid_fp32_mla_8x4_a55 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void a64_hybrid_fp32_mla_8x4_a55 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -106,10 +104,10 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"cmp %x[M], #0x2\n"
"bgt 43f\n"
"beq 22f\n"
- "ldr x3, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x3, %x[bias]\n"
"ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x6, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x3, 3f\n"
"ldr q24, [x3, #0x0]\n"
@@ -121,15 +119,15 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"bge 6f\n"
"tbz x4, #1, 4f\n"
"ldr d24, [x6], #0x8\n"
- "mov x20, #0x8\n"
+ "mov x26, #0x8\n"
"tbz x4, #0, 5f\n"
"ld1 { v24.s }[2], [x6]\n"
"b 5f\n"
"4:" // Height 1: Partial accumulate: partial_1_0
"ldr s24, [x6, #0x0]\n"
- "mov x20, #0x0\n"
+ "mov x26, #0x0\n"
"5:" // Height 1: Partial accumulate: Done
- "sub x6, x6, x20\n"
+ "sub x6, x6, x26\n"
"b 8f\n"
"6:" // Height 1: full accumulate
"ldr q24, [x6, #0x0]\n"
@@ -139,16 +137,16 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"8:" // Height 1: setup done
"mov x7, #0x0\n"
"9:" // Height 1: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w8, [x20, x7, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 10f\n"
- "ldr x20, [%x[input_ptr], x7, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x17, [x20, #0x0]\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
"cbnz x7, 11f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x17, x17, x20, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
"b 11f\n"
"10:" // Height 1: setup direct input
"mov x17, %x[input_ptr]\n"
@@ -165,59 +163,59 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"12:" // Height 1: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
"add x17, x17, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x5, x5, #0x40\n"
- "ldr x23, [x17, #0x8]\n"
"ldr d8, [x5, #0x0]\n"
- "sub x8, x8, #0x4\n"
- "ldr x20, [x5, #0x8]\n"
- "cmp x8, #0x8\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr d9, [x5, #0x10]\n"
- "ldr x22, [x5, #0x18]\n"
- "ldr x21, [x5, #0x28]\n"
- "mov v8.d[1], x20\n"
- "ldr x20, [x5, #0x38]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr d10, [x5, #0x20]\n"
- "prfm pldl1keep, [x17, #0x80]\n"
- "mov v9.d[1], x22\n"
- "mov v10.d[1], x21\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"ldr d0, [x17, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d10, [x5, #0x20]\n"
+ "cmp x8, #0x8\n"
"ldr d11, [x5, #0x30]\n"
- "mov v0.d[1], x23\n"
- "mov v11.d[1], x20\n"
+ "ldr x26, [x5, #0x8]\n"
+ "mov v8.d[1], x26\n"
+ "ldr x26, [x5, #0x18]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x17, #0x8]\n"
+ "mov v0.d[1], x26\n"
+ "ldr x26, [x5, #0x28]\n"
+ "mov v10.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"bge 12b\n"
"13:" // Height 1: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
"add x17, x17, #0x10\n"
- "sub x8, x8, #0x4\n"
- "prfm pldl1keep, [x17, #0x80]\n"
- "add x5, x5, #0x40\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
+ "sub x8, x8, #0x4\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
+ "add x5, x5, #0x40\n"
"14:" // Height 1: Multiply loop: Main loop skip
"cbz x8, 16f\n"
"15:" // Height 1: Multiply loop: Odd block loop
"ldr s17, [x17], #0x4\n"
"sub x8, x8, #0x1\n"
"ldr q16, [x5, #0x0]\n"
- "add x5, x5, #0x10\n"
"fmla v24.4s, v16.4s, v17.s[0]\n"
+ "add x5, x5, #0x10\n"
"cbnz x8, 15b\n"
"16:" // Height 1: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x7, x7, #0x1\n"
- "cmp x7, x20\n"
+ "cmp x7, x26\n"
"bne 9b\n"
"prfm pstl1keep, [x6, #0x0]\n"
"tbz %x[flags], #1, 17f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
- "ld1r { v16.4s }, [x20]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmax v24.4s, v24.4s, v16.4s\n"
"17:" // Height 1: No activation
"cmp x4, #0x4\n"
@@ -239,40 +237,40 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"bgt 2b\n"
"b 170f\n"
"22:" // Height 2
- "ldr x3, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x3, %x[bias]\n"
"ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x6, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"23:" // Height 2: Column loop
"cbz x3, 24f\n"
"ldr q24, [x3, #0x0]\n"
- "add x3, x3, #0x10\n"
"mov v25.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"b 29f\n"
"24:" // Height 2: no bias
"tbz %x[flags], #0, 28f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x4, #0x4\n"
- "add x28, x6, x20, LSL #2\n"
+ "add x13, x6, x26, LSL #2\n"
"bge 27f\n"
"tbz x4, #1, 25f\n"
"ldr d24, [x6], #0x8\n"
- "mov x20, #0x8\n"
- "ldr d25, [x28], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
"tbz x4, #0, 26f\n"
"ld1 { v24.s }[2], [x6]\n"
- "ld1 { v25.s }[2], [x28]\n"
+ "ld1 { v25.s }[2], [x13]\n"
"b 26f\n"
"25:" // Height 2: Partial accumulate: partial_1_0
"ldr s24, [x6, #0x0]\n"
- "mov x20, #0x0\n"
- "ldr s25, [x28, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
"26:" // Height 2: Partial accumulate: Done
- "sub x6, x6, x20\n"
+ "sub x6, x6, x26\n"
"b 29f\n"
"27:" // Height 2: full accumulate
"ldr q24, [x6, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
"b 29f\n"
"28:" // Height 2: no accumulate
"movi v24.16b, #0x0\n"
@@ -280,22 +278,22 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"29:" // Height 2: setup done
"mov x7, #0x0\n"
"30:" // Height 2: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w8, [x20, x7, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x20, [%x[input_ptr], x7, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x17, [x20, #0x0]\n"
- "ldr x16, [x20, #0x8]\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
"cbnz x7, 32f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x17, x17, x20, LSL #2\n"
- "add x16, x16, x20, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
"b 32f\n"
"31:" // Height 2: setup direct input
"mov x17, %x[input_ptr]\n"
- "add x16, x17, x21, LSL #2\n"
+ "add x16, x17, x27, LSL #2\n"
"32:" // Height 2: input setup done
"cmp x8, #0x4\n"
"blt 35f\n"
@@ -312,49 +310,49 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
"add x16, x16, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x5, x5, #0x40\n"
- "ldr x24, [x17, #0x8]\n"
"ldr d8, [x5, #0x0]\n"
- "sub x8, x8, #0x4\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr x20, [x5, #0x8]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
"ldr d9, [x5, #0x10]\n"
- "ldr x23, [x5, #0x18]\n"
- "cmp x8, #0x8\n"
- "ldr x22, [x5, #0x28]\n"
- "mov v8.d[1], x20\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x21, [x16, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
"ldr d10, [x5, #0x20]\n"
- "ldr x20, [x5, #0x38]\n"
- "mov v9.d[1], x23\n"
- "prfm pldl1keep, [x17, #0x80]\n"
+ "ldr x27, [x5, #0x8]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"ldr d1, [x16, #0x0]\n"
+ "sub x8, x8, #0x4\n"
"ldr d11, [x5, #0x30]\n"
- "mov v10.d[1], x22\n"
+ "cmp x8, #0x8\n"
+ "ldr x26, [x5, #0x18]\n"
+ "mov v8.d[1], x27\n"
+ "ldr x27, [x17, #0x8]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x16, #0x8]\n"
+ "mov v0.d[1], x27\n"
+ "ldr x27, [x5, #0x28]\n"
+ "mov v1.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v10.d[1], x27\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"prfm pldl1keep, [x16, #0x80]\n"
- "mov v0.d[1], x24\n"
- "mov v1.d[1], x21\n"
- "mov v11.d[1], x20\n"
"bge 33b\n"
"34:" // Height 2: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
"add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
"add x16, x16, #0x10\n"
- "sub x8, x8, #0x4\n"
- "prfm pldl1keep, [x17, #0x80]\n"
- "add x5, x5, #0x40\n"
- "prfm pldl1keep, [x16, #0x80]\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "add x5, x5, #0x40\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"35:" // Height 2: Multiply loop: Main loop skip
@@ -364,26 +362,26 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"sub x8, x8, #0x1\n"
"ldr s17, [x16], #0x4\n"
"ldr q16, [x5, #0x0]\n"
- "add x5, x5, #0x10\n"
"fmla v24.4s, v16.4s, v18.s[0]\n"
"fmla v25.4s, v16.4s, v17.s[0]\n"
+ "add x5, x5, #0x10\n"
"cbnz x8, 36b\n"
"37:" // Height 2: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x7, x7, #0x1\n"
- "cmp x7, x20\n"
+ "cmp x7, x26\n"
"bne 30b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
"prfm pstl1keep, [x6, #0x0]\n"
- "add x28, x6, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
- "ld1r { v16.4s }, [x20]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmax v24.4s, v24.4s, v16.4s\n"
"fmax v25.4s, v25.4s, v16.4s\n"
"38:" // Height 2: No activation
@@ -391,65 +389,65 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"bge 41f\n"
"tbz x4, #1, 39f\n"
"str d24, [x6], #0x8\n"
- "str d25, [x28], #0x8\n"
+ "str d25, [x13], #0x8\n"
"tbz x4, #0, 40f\n"
"st1 { v24.s }[2], [x6]\n"
- "st1 { v25.s }[2], [x28]\n"
+ "st1 { v25.s }[2], [x13]\n"
"b 40f\n"
"39:" // Height 2: Partial direct writeback: partial_1_0
"str s24, [x6, #0x0]\n"
- "str s25, [x28, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
"40:" // Height 2: Partial direct writeback: Done
"b 42f\n"
"41:" // Height 2: Full writeback
"str q24, [x6, #0x0]\n"
"add x6, x6, #0x10\n"
- "str q25, [x28, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
"42:" // Height 2: Writeback done
"subs x4, x4, #0x4\n"
"bgt 23b\n"
"b 170f\n"
"43:" // Height 3
- "ldr x3, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x3, %x[bias]\n"
"ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x6, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"44:" // Height 3: Column loop
"cbz x3, 45f\n"
"ldr q24, [x3, #0x0]\n"
- "add x3, x3, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"b 50f\n"
"45:" // Height 3: no bias
"tbz %x[flags], #0, 49f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
"cmp x4, #0x4\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
"bge 48f\n"
"tbz x4, #1, 46f\n"
"ldr d24, [x6], #0x8\n"
- "mov x20, #0x8\n"
- "ldr d25, [x28], #0x8\n"
- "ldr d26, [x27], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
"tbz x4, #0, 47f\n"
"ld1 { v24.s }[2], [x6]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
"b 47f\n"
"46:" // Height 3: Partial accumulate: partial_1_0
"ldr s24, [x6, #0x0]\n"
- "mov x20, #0x0\n"
- "ldr s25, [x28, #0x0]\n"
- "ldr s26, [x27, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
"47:" // Height 3: Partial accumulate: Done
- "sub x6, x6, x20\n"
+ "sub x6, x6, x26\n"
"b 50f\n"
"48:" // Height 3: full accumulate
"ldr q24, [x6, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
"b 50f\n"
"49:" // Height 3: no accumulate
"movi v24.16b, #0x0\n"
@@ -458,25 +456,25 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"50:" // Height 3: setup done
"mov x7, #0x0\n"
"51:" // Height 3: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w8, [x20, x7, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 52f\n"
- "ldr x20, [%x[input_ptr], x7, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x17, [x20, #0x0]\n"
- "ldr x16, [x20, #0x8]\n"
- "ldr x15, [x20, #0x10]\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
"cbnz x7, 53f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x17, x17, x20, LSL #2\n"
- "add x16, x16, x20, LSL #2\n"
- "add x15, x15, x20, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
"b 53f\n"
"52:" // Height 3: setup direct input
"mov x17, %x[input_ptr]\n"
- "add x16, x17, x21, LSL #2\n"
- "add x15, x16, x21, LSL #2\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
"53:" // Height 3: input setup done
"cmp x8, #0x4\n"
"blt 56f\n"
@@ -496,42 +494,42 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
"add x15, x15, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x5, x5, #0x40\n"
- "ldr x25, [x17, #0x8]\n"
"ldr d8, [x5, #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "ldr x20, [x5, #0x8]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
"ldr d9, [x5, #0x10]\n"
- "ldr x24, [x5, #0x18]\n"
- "sub x8, x8, #0x4\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x23, [x5, #0x28]\n"
+ "ldr x28, [x5, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "ldr x22, [x16, #0x8]\n"
+ "ldr x27, [x5, #0x18]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"ldr d10, [x5, #0x20]\n"
- "ldr x21, [x15, #0x8]\n"
- "cmp x8, #0x8\n"
+ "ldr x26, [x5, #0x28]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"ldr d2, [x15, #0x0]\n"
+ "sub x8, x8, #0x4\n"
"ldr d11, [x5, #0x30]\n"
- "mov v8.d[1], x20\n"
- "ldr x20, [x5, #0x38]\n"
- "mov v9.d[1], x24\n"
+ "cmp x8, #0x8\n"
+ "ldr x9, [x17, #0x8]\n"
+ "mov v8.d[1], x28\n"
+ "ldr x28, [x16, #0x8]\n"
+ "mov v9.d[1], x27\n"
+ "ldr x27, [x15, #0x8]\n"
+ "mov v10.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v0.d[1], x9\n"
+ "mov v1.d[1], x28\n"
"prfm pldl1keep, [x17, #0x80]\n"
- "mov v10.d[1], x23\n"
+ "mov v2.d[1], x27\n"
"prfm pldl1keep, [x16, #0x80]\n"
- "mov v0.d[1], x25\n"
+ "mov v11.d[1], x26\n"
"prfm pldl1keep, [x15, #0x80]\n"
- "mov v1.d[1], x22\n"
- "mov v2.d[1], x21\n"
- "mov v11.d[1], x20\n"
"bge 54b\n"
"55:" // Height 3: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
@@ -540,16 +538,16 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
"add x15, x15, #0x10\n"
- "sub x8, x8, #0x4\n"
- "prfm pldl1keep, [x17, #0x80]\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x16, #0x80]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x5, x5, #0x40\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x15, #0x80]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
@@ -562,30 +560,30 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr s18, [x16], #0x4\n"
"ldr s17, [x15], #0x4\n"
"ldr q16, [x5, #0x0]\n"
- "add x5, x5, #0x10\n"
"fmla v24.4s, v16.4s, v19.s[0]\n"
"fmla v25.4s, v16.4s, v18.s[0]\n"
+ "add x5, x5, #0x10\n"
"fmla v26.4s, v16.4s, v17.s[0]\n"
"cbnz x8, 57b\n"
"58:" // Height 3: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x7, x7, #0x1\n"
- "cmp x7, x20\n"
+ "cmp x7, x26\n"
"bne 51b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
"prfm pstl1keep, [x6, #0x0]\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
"tbz %x[flags], #1, 59f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
- "ld1r { v16.4s }, [x20]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmax v24.4s, v24.4s, v16.4s\n"
"fmax v25.4s, v25.4s, v16.4s\n"
"fmax v26.4s, v26.4s, v16.4s\n"
@@ -594,75 +592,75 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"bge 62f\n"
"tbz x4, #1, 60f\n"
"str d24, [x6], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
"tbz x4, #0, 61f\n"
"st1 { v24.s }[2], [x6]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
"b 61f\n"
"60:" // Height 3: Partial direct writeback: partial_1_0
"str s24, [x6, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
"61:" // Height 3: Partial direct writeback: Done
"b 63f\n"
"62:" // Height 3: Full writeback
"str q24, [x6, #0x0]\n"
"add x6, x6, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
"63:" // Height 3: Writeback done
"subs x4, x4, #0x4\n"
"bgt 44b\n"
"b 170f\n"
"64:" // Height 4
- "ldr x3, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x3, %x[bias]\n"
"ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x6, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"65:" // Height 4: Column loop
"cbz x3, 66f\n"
"ldr q24, [x3, #0x0]\n"
- "add x3, x3, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"b 71f\n"
"66:" // Height 4: no bias
"tbz %x[flags], #0, 70f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
"cmp x4, #0x4\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
"bge 69f\n"
"tbz x4, #1, 67f\n"
"ldr d24, [x6], #0x8\n"
- "mov x20, #0x8\n"
- "ldr d25, [x28], #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
"tbz x4, #0, 68f\n"
"ld1 { v24.s }[2], [x6]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
"b 68f\n"
"67:" // Height 4: Partial accumulate: partial_1_0
"ldr s24, [x6, #0x0]\n"
- "mov x20, #0x0\n"
- "ldr s25, [x28, #0x0]\n"
- "ldr s26, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
"68:" // Height 4: Partial accumulate: Done
- "sub x6, x6, x20\n"
+ "sub x6, x6, x26\n"
"b 71f\n"
"69:" // Height 4: full accumulate
"ldr q24, [x6, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
- "ldr q27, [x26, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
"b 71f\n"
"70:" // Height 4: no accumulate
"movi v24.16b, #0x0\n"
@@ -672,28 +670,28 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"71:" // Height 4: setup done
"mov x7, #0x0\n"
"72:" // Height 4: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w8, [x20, x7, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x20, [%x[input_ptr], x7, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x17, [x20, #0x0]\n"
- "ldr x16, [x20, #0x8]\n"
- "ldr x15, [x20, #0x10]\n"
- "ldr x14, [x20, #0x18]\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
"cbnz x7, 74f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x17, x17, x20, LSL #2\n"
- "add x16, x16, x20, LSL #2\n"
- "add x15, x15, x20, LSL #2\n"
- "add x14, x14, x20, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
"b 74f\n"
"73:" // Height 4: setup direct input
"mov x17, %x[input_ptr]\n"
- "add x16, x17, x21, LSL #2\n"
- "add x15, x16, x21, LSL #2\n"
- "add x14, x15, x21, LSL #2\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
"74:" // Height 4: input setup done
"cmp x8, #0x4\n"
"blt 77f\n"
@@ -721,22 +719,22 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "ldr x21, [x5, #0x8]\n"
+ "ldr x27, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
"ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x20, [x5, #0x18]\n"
+ "ldr x26, [x5, #0x18]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "ldr x25, [x5, #0x28]\n"
+ "ldr x11, [x5, #0x28]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x24, [x17, #0x8]\n"
+ "ldr x10, [x17, #0x8]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"ldr d1, [x16, #0x0]\n"
- "ldr x23, [x16, #0x8]\n"
+ "ldr x9, [x16, #0x8]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
@@ -744,21 +742,21 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"sub x8, x8, #0x4\n"
"ldr d11, [x5, #0x30]\n"
"cmp x8, #0x8\n"
- "ldr x22, [x15, #0x8]\n"
- "mov v8.d[1], x21\n"
- "ldr x21, [x14, #0x8]\n"
- "mov v9.d[1], x20\n"
- "ldr x20, [x5, #0x38]\n"
- "mov v10.d[1], x25\n"
+ "ldr x28, [x15, #0x8]\n"
+ "mov v8.d[1], x27\n"
+ "ldr x27, [x14, #0x8]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v10.d[1], x11\n"
"prfm pldl1keep, [x17, #0x80]\n"
- "mov v0.d[1], x24\n"
+ "mov v0.d[1], x10\n"
"prfm pldl1keep, [x16, #0x80]\n"
- "mov v1.d[1], x23\n"
- "mov v2.d[1], x22\n"
+ "mov v1.d[1], x9\n"
+ "mov v2.d[1], x28\n"
"prfm pldl1keep, [x15, #0x80]\n"
- "mov v3.d[1], x21\n"
+ "mov v3.d[1], x27\n"
"prfm pldl1keep, [x14, #0x80]\n"
- "mov v11.d[1], x20\n"
+ "mov v11.d[1], x26\n"
"bge 75b\n"
"76:" // Height 4: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
@@ -796,34 +794,34 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr s18, [x15], #0x4\n"
"ldr s17, [x14], #0x4\n"
"ldr q16, [x5, #0x0]\n"
- "add x5, x5, #0x10\n"
"fmla v24.4s, v16.4s, v20.s[0]\n"
"fmla v25.4s, v16.4s, v19.s[0]\n"
+ "add x5, x5, #0x10\n"
"fmla v26.4s, v16.4s, v18.s[0]\n"
"fmla v27.4s, v16.4s, v17.s[0]\n"
"cbnz x8, 78b\n"
"79:" // Height 4: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x7, x7, #0x1\n"
- "cmp x7, x20\n"
+ "cmp x7, x26\n"
"bne 72b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
"prfm pstl1keep, [x6, #0x0]\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x26, x27, x20, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
- "ld1r { v16.4s }, [x20]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmax v24.4s, v24.4s, v16.4s\n"
"fmax v25.4s, v25.4s, v16.4s\n"
"fmax v26.4s, v26.4s, v16.4s\n"
@@ -833,85 +831,85 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"bge 83f\n"
"tbz x4, #1, 81f\n"
"str d24, [x6], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
- "str d27, [x26], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
"tbz x4, #0, 82f\n"
"st1 { v24.s }[2], [x6]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
- "st1 { v27.s }[2], [x26]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
"b 82f\n"
"81:" // Height 4: Partial direct writeback: partial_1_0
"str s24, [x6, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
- "str s27, [x26, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
"82:" // Height 4: Partial direct writeback: Done
"b 84f\n"
"83:" // Height 4: Full writeback
"str q24, [x6, #0x0]\n"
"add x6, x6, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
- "str q27, [x26, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
"84:" // Height 4: Writeback done
"subs x4, x4, #0x4\n"
"bgt 65b\n"
"b 170f\n"
"85:" // Height 5
- "ldr x3, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x3, %x[bias]\n"
"ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x6, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"86:" // Height 5: Column loop
"cbz x3, 87f\n"
"ldr q24, [x3, #0x0]\n"
- "add x3, x3, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"b 92f\n"
"87:" // Height 5: no bias
"tbz %x[flags], #0, 91f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
"cmp x4, #0x4\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
"bge 90f\n"
"tbz x4, #1, 88f\n"
"ldr d24, [x6], #0x8\n"
- "mov x20, #0x8\n"
- "ldr d25, [x28], #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
"tbz x4, #0, 89f\n"
"ld1 { v24.s }[2], [x6]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
- "ld1 { v28.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
"b 89f\n"
"88:" // Height 5: Partial accumulate: partial_1_0
"ldr s24, [x6, #0x0]\n"
- "mov x20, #0x0\n"
- "ldr s25, [x28, #0x0]\n"
- "ldr s26, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
- "ldr s28, [x25, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
"89:" // Height 5: Partial accumulate: Done
- "sub x6, x6, x20\n"
+ "sub x6, x6, x26\n"
"b 92f\n"
"90:" // Height 5: full accumulate
"ldr q24, [x6, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
- "ldr q27, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
"b 92f\n"
"91:" // Height 5: no accumulate
"movi v24.16b, #0x0\n"
@@ -922,31 +920,31 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"92:" // Height 5: setup done
"mov x7, #0x0\n"
"93:" // Height 5: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w8, [x20, x7, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 94f\n"
- "ldr x20, [%x[input_ptr], x7, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x17, [x20, #0x0]\n"
- "ldr x16, [x20, #0x8]\n"
- "ldr x15, [x20, #0x10]\n"
- "ldr x14, [x20, #0x18]\n"
- "ldr x13, [x20, #0x20]\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
"cbnz x7, 95f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x17, x17, x20, LSL #2\n"
- "add x16, x16, x20, LSL #2\n"
- "add x15, x15, x20, LSL #2\n"
- "add x14, x14, x20, LSL #2\n"
- "add x13, x13, x20, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
"b 95f\n"
"94:" // Height 5: setup direct input
"mov x17, %x[input_ptr]\n"
- "add x16, x17, x21, LSL #2\n"
- "add x15, x16, x21, LSL #2\n"
- "add x14, x15, x21, LSL #2\n"
- "add x13, x14, x21, LSL #2\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
"95:" // Height 5: input setup done
"cmp x8, #0x4\n"
"blt 98f\n"
@@ -977,19 +975,19 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "ldr x20, [x5, #0x8]\n"
+ "ldr x27, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "ldr x27, [x5, #0x18]\n"
+ "ldr x26, [x5, #0x18]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
"ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x26, [x5, #0x28]\n"
+ "ldr x12, [x5, #0x28]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "ldr x25, [x17, #0x8]\n"
+ "ldr x11, [x17, #0x8]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x24, [x16, #0x8]\n"
+ "ldr x10, [x16, #0x8]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "ldr x23, [x15, #0x8]\n"
+ "ldr x9, [x15, #0x8]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
"ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
@@ -1000,28 +998,28 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
"ldr d3, [x14, #0x0]\n"
- "ldr x22, [x14, #0x8]\n"
+ "ldr x28, [x14, #0x8]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
"ldr d4, [x13, #0x0]\n"
"sub x8, x8, #0x4\n"
"ldr d11, [x5, #0x30]\n"
"cmp x8, #0x8\n"
- "ldr x21, [x13, #0x8]\n"
- "mov v8.d[1], x20\n"
- "ldr x20, [x5, #0x38]\n"
- "mov v9.d[1], x27\n"
+ "mov v8.d[1], x27\n"
+ "ldr x27, [x13, #0x8]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
"prfm pldl1keep, [x17, #0x80]\n"
- "mov v10.d[1], x26\n"
+ "mov v10.d[1], x12\n"
"prfm pldl1keep, [x16, #0x80]\n"
- "mov v0.d[1], x25\n"
+ "mov v0.d[1], x11\n"
"prfm pldl1keep, [x15, #0x80]\n"
- "mov v1.d[1], x24\n"
+ "mov v1.d[1], x10\n"
"prfm pldl1keep, [x14, #0x80]\n"
- "mov v2.d[1], x23\n"
- "mov v3.d[1], x22\n"
+ "mov v2.d[1], x9\n"
+ "mov v3.d[1], x28\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v4.d[1], x21\n"
- "mov v11.d[1], x20\n"
+ "mov v4.d[1], x27\n"
+ "mov v11.d[1], x26\n"
"bge 96b\n"
"97:" // Height 5: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
@@ -1066,38 +1064,38 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr s18, [x14], #0x4\n"
"ldr s17, [x13], #0x4\n"
"ldr q16, [x5, #0x0]\n"
- "add x5, x5, #0x10\n"
"fmla v24.4s, v16.4s, v21.s[0]\n"
"fmla v25.4s, v16.4s, v20.s[0]\n"
+ "add x5, x5, #0x10\n"
"fmla v26.4s, v16.4s, v19.s[0]\n"
"fmla v27.4s, v16.4s, v18.s[0]\n"
"fmla v28.4s, v16.4s, v17.s[0]\n"
"cbnz x8, 99b\n"
"100:" // Height 5: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x7, x7, #0x1\n"
- "cmp x7, x20\n"
+ "cmp x7, x26\n"
"bne 93b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
"prfm pstl1keep, [x6, #0x0]\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x26, x27, x20, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
"tbz %x[flags], #1, 101f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
- "ld1r { v16.4s }, [x20]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmax v24.4s, v24.4s, v16.4s\n"
"fmax v25.4s, v25.4s, v16.4s\n"
"fmax v26.4s, v26.4s, v16.4s\n"
@@ -1108,95 +1106,95 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"bge 104f\n"
"tbz x4, #1, 102f\n"
"str d24, [x6], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
- "str d27, [x26], #0x8\n"
- "str d28, [x25], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
"tbz x4, #0, 103f\n"
"st1 { v24.s }[2], [x6]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
- "st1 { v27.s }[2], [x26]\n"
- "st1 { v28.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
"b 103f\n"
"102:" // Height 5: Partial direct writeback: partial_1_0
"str s24, [x6, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
- "str s27, [x26, #0x0]\n"
- "str s28, [x25, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
"103:" // Height 5: Partial direct writeback: Done
"b 105f\n"
"104:" // Height 5: Full writeback
"str q24, [x6, #0x0]\n"
"add x6, x6, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
- "str q27, [x26, #0x0]\n"
- "str q28, [x25, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
"105:" // Height 5: Writeback done
"subs x4, x4, #0x4\n"
"bgt 86b\n"
"b 170f\n"
"106:" // Height 6
- "ldr x3, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x3, %x[bias]\n"
"ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x6, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"107:" // Height 6: Column loop
"cbz x3, 108f\n"
"ldr q24, [x3, #0x0]\n"
- "add x3, x3, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
"b 113f\n"
"108:" // Height 6: no bias
"tbz %x[flags], #0, 112f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
"cmp x4, #0x4\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
"bge 111f\n"
"tbz x4, #1, 109f\n"
"ldr d24, [x6], #0x8\n"
- "mov x20, #0x8\n"
- "ldr d25, [x28], #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d29, [x24], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "ldr d29, [x9], #0x8\n"
"tbz x4, #0, 110f\n"
"ld1 { v24.s }[2], [x6]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
- "ld1 { v28.s }[2], [x25]\n"
- "ld1 { v29.s }[2], [x24]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x9]\n"
"b 110f\n"
"109:" // Height 6: Partial accumulate: partial_1_0
"ldr s24, [x6, #0x0]\n"
- "mov x20, #0x0\n"
- "ldr s25, [x28, #0x0]\n"
- "ldr s26, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
- "ldr s28, [x25, #0x0]\n"
- "ldr s29, [x24, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
+ "ldr s29, [x9, #0x0]\n"
"110:" // Height 6: Partial accumulate: Done
- "sub x6, x6, x20\n"
+ "sub x6, x6, x26\n"
"b 113f\n"
"111:" // Height 6: full accumulate
"ldr q24, [x6, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
- "ldr q27, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q29, [x24, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q29, [x9, #0x0]\n"
"b 113f\n"
"112:" // Height 6: no accumulate
"movi v24.16b, #0x0\n"
@@ -1208,34 +1206,34 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"113:" // Height 6: setup done
"mov x7, #0x0\n"
"114:" // Height 6: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w8, [x20, x7, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
- "ldr x20, [%x[input_ptr], x7, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x17, [x20, #0x0]\n"
- "ldr x16, [x20, #0x8]\n"
- "ldr x15, [x20, #0x10]\n"
- "ldr x14, [x20, #0x18]\n"
- "ldr x13, [x20, #0x20]\n"
- "ldr x12, [x20, #0x28]\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "ldr x12, [x26, #0x28]\n"
"cbnz x7, 116f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x17, x17, x20, LSL #2\n"
- "add x16, x16, x20, LSL #2\n"
- "add x15, x15, x20, LSL #2\n"
- "add x14, x14, x20, LSL #2\n"
- "add x13, x13, x20, LSL #2\n"
- "add x12, x12, x20, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
+ "add x12, x12, x26, LSL #2\n"
"b 116f\n"
"115:" // Height 6: setup direct input
"mov x17, %x[input_ptr]\n"
- "add x16, x17, x21, LSL #2\n"
- "add x15, x16, x21, LSL #2\n"
- "add x14, x15, x21, LSL #2\n"
- "add x13, x14, x21, LSL #2\n"
- "add x12, x13, x21, LSL #2\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
+ "add x12, x13, x27, LSL #2\n"
"116:" // Height 6: input setup done
"cmp x8, #0x4\n"
"blt 119f\n"
@@ -1279,13 +1277,13 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v24.4s, v10.4s, v0.s[2]\n"
"ldr x26, [x17, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "ldr x25, [x16, #0x8]\n"
+ "ldr x11, [x16, #0x8]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x24, [x15, #0x8]\n"
+ "ldr x10, [x15, #0x8]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "ldr x23, [x14, #0x8]\n"
+ "sub x8, x8, #0x4\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "ldr x22, [x13, #0x8]\n"
+ "cmp x8, #0x8\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
"ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
@@ -1301,26 +1299,26 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v29.4s, v11.4s, v5.s[3]\n"
"ldr d5, [x12, #0x0]\n"
"ldr d11, [x5, #0x30]\n"
- "sub x8, x8, #0x4\n"
- "ldr x21, [x12, #0x8]\n"
- "cmp x8, #0x8\n"
- "ldr x20, [x5, #0x38]\n"
"mov v8.d[1], x9\n"
- "prfm pldl1keep, [x17, #0x80]\n"
+ "ldr x9, [x14, #0x8]\n"
"mov v9.d[1], x28\n"
- "prfm pldl1keep, [x16, #0x80]\n"
+ "ldr x28, [x13, #0x8]\n"
"mov v10.d[1], x27\n"
- "prfm pldl1keep, [x15, #0x80]\n"
+ "ldr x27, [x12, #0x8]\n"
"mov v0.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v1.d[1], x11\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
+ "mov v2.d[1], x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v3.d[1], x9\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
+ "mov v4.d[1], x28\n"
"prfm pldl1keep, [x14, #0x80]\n"
- "mov v1.d[1], x25\n"
+ "mov v5.d[1], x27\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v2.d[1], x24\n"
+ "mov v11.d[1], x26\n"
"prfm pldl1keep, [x12, #0x80]\n"
- "mov v3.d[1], x23\n"
- "mov v4.d[1], x22\n"
- "mov v5.d[1], x21\n"
- "mov v11.d[1], x20\n"
"bge 117b\n"
"118:" // Height 6: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
@@ -1372,42 +1370,42 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr s18, [x13], #0x4\n"
"ldr s17, [x12], #0x4\n"
"ldr q16, [x5, #0x0]\n"
- "add x5, x5, #0x10\n"
"fmla v24.4s, v16.4s, v22.s[0]\n"
"fmla v25.4s, v16.4s, v21.s[0]\n"
+ "add x5, x5, #0x10\n"
"fmla v26.4s, v16.4s, v20.s[0]\n"
"fmla v27.4s, v16.4s, v19.s[0]\n"
"fmla v28.4s, v16.4s, v18.s[0]\n"
"fmla v29.4s, v16.4s, v17.s[0]\n"
"cbnz x8, 120b\n"
"121:" // Height 6: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x7, x7, #0x1\n"
- "cmp x7, x20\n"
+ "cmp x7, x26\n"
"bne 114b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
"prfm pstl1keep, [x6, #0x0]\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x26, x27, x20, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x24, x25, x20, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
- "ld1r { v16.4s }, [x20]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmax v24.4s, v24.4s, v16.4s\n"
"fmax v25.4s, v25.4s, v16.4s\n"
"fmax v26.4s, v26.4s, v16.4s\n"
@@ -1419,51 +1417,51 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"bge 125f\n"
"tbz x4, #1, 123f\n"
"str d24, [x6], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
- "str d27, [x26], #0x8\n"
- "str d28, [x25], #0x8\n"
- "str d29, [x24], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "str d29, [x9], #0x8\n"
"tbz x4, #0, 124f\n"
"st1 { v24.s }[2], [x6]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
- "st1 { v27.s }[2], [x26]\n"
- "st1 { v28.s }[2], [x25]\n"
- "st1 { v29.s }[2], [x24]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
+ "st1 { v29.s }[2], [x9]\n"
"b 124f\n"
"123:" // Height 6: Partial direct writeback: partial_1_0
"str s24, [x6, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
- "str s27, [x26, #0x0]\n"
- "str s28, [x25, #0x0]\n"
- "str s29, [x24, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
+ "str s29, [x9, #0x0]\n"
"124:" // Height 6: Partial direct writeback: Done
"b 126f\n"
"125:" // Height 6: Full writeback
"str q24, [x6, #0x0]\n"
"add x6, x6, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
- "str q27, [x26, #0x0]\n"
- "str q28, [x25, #0x0]\n"
- "str q29, [x24, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
+ "str q29, [x9, #0x0]\n"
"126:" // Height 6: Writeback done
"subs x4, x4, #0x4\n"
"bgt 107b\n"
"b 170f\n"
"127:" // Height 7
- "ldr x3, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x3, %x[bias]\n"
"ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x6, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"128:" // Height 7: Column loop
"cbz x3, 129f\n"
"ldr q24, [x3, #0x0]\n"
- "add x3, x3, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1471,53 +1469,53 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"b 134f\n"
"129:" // Height 7: no bias
"tbz %x[flags], #0, 133f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
"cmp x4, #0x4\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x28, x9, x26, LSL #2\n"
"bge 132f\n"
"tbz x4, #1, 130f\n"
"ldr d24, [x6], #0x8\n"
- "mov x20, #0x8\n"
- "ldr d25, [x28], #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d30, [x23], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "ldr d29, [x9], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
"tbz x4, #0, 131f\n"
"ld1 { v24.s }[2], [x6]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
- "ld1 { v28.s }[2], [x25]\n"
- "ld1 { v29.s }[2], [x24]\n"
- "ld1 { v30.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x9]\n"
+ "ld1 { v30.s }[2], [x28]\n"
"b 131f\n"
"130:" // Height 7: Partial accumulate: partial_1_0
"ldr s24, [x6, #0x0]\n"
- "mov x20, #0x0\n"
- "ldr s25, [x28, #0x0]\n"
- "ldr s26, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
- "ldr s28, [x25, #0x0]\n"
- "ldr s29, [x24, #0x0]\n"
- "ldr s30, [x23, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
+ "ldr s29, [x9, #0x0]\n"
+ "ldr s30, [x28, #0x0]\n"
"131:" // Height 7: Partial accumulate: Done
- "sub x6, x6, x20\n"
+ "sub x6, x6, x26\n"
"b 134f\n"
"132:" // Height 7: full accumulate
"ldr q24, [x6, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
- "ldr q27, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q29, [x24, #0x0]\n"
- "ldr q30, [x23, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q29, [x9, #0x0]\n"
+ "ldr q30, [x28, #0x0]\n"
"b 134f\n"
"133:" // Height 7: no accumulate
"movi v24.16b, #0x0\n"
@@ -1530,37 +1528,37 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"134:" // Height 7: setup done
"mov x7, #0x0\n"
"135:" // Height 7: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w8, [x20, x7, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 136f\n"
- "ldr x20, [%x[input_ptr], x7, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x17, [x20, #0x0]\n"
- "ldr x16, [x20, #0x8]\n"
- "ldr x15, [x20, #0x10]\n"
- "ldr x14, [x20, #0x18]\n"
- "ldr x13, [x20, #0x20]\n"
- "ldr x12, [x20, #0x28]\n"
- "ldr x11, [x20, #0x30]\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "ldr x12, [x26, #0x28]\n"
+ "ldr x11, [x26, #0x30]\n"
"cbnz x7, 137f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x17, x17, x20, LSL #2\n"
- "add x16, x16, x20, LSL #2\n"
- "add x15, x15, x20, LSL #2\n"
- "add x14, x14, x20, LSL #2\n"
- "add x13, x13, x20, LSL #2\n"
- "add x12, x12, x20, LSL #2\n"
- "add x11, x11, x20, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
+ "add x12, x12, x26, LSL #2\n"
+ "add x11, x11, x26, LSL #2\n"
"b 137f\n"
"136:" // Height 7: setup direct input
"mov x17, %x[input_ptr]\n"
- "add x16, x17, x21, LSL #2\n"
- "add x15, x16, x21, LSL #2\n"
- "add x14, x15, x21, LSL #2\n"
- "add x13, x14, x21, LSL #2\n"
- "add x12, x13, x21, LSL #2\n"
- "add x11, x12, x21, LSL #2\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
+ "add x12, x13, x27, LSL #2\n"
+ "add x11, x12, x27, LSL #2\n"
"137:" // Height 7: input setup done
"cmp x8, #0x4\n"
"blt 140f\n"
@@ -1597,27 +1595,27 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "ldr x10, [x5, #0x8]\n"
+ "ldr x26, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "ldr x9, [x5, #0x18]\n"
+ "ldr x10, [x5, #0x18]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "ldr x28, [x5, #0x28]\n"
+ "ldr x9, [x5, #0x28]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "ldr x27, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x8]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
"ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x26, [x16, #0x8]\n"
+ "ldr x27, [x16, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "ldr x25, [x15, #0x8]\n"
+ "sub x8, x8, #0x4\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x24, [x14, #0x8]\n"
+ "cmp x8, #0x8\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "ldr x23, [x13, #0x8]\n"
+ "mov v8.d[1], x26\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "ldr x22, [x12, #0x8]\n"
+ "ldr x26, [x15, #0x8]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
- "ldr x21, [x11, #0x8]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
"ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
@@ -1635,27 +1633,27 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v30.4s, v11.4s, v6.s[3]\n"
"ldr d6, [x11, #0x0]\n"
"ldr d11, [x5, #0x30]\n"
- "sub x8, x8, #0x4\n"
- "ldr x20, [x5, #0x38]\n"
- "cmp x8, #0x8\n"
- "mov v8.d[1], x10\n"
- "prfm pldl1keep, [x17, #0x80]\n"
+ "mov v9.d[1], x10\n"
+ "ldr x10, [x14, #0x8]\n"
+ "mov v10.d[1], x9\n"
+ "ldr x9, [x13, #0x8]\n"
+ "mov v0.d[1], x28\n"
+ "ldr x28, [x12, #0x8]\n"
+ "mov v1.d[1], x27\n"
+ "ldr x27, [x11, #0x8]\n"
+ "mov v2.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v3.d[1], x10\n"
"prfm pldl1keep, [x16, #0x80]\n"
- "mov v9.d[1], x9\n"
+ "mov v4.d[1], x9\n"
"prfm pldl1keep, [x15, #0x80]\n"
- "mov v10.d[1], x28\n"
+ "mov v5.d[1], x28\n"
"prfm pldl1keep, [x14, #0x80]\n"
- "mov v0.d[1], x27\n"
+ "mov v6.d[1], x27\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v1.d[1], x26\n"
+ "mov v11.d[1], x26\n"
"prfm pldl1keep, [x12, #0x80]\n"
- "mov v2.d[1], x25\n"
"prfm pldl1keep, [x11, #0x80]\n"
- "mov v3.d[1], x24\n"
- "mov v4.d[1], x23\n"
- "mov v5.d[1], x22\n"
- "mov v6.d[1], x21\n"
- "mov v11.d[1], x20\n"
"bge 138b\n"
"139:" // Height 7: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
@@ -1714,9 +1712,9 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr s18, [x12], #0x4\n"
"ldr s17, [x11], #0x4\n"
"ldr q16, [x5, #0x0]\n"
- "add x5, x5, #0x10\n"
"fmla v24.4s, v16.4s, v23.s[0]\n"
"fmla v25.4s, v16.4s, v22.s[0]\n"
+ "add x5, x5, #0x10\n"
"fmla v26.4s, v16.4s, v21.s[0]\n"
"fmla v27.4s, v16.4s, v20.s[0]\n"
"fmla v28.4s, v16.4s, v19.s[0]\n"
@@ -1724,36 +1722,36 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v30.4s, v16.4s, v17.s[0]\n"
"cbnz x8, 141b\n"
"142:" // Height 7: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x7, x7, #0x1\n"
- "cmp x7, x20\n"
+ "cmp x7, x26\n"
"bne 135b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "add x28, x9, x26, LSL #2\n"
"prfm pstl1keep, [x6, #0x0]\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"prfm pstl1keep, [x28, #0x0]\n"
- "add x26, x27, x20, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x24, x25, x20, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 143f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
- "ld1r { v16.4s }, [x20]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmax v24.4s, v24.4s, v16.4s\n"
"fmax v25.4s, v25.4s, v16.4s\n"
"fmax v26.4s, v26.4s, v16.4s\n"
@@ -1766,59 +1764,58 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"bge 146f\n"
"tbz x4, #1, 144f\n"
"str d24, [x6], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
- "str d27, [x26], #0x8\n"
- "str d28, [x25], #0x8\n"
- "str d29, [x24], #0x8\n"
- "str d30, [x23], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "str d29, [x9], #0x8\n"
+ "str d30, [x28], #0x8\n"
"tbz x4, #0, 145f\n"
"st1 { v24.s }[2], [x6]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
- "st1 { v27.s }[2], [x26]\n"
- "st1 { v28.s }[2], [x25]\n"
- "st1 { v29.s }[2], [x24]\n"
- "st1 { v30.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
+ "st1 { v29.s }[2], [x9]\n"
+ "st1 { v30.s }[2], [x28]\n"
"b 145f\n"
"144:" // Height 7: Partial direct writeback: partial_1_0
"str s24, [x6, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
- "str s27, [x26, #0x0]\n"
- "str s28, [x25, #0x0]\n"
- "str s29, [x24, #0x0]\n"
- "str s30, [x23, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
+ "str s29, [x9, #0x0]\n"
+ "str s30, [x28, #0x0]\n"
"145:" // Height 7: Partial direct writeback: Done
"b 147f\n"
"146:" // Height 7: Full writeback
"str q24, [x6, #0x0]\n"
"add x6, x6, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
- "str q27, [x26, #0x0]\n"
- "str q28, [x25, #0x0]\n"
- "str q29, [x24, #0x0]\n"
- "str q30, [x23, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
+ "str q29, [x9, #0x0]\n"
+ "str q30, [x28, #0x0]\n"
"147:" // Height 7: Writeback done
"subs x4, x4, #0x4\n"
"bgt 128b\n"
"b 170f\n"
"148:" // Height 8
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x20, #0x20\n"
- "ldr x6, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "ldr x3, [%x[args_ptr], %[offsetof_bias]]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, #0x20\n"
+ "mov x3, %x[bias]\n"
"ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
- "madd x20, x21, x20, x6\n"
"ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
+ "madd %x[output_ptr], x27, x26, %x[output_ptr]\n"
"149:" // Height 8: Column loop
"cbz x3, 150f\n"
"ldr q24, [x3, #0x0]\n"
- "add x3, x3, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1827,58 +1824,58 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"b 155f\n"
"150:" // Height 8: no bias
"tbz %x[flags], #0, 154f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "add x28, x9, x26, LSL #2\n"
"cmp x4, #0x4\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x27, x28, x26, LSL #2\n"
"bge 153f\n"
"tbz x4, #1, 151f\n"
"ldr d24, [x6], #0x8\n"
- "mov x20, #0x8\n"
- "ldr d25, [x28], #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d30, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "ldr d29, [x9], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d31, [x27], #0x8\n"
"tbz x4, #0, 152f\n"
"ld1 { v24.s }[2], [x6]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
- "ld1 { v28.s }[2], [x25]\n"
- "ld1 { v29.s }[2], [x24]\n"
- "ld1 { v30.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x9]\n"
+ "ld1 { v30.s }[2], [x28]\n"
+ "ld1 { v31.s }[2], [x27]\n"
"b 152f\n"
"151:" // Height 8: Partial accumulate: partial_1_0
"ldr s24, [x6, #0x0]\n"
- "mov x20, #0x0\n"
- "ldr s25, [x28, #0x0]\n"
- "ldr s26, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
- "ldr s28, [x25, #0x0]\n"
- "ldr s29, [x24, #0x0]\n"
- "ldr s30, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
+ "ldr s29, [x9, #0x0]\n"
+ "ldr s30, [x28, #0x0]\n"
+ "ldr s31, [x27, #0x0]\n"
"152:" // Height 8: Partial accumulate: Done
- "sub x6, x6, x20\n"
+ "sub x6, x6, x26\n"
"b 155f\n"
"153:" // Height 8: full accumulate
"ldr q24, [x6, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
- "ldr q27, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q29, [x24, #0x0]\n"
- "ldr q30, [x23, #0x0]\n"
- "ldr q31, [x22, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q29, [x9, #0x0]\n"
+ "ldr q30, [x28, #0x0]\n"
+ "ldr q31, [x27, #0x0]\n"
"b 155f\n"
"154:" // Height 8: no accumulate
"movi v24.16b, #0x0\n"
@@ -1892,40 +1889,40 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"155:" // Height 8: setup done
"mov x7, #0x0\n"
"156:" // Height 8: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w8, [x20, x7, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 157f\n"
- "ldr x20, [%x[input_ptr], x7, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x17, [x20, #0x0]\n"
- "ldr x16, [x20, #0x8]\n"
- "ldr x15, [x20, #0x10]\n"
- "ldr x14, [x20, #0x18]\n"
- "ldr x13, [x20, #0x20]\n"
- "ldr x12, [x20, #0x28]\n"
- "ldr x11, [x20, #0x30]\n"
- "ldr x9, [x20, #0x38]\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "ldr x12, [x26, #0x28]\n"
+ "ldr x11, [x26, #0x30]\n"
+ "ldr x27, [x26, #0x38]\n"
"cbnz x7, 158f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x17, x17, x20, LSL #2\n"
- "add x16, x16, x20, LSL #2\n"
- "add x15, x15, x20, LSL #2\n"
- "add x14, x14, x20, LSL #2\n"
- "add x13, x13, x20, LSL #2\n"
- "add x12, x12, x20, LSL #2\n"
- "add x11, x11, x20, LSL #2\n"
- "add x9, x9, x20, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
+ "add x12, x12, x26, LSL #2\n"
+ "add x11, x11, x26, LSL #2\n"
+ "add x27, x27, x26, LSL #2\n"
"b 158f\n"
"157:" // Height 8: setup direct input
"mov x17, %x[input_ptr]\n"
- "add x16, x17, x21, LSL #2\n"
- "add x15, x16, x21, LSL #2\n"
- "add x14, x15, x21, LSL #2\n"
- "add x13, x14, x21, LSL #2\n"
- "add x12, x13, x21, LSL #2\n"
- "add x11, x12, x21, LSL #2\n"
- "add x9, x11, x21, LSL #2\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
+ "add x12, x13, x27, LSL #2\n"
+ "add x11, x12, x27, LSL #2\n"
+ "add x27, x11, x27, LSL #2\n"
"158:" // Height 8: input setup done
"cmp x8, #0x4\n"
"blt 161f\n"
@@ -1937,7 +1934,7 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr q4, [x13, #0x0]\n"
"ldr q5, [x12, #0x0]\n"
"ldr q6, [x11, #0x0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q7, [x27, #0x0]\n"
"ldr q8, [x5, #0x0]\n"
"ldr q9, [x5, #0x10]\n"
"ldr q10, [x5, #0x20]\n"
@@ -1959,37 +1956,37 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v30.4s, v8.4s, v6.s[0]\n"
"add x11, x11, #0x10\n"
"fmla v31.4s, v8.4s, v7.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
"add x5, x5, #0x40\n"
"ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "ldr x22, [x5, #0x8]\n"
+ "ldr x26, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "ldr x28, [x5, #0x18]\n"
+ "sub x8, x8, #0x4\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "ldr x21, [x5, #0x28]\n"
+ "cmp x8, #0x8\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "ldr x20, [x17, #0x8]\n"
+ "mov v8.d[1], x26\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
- "ldr x27, [x16, #0x8]\n"
+ "ldr x26, [x5, #0x18]\n"
"fmla v31.4s, v9.4s, v7.s[1]\n"
"ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x26, [x15, #0x8]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "ldr x25, [x14, #0x8]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x24, [x13, #0x8]\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "ldr x23, [x12, #0x8]\n"
+ "mov v9.d[1], x26\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "sub x8, x8, #0x4\n"
+ "ldr x26, [x5, #0x28]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
- "cmp x8, #0x8\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
- "mov v8.d[1], x22\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v31.4s, v10.4s, v7.s[2]\n"
"ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
@@ -2006,31 +2003,31 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr d5, [x12, #0x0]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
"ldr d6, [x11, #0x0]\n"
- "ldr x22, [x11, #0x8]\n"
"fmla v31.4s, v11.4s, v7.s[3]\n"
- "ldr d7, [x9, #0x0]\n"
- "mov v9.d[1], x28\n"
+ "ldr d7, [x27, #0x0]\n"
"ldr d11, [x5, #0x30]\n"
- "mov v10.d[1], x21\n"
- "ldr x21, [x9, #0x8]\n"
- "mov v0.d[1], x20\n"
- "ldr x20, [x5, #0x38]\n"
- "mov v1.d[1], x27\n"
- "prfm pldl1keep, [x17, #0x80]\n"
+ "mov v10.d[1], x26\n"
+ "ldr x26, [x17, #0x8]\n"
+ "mov v0.d[1], x26\n"
+ "ldr x26, [x16, #0x8]\n"
+ "mov v1.d[1], x26\n"
+ "ldr x26, [x15, #0x8]\n"
"mov v2.d[1], x26\n"
- "prfm pldl1keep, [x16, #0x80]\n"
- "mov v3.d[1], x25\n"
- "prfm pldl1keep, [x15, #0x80]\n"
- "mov v4.d[1], x24\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "mov v5.d[1], x23\n"
- "prfm pldl1keep, [x13, #0x80]\n"
- "mov v6.d[1], x22\n"
+ "ldr x26, [x14, #0x8]\n"
+ "mov v3.d[1], x26\n"
+ "ldr x26, [x13, #0x8]\n"
+ "mov v4.d[1], x26\n"
+ "ldr x26, [x12, #0x8]\n"
+ "mov v5.d[1], x26\n"
+ "ldr x26, [x11, #0x8]\n"
+ "mov v6.d[1], x26\n"
+ "ldr x26, [x27, #0x8]\n"
+ "mov v7.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v11.d[1], x26\n"
"prfm pldl1keep, [x12, #0x80]\n"
- "mov v7.d[1], x21\n"
"prfm pldl1keep, [x11, #0x80]\n"
- "mov v11.d[1], x20\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 159b\n"
"160:" // Height 8: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
@@ -2048,7 +2045,7 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v30.4s, v8.4s, v6.s[0]\n"
"add x11, x11, #0x10\n"
"fmla v31.4s, v8.4s, v7.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
"sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
@@ -2066,7 +2063,7 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v31.4s, v9.4s, v7.s[1]\n"
"prfm pldl1keep, [x11, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
"add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
@@ -2094,11 +2091,11 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"ldr s20, [x13], #0x4\n"
"ldr s19, [x12], #0x4\n"
"ldr s18, [x11], #0x4\n"
- "ldr s17, [x9], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
"ldr q16, [x5, #0x0]\n"
- "add x5, x5, #0x10\n"
"fmla v24.4s, v16.4s, v0.s[0]\n"
"fmla v25.4s, v16.4s, v23.s[0]\n"
+ "add x5, x5, #0x10\n"
"fmla v26.4s, v16.4s, v22.s[0]\n"
"fmla v27.4s, v16.4s, v21.s[0]\n"
"fmla v28.4s, v16.4s, v20.s[0]\n"
@@ -2107,39 +2104,39 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v31.4s, v16.4s, v17.s[0]\n"
"cbnz x8, 162b\n"
"163:" // Height 8: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x7, x7, #0x1\n"
- "cmp x7, x20\n"
+ "cmp x7, x26\n"
"bne 156b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "add x28, x9, x26, LSL #2\n"
+ "add x27, x28, x26, LSL #2\n"
"prfm pstl1keep, [x6, #0x0]\n"
- "add x28, x6, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"prfm pstl1keep, [x28, #0x0]\n"
- "add x26, x27, x20, LSL #2\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x24, x25, x20, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 164f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
- "ld1r { v16.4s }, [x20]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmin v31.4s, v31.4s, v16.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmax v24.4s, v24.4s, v16.4s\n"
"fmax v25.4s, v25.4s, v16.4s\n"
"fmax v26.4s, v26.4s, v16.4s\n"
@@ -2153,62 +2150,62 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"bge 167f\n"
"tbz x4, #1, 165f\n"
"str d24, [x6], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
- "str d27, [x26], #0x8\n"
- "str d28, [x25], #0x8\n"
- "str d29, [x24], #0x8\n"
- "str d30, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "str d29, [x9], #0x8\n"
+ "str d30, [x28], #0x8\n"
+ "str d31, [x27], #0x8\n"
"tbz x4, #0, 166f\n"
"st1 { v24.s }[2], [x6]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
- "st1 { v27.s }[2], [x26]\n"
- "st1 { v28.s }[2], [x25]\n"
- "st1 { v29.s }[2], [x24]\n"
- "st1 { v30.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
+ "st1 { v29.s }[2], [x9]\n"
+ "st1 { v30.s }[2], [x28]\n"
+ "st1 { v31.s }[2], [x27]\n"
"b 166f\n"
"165:" // Height 8: Partial direct writeback: partial_1_0
"str s24, [x6, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
- "str s27, [x26, #0x0]\n"
- "str s28, [x25, #0x0]\n"
- "str s29, [x24, #0x0]\n"
- "str s30, [x23, #0x0]\n"
- "str s31, [x22, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
+ "str s29, [x9, #0x0]\n"
+ "str s30, [x28, #0x0]\n"
+ "str s31, [x27, #0x0]\n"
"166:" // Height 8: Partial direct writeback: Done
"b 168f\n"
"167:" // Height 8: Full writeback
"str q24, [x6, #0x0]\n"
"add x6, x6, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
- "str q27, [x26, #0x0]\n"
- "str q28, [x25, #0x0]\n"
- "str q29, [x24, #0x0]\n"
- "str q30, [x23, #0x0]\n"
- "str q31, [x22, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
+ "str q29, [x9, #0x0]\n"
+ "str q30, [x28, #0x0]\n"
+ "str q31, [x27, #0x0]\n"
"168:" // Height 8: Writeback done
"subs x4, x4, #0x4\n"
"bgt 149b\n"
"subs %x[M], %x[M], #0x8\n"
"beq 170f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 169f\n"
- "add x21, x21, #0x8\n"
- "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x27, x27, #0x8\n"
+ "str x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"169:" // Update direct input
- "mov x20, #0x20\n"
- "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "mov x26, #0x20\n"
+ "madd %x[input_ptr], x26, x27, %x[input_ptr]\n"
"b 1b\n"
"170:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
index 6401b01607..36356dfb35 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
@@ -47,19 +47,18 @@ void a64_hybrid_fp32_mla_8x4 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void a64_hybrid_fp32_mla_8x4 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -106,10 +104,10 @@ void a64_hybrid_fp32_mla_8x4 (
"cmp %x[M], #0x2\n"
"bgt 43f\n"
"beq 22f\n"
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x14, 3f\n"
"ldr q24, [x14, #0x0]\n"
@@ -140,8 +138,8 @@ void a64_hybrid_fp32_mla_8x4 (
"mov x10, #0x0\n"
"9:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 10f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -164,37 +162,37 @@ void a64_hybrid_fp32_mla_8x4 (
"blt 13f\n"
"12:" // Height 1: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"sub x9, x9, #0x4\n"
"add x28, x28, #0x10\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
"cmp x9, #0x8\n"
"add x12, x12, #0x40\n"
- "prfm pldl1keep, [x28, #0x80]\n"
"ldr q8, [x12, #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
"ldr q9, [x12, #0x10]\n"
- "fmla v24.4s, v10.4s, v0.s[2]\n"
"ldr q10, [x12, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"ldr q0, [x28, #0x0]\n"
"ldr q11, [x12, #0x30]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"bge 12b\n"
"13:" // Height 1: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x28, x28, #0x10\n"
"sub x9, x9, #0x4\n"
- "add x12, x12, #0x40\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x12, x12, #0x40\n"
"14:" // Height 1: Multiply loop: Main loop skip
"cbz x9, 16f\n"
"15:" // Height 1: Multiply loop: Odd block loop
"ldr s17, [x28], #0x4\n"
"ldr q16, [x12, #0x0]\n"
"sub x9, x9, #0x1\n"
- "add x12, x12, #0x10\n"
"fmla v24.4s, v16.4s, v17.s[0]\n"
+ "add x12, x12, #0x10\n"
"cbnz x9, 15b\n"
"16:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -203,9 +201,9 @@ void a64_hybrid_fp32_mla_8x4 (
"bne 9b\n"
"prfm pstl1keep, [x11, #0x0]\n"
"tbz %x[flags], #1, 17f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v24.4s, v24.4s, v17.4s\n"
"fmax v24.4s, v24.4s, v16.4s\n"
@@ -229,40 +227,40 @@ void a64_hybrid_fp32_mla_8x4 (
"bgt 2b\n"
"b 170f\n"
"22:" // Height 2
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"23:" // Height 2: Column loop
"cbz x14, 24f\n"
"ldr q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
"mov v25.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"b 29f\n"
"24:" // Height 2: no bias
"tbz %x[flags], #0, 28f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x13, #0x4\n"
- "add x28, x11, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"bge 27f\n"
"tbz x13, #1, 25f\n"
"ldr d24, [x11], #0x8\n"
- "ldr d25, [x28], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
"mov x20, #0x8\n"
"tbz x13, #0, 26f\n"
"ld1 { v24.s }[2], [x11]\n"
- "ld1 { v25.s }[2], [x28]\n"
+ "ld1 { v25.s }[2], [x27]\n"
"b 26f\n"
"25:" // Height 2: Partial accumulate: partial_1_0
"ldr s24, [x11, #0x0]\n"
- "ldr s25, [x28, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
"mov x20, #0x0\n"
"26:" // Height 2: Partial accumulate: Done
"sub x11, x11, x20\n"
"b 29f\n"
"27:" // Height 2: full accumulate
"ldr q24, [x11, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
"b 29f\n"
"28:" // Height 2: no accumulate
"movi v24.16b, #0x0\n"
@@ -271,8 +269,8 @@ void a64_hybrid_fp32_mla_8x4 (
"mov x10, #0x0\n"
"30:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -302,37 +300,37 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v25.4s, v8.4s, v1.s[0]\n"
"sub x9, x9, #0x4\n"
"add x28, x28, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
"add x27, x27, #0x10\n"
"cmp x9, #0x8\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
"add x12, x12, #0x40\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"ldr q8, [x12, #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
- "fmla v25.4s, v9.4s, v1.s[1]\n"
"ldr q9, [x12, #0x10]\n"
- "fmla v24.4s, v10.4s, v0.s[2]\n"
- "fmla v25.4s, v10.4s, v1.s[2]\n"
"ldr q10, [x12, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [x28, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr q0, [x28, #0x0]\n"
"ldr q1, [x27, #0x0]\n"
"ldr q11, [x12, #0x30]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 33b\n"
"34:" // Height 2: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
"add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
- "sub x9, x9, #0x4\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x12, x12, #0x40\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
+ "sub x9, x9, #0x4\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"35:" // Height 2: Multiply loop: Main loop skip
@@ -342,9 +340,9 @@ void a64_hybrid_fp32_mla_8x4 (
"ldr s17, [x27], #0x4\n"
"sub x9, x9, #0x1\n"
"ldr q16, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
"fmla v24.4s, v16.4s, v18.s[0]\n"
"fmla v25.4s, v16.4s, v17.s[0]\n"
+ "add x12, x12, #0x10\n"
"cbnz x9, 36b\n"
"37:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -352,13 +350,13 @@ void a64_hybrid_fp32_mla_8x4 (
"cmp x10, x20\n"
"bne 30b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
"prfm pstl1keep, [x11, #0x0]\n"
- "add x28, x11, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v24.4s, v24.4s, v17.4s\n"
"fmin v25.4s, v25.4s, v17.4s\n"
@@ -369,65 +367,65 @@ void a64_hybrid_fp32_mla_8x4 (
"bge 41f\n"
"tbz x13, #1, 39f\n"
"str d24, [x11], #0x8\n"
- "str d25, [x28], #0x8\n"
+ "str d25, [x27], #0x8\n"
"tbz x13, #0, 40f\n"
"st1 { v24.s }[2], [x11]\n"
- "st1 { v25.s }[2], [x28]\n"
+ "st1 { v25.s }[2], [x27]\n"
"b 40f\n"
"39:" // Height 2: Partial direct writeback: partial_1_0
"str s24, [x11, #0x0]\n"
- "str s25, [x28, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
"40:" // Height 2: Partial direct writeback: Done
"b 42f\n"
"41:" // Height 2: Full writeback
"str q24, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q25, [x28, #0x0]\n"
+ "str q25, [x27, #0x0]\n"
"42:" // Height 2: Writeback done
"subs x13, x13, #0x4\n"
"bgt 23b\n"
"b 170f\n"
"43:" // Height 3
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"44:" // Height 3: Column loop
"cbz x14, 45f\n"
"ldr q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"b 50f\n"
"45:" // Height 3: no bias
"tbz %x[flags], #0, 49f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
"cmp x13, #0x4\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
"bge 48f\n"
"tbz x13, #1, 46f\n"
"ldr d24, [x11], #0x8\n"
- "ldr d25, [x28], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
"mov x20, #0x8\n"
- "ldr d26, [x27], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
"tbz x13, #0, 47f\n"
"ld1 { v24.s }[2], [x11]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
"b 47f\n"
"46:" // Height 3: Partial accumulate: partial_1_0
"ldr s24, [x11, #0x0]\n"
- "ldr s25, [x28, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s26, [x27, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
"47:" // Height 3: Partial accumulate: Done
"sub x11, x11, x20\n"
"b 50f\n"
"48:" // Height 3: full accumulate
"ldr q24, [x11, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
"b 50f\n"
"49:" // Height 3: no accumulate
"movi v24.16b, #0x0\n"
@@ -437,8 +435,8 @@ void a64_hybrid_fp32_mla_8x4 (
"mov x10, #0x0\n"
"51:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 52f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -473,17 +471,14 @@ void a64_hybrid_fp32_mla_8x4 (
"sub x9, x9, #0x4\n"
"add x28, x28, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
"cmp x9, #0x8\n"
"add x12, x12, #0x40\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"ldr q8, [x12, #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
- "fmla v25.4s, v9.4s, v1.s[1]\n"
- "fmla v26.4s, v9.4s, v2.s[1]\n"
"ldr q9, [x12, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
@@ -491,11 +486,14 @@ void a64_hybrid_fp32_mla_8x4 (
"ldr q10, [x12, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"ldr q0, [x28, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"ldr q1, [x27, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"ldr q2, [x26, #0x0]\n"
"ldr q11, [x12, #0x30]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 54b\n"
"55:" // Height 3: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
@@ -503,17 +501,17 @@ void a64_hybrid_fp32_mla_8x4 (
"add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x26, x26, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
"sub x9, x9, #0x4\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x12, x12, #0x40\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
@@ -526,10 +524,10 @@ void a64_hybrid_fp32_mla_8x4 (
"sub x9, x9, #0x1\n"
"ldr s17, [x26], #0x4\n"
"ldr q16, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
"fmla v24.4s, v16.4s, v19.s[0]\n"
"fmla v25.4s, v16.4s, v18.s[0]\n"
"fmla v26.4s, v16.4s, v17.s[0]\n"
+ "add x12, x12, #0x10\n"
"cbnz x9, 57b\n"
"58:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -537,15 +535,15 @@ void a64_hybrid_fp32_mla_8x4 (
"cmp x10, x20\n"
"bne 51b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
"prfm pstl1keep, [x11, #0x0]\n"
- "add x28, x11, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x27, x28, x20, LSL #2\n"
"prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
"tbz %x[flags], #1, 59f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v24.4s, v24.4s, v17.4s\n"
"fmin v25.4s, v25.4s, v17.4s\n"
@@ -558,75 +556,75 @@ void a64_hybrid_fp32_mla_8x4 (
"bge 62f\n"
"tbz x13, #1, 60f\n"
"str d24, [x11], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
"tbz x13, #0, 61f\n"
"st1 { v24.s }[2], [x11]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
"b 61f\n"
"60:" // Height 3: Partial direct writeback: partial_1_0
"str s24, [x11, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
"61:" // Height 3: Partial direct writeback: Done
"b 63f\n"
"62:" // Height 3: Full writeback
"str q24, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
"63:" // Height 3: Writeback done
"subs x13, x13, #0x4\n"
"bgt 44b\n"
"b 170f\n"
"64:" // Height 4
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"65:" // Height 4: Column loop
"cbz x14, 66f\n"
"ldr q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"mov v27.16b, v24.16b\n"
"b 71f\n"
"66:" // Height 4: no bias
"tbz %x[flags], #0, 70f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x13, #0x4\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
+ "cmp x13, #0x4\n"
+ "add x25, x26, x20, LSL #2\n"
"bge 69f\n"
"tbz x13, #1, 67f\n"
"ldr d24, [x11], #0x8\n"
- "ldr d25, [x28], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
"mov x20, #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
"tbz x13, #0, 68f\n"
"ld1 { v24.s }[2], [x11]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
"b 68f\n"
"67:" // Height 4: Partial accumulate: partial_1_0
"ldr s24, [x11, #0x0]\n"
- "ldr s25, [x28, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s26, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
"68:" // Height 4: Partial accumulate: Done
"sub x11, x11, x20\n"
"b 71f\n"
"69:" // Height 4: full accumulate
"ldr q24, [x11, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
- "ldr q27, [x26, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
"b 71f\n"
"70:" // Height 4: no accumulate
"movi v24.16b, #0x0\n"
@@ -637,8 +635,8 @@ void a64_hybrid_fp32_mla_8x4 (
"mov x10, #0x0\n"
"72:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -680,24 +678,23 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v27.4s, v8.4s, v3.s[0]\n"
"add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x9, #0x8\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x12, x12, #0x40\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "ldr q8, [x12, #0x0]\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x9, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"ldr q10, [x12, #0x20]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"ldr q0, [x28, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
@@ -707,6 +704,7 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v27.4s, v11.4s, v3.s[3]\n"
"ldr q3, [x25, #0x0]\n"
"ldr q11, [x12, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"bge 75b\n"
"76:" // Height 4: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
@@ -717,18 +715,18 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v27.4s, v8.4s, v3.s[0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "sub x9, x9, #0x4\n"
- "add x12, x12, #0x40\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
+ "sub x9, x9, #0x4\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
@@ -744,9 +742,9 @@ void a64_hybrid_fp32_mla_8x4 (
"ldr s18, [x26], #0x4\n"
"ldr s17, [x25], #0x4\n"
"ldr q16, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
"fmla v24.4s, v16.4s, v20.s[0]\n"
"fmla v25.4s, v16.4s, v19.s[0]\n"
+ "add x12, x12, #0x10\n"
"fmla v26.4s, v16.4s, v18.s[0]\n"
"fmla v27.4s, v16.4s, v17.s[0]\n"
"cbnz x9, 78b\n"
@@ -756,17 +754,17 @@ void a64_hybrid_fp32_mla_8x4 (
"cmp x10, x20\n"
"bne 72b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
"prfm pstl1keep, [x11, #0x0]\n"
- "add x28, x11, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x20, LSL #2\n"
"prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v24.4s, v24.4s, v17.4s\n"
"fmin v25.4s, v25.4s, v17.4s\n"
@@ -781,85 +779,85 @@ void a64_hybrid_fp32_mla_8x4 (
"bge 83f\n"
"tbz x13, #1, 81f\n"
"str d24, [x11], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
- "str d27, [x26], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
"tbz x13, #0, 82f\n"
"st1 { v24.s }[2], [x11]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
- "st1 { v27.s }[2], [x26]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
"b 82f\n"
"81:" // Height 4: Partial direct writeback: partial_1_0
"str s24, [x11, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
- "str s27, [x26, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
"82:" // Height 4: Partial direct writeback: Done
"b 84f\n"
"83:" // Height 4: Full writeback
"str q24, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
- "str q27, [x26, #0x0]\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
"84:" // Height 4: Writeback done
"subs x13, x13, #0x4\n"
"bgt 65b\n"
"b 170f\n"
"85:" // Height 5
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"86:" // Height 5: Column loop
"cbz x14, 87f\n"
"ldr q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"b 92f\n"
"87:" // Height 5: no bias
"tbz %x[flags], #0, 91f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x13, #0x4\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
+ "cmp x13, #0x4\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 90f\n"
"tbz x13, #1, 88f\n"
"ldr d24, [x11], #0x8\n"
- "ldr d25, [x28], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
"mov x20, #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
"tbz x13, #0, 89f\n"
"ld1 { v24.s }[2], [x11]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
- "ld1 { v28.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
"b 89f\n"
"88:" // Height 5: Partial accumulate: partial_1_0
"ldr s24, [x11, #0x0]\n"
- "ldr s25, [x28, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s26, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
- "ldr s28, [x25, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
"89:" // Height 5: Partial accumulate: Done
"sub x11, x11, x20\n"
"b 92f\n"
"90:" // Height 5: full accumulate
"ldr q24, [x11, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
- "ldr q27, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
"b 92f\n"
"91:" // Height 5: no accumulate
"movi v24.16b, #0x0\n"
@@ -871,8 +869,8 @@ void a64_hybrid_fp32_mla_8x4 (
"mov x10, #0x0\n"
"93:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 94f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -919,30 +917,30 @@ void a64_hybrid_fp32_mla_8x4 (
"add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
"cmp x9, #0x8\n"
"add x12, x12, #0x40\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"ldr q8, [x12, #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
- "fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v26.4s, v9.4s, v2.s[1]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
"ldr q9, [x12, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
"ldr q10, [x12, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"ldr q0, [x28, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"ldr q1, [x27, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
@@ -963,21 +961,21 @@ void a64_hybrid_fp32_mla_8x4 (
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x24, x24, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"sub x9, x9, #0x4\n"
- "add x12, x12, #0x40\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
@@ -996,11 +994,11 @@ void a64_hybrid_fp32_mla_8x4 (
"ldr s18, [x25], #0x4\n"
"ldr s17, [x24], #0x4\n"
"ldr q16, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
"fmla v24.4s, v16.4s, v21.s[0]\n"
"fmla v25.4s, v16.4s, v20.s[0]\n"
"fmla v26.4s, v16.4s, v19.s[0]\n"
"fmla v27.4s, v16.4s, v18.s[0]\n"
+ "add x12, x12, #0x10\n"
"fmla v28.4s, v16.4s, v17.s[0]\n"
"cbnz x9, 99b\n"
"100:" // Height 5: Multiply loop: No odd multiplies
@@ -1009,19 +1007,19 @@ void a64_hybrid_fp32_mla_8x4 (
"cmp x10, x20\n"
"bne 93b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
"prfm pstl1keep, [x11, #0x0]\n"
- "add x28, x11, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x20, LSL #2\n"
"prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 101f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v24.4s, v24.4s, v17.4s\n"
"fmin v25.4s, v25.4s, v17.4s\n"
@@ -1038,47 +1036,47 @@ void a64_hybrid_fp32_mla_8x4 (
"bge 104f\n"
"tbz x13, #1, 102f\n"
"str d24, [x11], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
- "str d27, [x26], #0x8\n"
- "str d28, [x25], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
"tbz x13, #0, 103f\n"
"st1 { v24.s }[2], [x11]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
- "st1 { v27.s }[2], [x26]\n"
- "st1 { v28.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
"b 103f\n"
"102:" // Height 5: Partial direct writeback: partial_1_0
"str s24, [x11, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
- "str s27, [x26, #0x0]\n"
- "str s28, [x25, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
"103:" // Height 5: Partial direct writeback: Done
"b 105f\n"
"104:" // Height 5: Full writeback
"str q24, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
- "str q27, [x26, #0x0]\n"
- "str q28, [x25, #0x0]\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
"105:" // Height 5: Writeback done
"subs x13, x13, #0x4\n"
"bgt 86b\n"
"b 170f\n"
"106:" // Height 6
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"107:" // Height 6: Column loop
"cbz x14, 108f\n"
"ldr q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1086,47 +1084,47 @@ void a64_hybrid_fp32_mla_8x4 (
"108:" // Height 6: no bias
"tbz %x[flags], #0, 112f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x13, #0x4\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "cmp x13, #0x4\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 111f\n"
"tbz x13, #1, 109f\n"
"ldr d24, [x11], #0x8\n"
- "ldr d25, [x28], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
"mov x20, #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d29, [x24], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d29, [x23], #0x8\n"
"tbz x13, #0, 110f\n"
"ld1 { v24.s }[2], [x11]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
- "ld1 { v28.s }[2], [x25]\n"
- "ld1 { v29.s }[2], [x24]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
+ "ld1 { v29.s }[2], [x23]\n"
"b 110f\n"
"109:" // Height 6: Partial accumulate: partial_1_0
"ldr s24, [x11, #0x0]\n"
- "ldr s25, [x28, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s26, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
- "ldr s28, [x25, #0x0]\n"
- "ldr s29, [x24, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
+ "ldr s29, [x23, #0x0]\n"
"110:" // Height 6: Partial accumulate: Done
"sub x11, x11, x20\n"
"b 113f\n"
"111:" // Height 6: full accumulate
"ldr q24, [x11, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
- "ldr q27, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q29, [x24, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
+ "ldr q29, [x23, #0x0]\n"
"b 113f\n"
"112:" // Height 6: no accumulate
"movi v24.16b, #0x0\n"
@@ -1139,8 +1137,8 @@ void a64_hybrid_fp32_mla_8x4 (
"mov x10, #0x0\n"
"114:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1194,30 +1192,30 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v29.4s, v8.4s, v5.s[0]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "cmp x9, #0x8\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x12, x12, #0x40\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q8, [x12, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "cmp x9, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q8, [x12, #0x0]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
"ldr q9, [x12, #0x10]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
"ldr q10, [x12, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"ldr q0, [x28, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
@@ -1245,22 +1243,22 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v29.4s, v8.4s, v5.s[0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "sub x9, x9, #0x4\n"
- "add x12, x12, #0x40\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x9, x9, #0x4\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
@@ -1282,9 +1280,9 @@ void a64_hybrid_fp32_mla_8x4 (
"ldr s18, [x24], #0x4\n"
"ldr s17, [x23], #0x4\n"
"ldr q16, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
"fmla v24.4s, v16.4s, v22.s[0]\n"
"fmla v25.4s, v16.4s, v21.s[0]\n"
+ "add x12, x12, #0x10\n"
"fmla v26.4s, v16.4s, v20.s[0]\n"
"fmla v27.4s, v16.4s, v19.s[0]\n"
"fmla v28.4s, v16.4s, v18.s[0]\n"
@@ -1296,21 +1294,21 @@ void a64_hybrid_fp32_mla_8x4 (
"cmp x10, x20\n"
"bne 114b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x11, #0x0]\n"
- "add x28, x11, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x27, x28, x20, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x23, x24, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v24.4s, v24.4s, v17.4s\n"
"fmin v25.4s, v25.4s, v17.4s\n"
@@ -1329,51 +1327,51 @@ void a64_hybrid_fp32_mla_8x4 (
"bge 125f\n"
"tbz x13, #1, 123f\n"
"str d24, [x11], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
- "str d27, [x26], #0x8\n"
- "str d28, [x25], #0x8\n"
- "str d29, [x24], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
+ "str d29, [x23], #0x8\n"
"tbz x13, #0, 124f\n"
"st1 { v24.s }[2], [x11]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
- "st1 { v27.s }[2], [x26]\n"
- "st1 { v28.s }[2], [x25]\n"
- "st1 { v29.s }[2], [x24]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x23]\n"
"b 124f\n"
"123:" // Height 6: Partial direct writeback: partial_1_0
"str s24, [x11, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
- "str s27, [x26, #0x0]\n"
- "str s28, [x25, #0x0]\n"
- "str s29, [x24, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
+ "str s29, [x23, #0x0]\n"
"124:" // Height 6: Partial direct writeback: Done
"b 126f\n"
"125:" // Height 6: Full writeback
"str q24, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
- "str q27, [x26, #0x0]\n"
- "str q28, [x25, #0x0]\n"
- "str q29, [x24, #0x0]\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
+ "str q29, [x23, #0x0]\n"
"126:" // Height 6: Writeback done
"subs x13, x13, #0x4\n"
"bgt 107b\n"
"b 170f\n"
"127:" // Height 7
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"128:" // Height 7: Column loop
"cbz x14, 129f\n"
"ldr q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1382,52 +1380,52 @@ void a64_hybrid_fp32_mla_8x4 (
"129:" // Height 7: no bias
"tbz %x[flags], #0, 133f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x13, #0x4\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x13, #0x4\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 132f\n"
"tbz x13, #1, 130f\n"
"ldr d24, [x11], #0x8\n"
- "ldr d25, [x28], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
"mov x20, #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d30, [x23], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d29, [x23], #0x8\n"
+ "ldr d30, [x22], #0x8\n"
"tbz x13, #0, 131f\n"
"ld1 { v24.s }[2], [x11]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
- "ld1 { v28.s }[2], [x25]\n"
- "ld1 { v29.s }[2], [x24]\n"
- "ld1 { v30.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
+ "ld1 { v29.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x22]\n"
"b 131f\n"
"130:" // Height 7: Partial accumulate: partial_1_0
"ldr s24, [x11, #0x0]\n"
- "ldr s25, [x28, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s26, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
- "ldr s28, [x25, #0x0]\n"
- "ldr s29, [x24, #0x0]\n"
- "ldr s30, [x23, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
+ "ldr s29, [x23, #0x0]\n"
+ "ldr s30, [x22, #0x0]\n"
"131:" // Height 7: Partial accumulate: Done
"sub x11, x11, x20\n"
"b 134f\n"
"132:" // Height 7: full accumulate
"ldr q24, [x11, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
- "ldr q27, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q29, [x24, #0x0]\n"
- "ldr q30, [x23, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
+ "ldr q29, [x23, #0x0]\n"
+ "ldr q30, [x22, #0x0]\n"
"b 134f\n"
"133:" // Height 7: no accumulate
"movi v24.16b, #0x0\n"
@@ -1441,8 +1439,8 @@ void a64_hybrid_fp32_mla_8x4 (
"mov x10, #0x0\n"
"135:" // Height 7: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 136f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1501,32 +1499,32 @@ void a64_hybrid_fp32_mla_8x4 (
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
"cmp x9, #0x8\n"
"add x12, x12, #0x40\n"
"ldr q8, [x12, #0x0]\n"
- "fmla v26.4s, v9.4s, v2.s[1]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
"ldr q9, [x12, #0x10]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
"ldr q10, [x12, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
@@ -1559,25 +1557,25 @@ void a64_hybrid_fp32_mla_8x4 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "add x22, x22, #0x10\n"
"sub x9, x9, #0x4\n"
- "add x12, x12, #0x40\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v28.4s, v9.4s, v4.s[1]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "fmla v30.4s, v9.4s, v6.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
@@ -1602,11 +1600,11 @@ void a64_hybrid_fp32_mla_8x4 (
"ldr s18, [x23], #0x4\n"
"ldr s17, [x22], #0x4\n"
"ldr q16, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
"fmla v24.4s, v16.4s, v23.s[0]\n"
"fmla v25.4s, v16.4s, v22.s[0]\n"
"fmla v26.4s, v16.4s, v21.s[0]\n"
"fmla v27.4s, v16.4s, v20.s[0]\n"
+ "add x12, x12, #0x10\n"
"fmla v28.4s, v16.4s, v19.s[0]\n"
"fmla v29.4s, v16.4s, v18.s[0]\n"
"fmla v30.4s, v16.4s, v17.s[0]\n"
@@ -1617,23 +1615,23 @@ void a64_hybrid_fp32_mla_8x4 (
"cmp x10, x20\n"
"bne 135b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x11, #0x0]\n"
- "add x28, x11, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x27, x28, x20, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 143f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v24.4s, v24.4s, v17.4s\n"
"fmin v25.4s, v25.4s, v17.4s\n"
@@ -1654,59 +1652,58 @@ void a64_hybrid_fp32_mla_8x4 (
"bge 146f\n"
"tbz x13, #1, 144f\n"
"str d24, [x11], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
- "str d27, [x26], #0x8\n"
- "str d28, [x25], #0x8\n"
- "str d29, [x24], #0x8\n"
- "str d30, [x23], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
+ "str d29, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
"tbz x13, #0, 145f\n"
"st1 { v24.s }[2], [x11]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
- "st1 { v27.s }[2], [x26]\n"
- "st1 { v28.s }[2], [x25]\n"
- "st1 { v29.s }[2], [x24]\n"
- "st1 { v30.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
"b 145f\n"
"144:" // Height 7: Partial direct writeback: partial_1_0
"str s24, [x11, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
- "str s27, [x26, #0x0]\n"
- "str s28, [x25, #0x0]\n"
- "str s29, [x24, #0x0]\n"
- "str s30, [x23, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
+ "str s29, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
"145:" // Height 7: Partial direct writeback: Done
"b 147f\n"
"146:" // Height 7: Full writeback
"str q24, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
- "str q27, [x26, #0x0]\n"
- "str q28, [x25, #0x0]\n"
- "str q29, [x24, #0x0]\n"
- "str q30, [x23, #0x0]\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
+ "str q29, [x23, #0x0]\n"
+ "str q30, [x22, #0x0]\n"
"147:" // Height 7: Writeback done
"subs x13, x13, #0x4\n"
"bgt 128b\n"
"b 170f\n"
"148:" // Height 8
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x20\n"
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x11\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"149:" // Height 8: Column loop
"cbz x14, 150f\n"
"ldr q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1716,57 +1713,57 @@ void a64_hybrid_fp32_mla_8x4 (
"150:" // Height 8: no bias
"tbz %x[flags], #0, 154f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x13, #0x4\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x13, #0x4\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 153f\n"
"tbz x13, #1, 151f\n"
"ldr d24, [x11], #0x8\n"
- "ldr d25, [x28], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
"mov x20, #0x8\n"
- "ldr d26, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d30, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d29, [x23], #0x8\n"
+ "ldr d30, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x13, #0, 152f\n"
"ld1 { v24.s }[2], [x11]\n"
- "ld1 { v25.s }[2], [x28]\n"
- "ld1 { v26.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
- "ld1 { v28.s }[2], [x25]\n"
- "ld1 { v29.s }[2], [x24]\n"
- "ld1 { v30.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
+ "ld1 { v29.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 152f\n"
"151:" // Height 8: Partial accumulate: partial_1_0
"ldr s24, [x11, #0x0]\n"
- "ldr s25, [x28, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s26, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
- "ldr s28, [x25, #0x0]\n"
- "ldr s29, [x24, #0x0]\n"
- "ldr s30, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
+ "ldr s29, [x23, #0x0]\n"
+ "ldr s30, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"152:" // Height 8: Partial accumulate: Done
"sub x11, x11, x20\n"
"b 155f\n"
"153:" // Height 8: full accumulate
"ldr q24, [x11, #0x0]\n"
- "ldr q25, [x28, #0x0]\n"
- "ldr q26, [x27, #0x0]\n"
- "ldr q27, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q29, [x24, #0x0]\n"
- "ldr q30, [x23, #0x0]\n"
- "ldr q31, [x22, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
+ "ldr q29, [x23, #0x0]\n"
+ "ldr q30, [x22, #0x0]\n"
+ "ldr q31, [x21, #0x0]\n"
"b 155f\n"
"154:" // Height 8: no accumulate
"movi v24.16b, #0x0\n"
@@ -1781,8 +1778,8 @@ void a64_hybrid_fp32_mla_8x4 (
"mov x10, #0x0\n"
"156:" // Height 8: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 157f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1855,10 +1852,10 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v26.4s, v9.4s, v2.s[1]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
"add x12, x12, #0x40\n"
- "prfm pldl1keep, [x28, #0x80]\n"
"ldr q8, [x12, #0x0]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"prfm pldl1keep, [x27, #0x80]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
"fmla v31.4s, v9.4s, v7.s[1]\n"
@@ -1915,24 +1912,24 @@ void a64_hybrid_fp32_mla_8x4 (
"add x21, x21, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
+ "sub x9, x9, #0x4\n"
"prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "sub x9, x9, #0x4\n"
- "add x12, x12, #0x40\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
"fmla v31.4s, v9.4s, v7.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"prfm pldl1keep, [x21, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
@@ -1960,9 +1957,9 @@ void a64_hybrid_fp32_mla_8x4 (
"ldr s18, [x22], #0x4\n"
"ldr s17, [x21], #0x4\n"
"ldr q16, [x12, #0x0]\n"
- "add x12, x12, #0x10\n"
"fmla v24.4s, v16.4s, v0.s[0]\n"
"fmla v25.4s, v16.4s, v23.s[0]\n"
+ "add x12, x12, #0x10\n"
"fmla v26.4s, v16.4s, v22.s[0]\n"
"fmla v27.4s, v16.4s, v21.s[0]\n"
"fmla v28.4s, v16.4s, v20.s[0]\n"
@@ -1976,25 +1973,25 @@ void a64_hybrid_fp32_mla_8x4 (
"cmp x10, x20\n"
"bne 156b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x11, #0x0]\n"
- "add x28, x11, x20, LSL #2\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x27, x28, x20, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 164f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x21]\n"
"ld1r { v16.4s }, [x20]\n"
"fmin v24.4s, v24.4s, v17.4s\n"
"fmin v25.4s, v25.4s, v17.4s\n"
@@ -2017,44 +2014,44 @@ void a64_hybrid_fp32_mla_8x4 (
"bge 167f\n"
"tbz x13, #1, 165f\n"
"str d24, [x11], #0x8\n"
- "str d25, [x28], #0x8\n"
- "str d26, [x27], #0x8\n"
- "str d27, [x26], #0x8\n"
- "str d28, [x25], #0x8\n"
- "str d29, [x24], #0x8\n"
- "str d30, [x23], #0x8\n"
- "str d31, [x22], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
+ "str d29, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x13, #0, 166f\n"
"st1 { v24.s }[2], [x11]\n"
- "st1 { v25.s }[2], [x28]\n"
- "st1 { v26.s }[2], [x27]\n"
- "st1 { v27.s }[2], [x26]\n"
- "st1 { v28.s }[2], [x25]\n"
- "st1 { v29.s }[2], [x24]\n"
- "st1 { v30.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
"b 166f\n"
"165:" // Height 8: Partial direct writeback: partial_1_0
"str s24, [x11, #0x0]\n"
- "str s25, [x28, #0x0]\n"
- "str s26, [x27, #0x0]\n"
- "str s27, [x26, #0x0]\n"
- "str s28, [x25, #0x0]\n"
- "str s29, [x24, #0x0]\n"
- "str s30, [x23, #0x0]\n"
- "str s31, [x22, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
+ "str s29, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
"166:" // Height 8: Partial direct writeback: Done
"b 168f\n"
"167:" // Height 8: Full writeback
"str q24, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q25, [x28, #0x0]\n"
- "str q26, [x27, #0x0]\n"
- "str q27, [x26, #0x0]\n"
- "str q28, [x25, #0x0]\n"
- "str q29, [x24, #0x0]\n"
- "str q30, [x23, #0x0]\n"
- "str q31, [x22, #0x0]\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
+ "str q29, [x23, #0x0]\n"
+ "str q30, [x22, #0x0]\n"
+ "str q31, [x21, #0x0]\n"
"168:" // Height 8: Writeback done
"subs x13, x13, #0x4\n"
"bgt 149b\n"
@@ -2070,8 +2067,8 @@ void a64_hybrid_fp32_mla_8x4 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"170:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
index 453ef4888f..e04820cc10 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 24, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 24, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
index 39a34898d7..d36059d6a1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
@@ -48,19 +48,18 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -81,7 +80,6 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -101,27 +99,27 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"cmp %x[M], #0x2\n"
"bgt 87f\n"
"beq 44f\n"
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x10, 3f\n"
"ldr q8, [x10, #0x0]\n"
"ldr q9, [x10, #0x10]\n"
- "ldr q10, [x10, #0x20]\n"
- "ldr q11, [x10, #0x30]\n"
- "ldr q12, [x10, #0x40]\n"
- "ldr q13, [x10, #0x50]\n"
- "add x10, x10, #0x60\n"
"zip2 v14.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
"zip2 v15.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
"zip2 v16.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
"zip2 v17.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x10, x10, #0x60\n"
"zip2 v18.2d, v12.2d, v12.2d\n"
"zip1 v12.2d, v12.2d, v12.2d\n"
"zip2 v19.2d, v13.2d, v13.2d\n"
@@ -255,8 +253,8 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"mov x26, #0x0\n"
"20:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 21f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -279,9 +277,6 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"blt 24f\n"
"23:" // Height 1: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x25, x25, #0x4\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x25, #0x8\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
"ldr q24, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
@@ -298,42 +293,45 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"ldr q22, [x28, #0xa0]\n"
".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
"ldr q21, [x28, #0xb0]\n"
- "add x28, x28, #0xc0\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x8\n"
".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ "add x28, x28, #0xc0\n"
"ldr q4, [x28, #0x0]\n"
".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
"ldr q5, [x28, #0x10]\n"
".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
"ldr q6, [x28, #0x20]\n"
".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
- "ld1 { v0.4s }, [x24], #0x10\n"
"ldr q7, [x28, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
"bge 23b\n"
"24:" // Height 1: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x25, x25, #0x4\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
"ldr q23, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q22, [x28, #0x50]\n"
+ "ldr q25, [x28, #0x50]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
"ldr q21, [x28, #0x60]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
"ldr q24, [x28, #0x70]\n"
".inst 0x6e57ec0a // bfmmla v10.4s, v0.8h, v23.8h\n"
"ldr q23, [x28, #0x80]\n"
- ".inst 0x6e56ec10 // bfmmla v16.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e59ec10 // bfmmla v16.4s, v0.8h, v25.8h\n"
"ldr q22, [x28, #0x90]\n"
".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
"ldr q21, [x28, #0xa0]\n"
".inst 0x6e58ec11 // bfmmla v17.4s, v0.8h, v24.8h\n"
- "ldr q3, [x28, #0xb0]\n"
- "add x28, x28, #0xc0\n"
+ "ldr q5, [x28, #0xb0]\n"
+ "sub x25, x25, #0x4\n"
".inst 0x6e57ec0c // bfmmla v12.4s, v0.8h, v23.8h\n"
".inst 0x6e56ec12 // bfmmla v18.4s, v0.8h, v22.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x28, x28, #0xc0\n"
".inst 0x6e55ec0d // bfmmla v13.4s, v0.8h, v21.8h\n"
- ".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e45ec13 // bfmmla v19.4s, v0.8h, v5.8h\n"
"25:" // Height 1: Multiply loop: Main loop skip
"cbz x25, 28f\n"
"cbz x25, 28f\n"
@@ -345,32 +343,32 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"26:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr s0, [x24, #0x0]\n"
"27:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q23, [x28, #0x0]\n"
- "ldr q29, [x28, #0x10]\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "ldr q22, [x28, #0x20]\n"
- "ldr q21, [x28, #0x30]\n"
- ".inst 0x6e57ec08 // bfmmla v8.4s, v0.8h, v23.8h\n"
- "ldr q24, [x28, #0x40]\n"
- ".inst 0x6e5dec0e // bfmmla v14.4s, v0.8h, v29.8h\n"
+ ".inst 0x6e55ec08 // bfmmla v8.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q22, [x28, #0x30]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e55ec09 // bfmmla v9.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x40]\n"
"ldr q23, [x28, #0x50]\n"
- ".inst 0x6e56ec09 // bfmmla v9.4s, v0.8h, v22.8h\n"
- "ldr q22, [x28, #0x60]\n"
- ".inst 0x6e55ec0f // bfmmla v15.4s, v0.8h, v21.8h\n"
- "ldr q21, [x28, #0x70]\n"
- ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
- "ldr q24, [x28, #0x80]\n"
+ ".inst 0x6e56ec0f // bfmmla v15.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0a // bfmmla v10.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x60]\n"
+ "ldr q22, [x28, #0x70]\n"
".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x80]\n"
"ldr q23, [x28, #0x90]\n"
- ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e56ec11 // bfmmla v17.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0c // bfmmla v12.4s, v0.8h, v21.8h\n"
"ldr q22, [x28, #0xa0]\n"
- ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
"ldr q21, [x28, #0xb0]\n"
- "add x28, x28, #0xc0\n"
- ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x28, x28, #0xc0\n"
"28:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -384,9 +382,9 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"uzp1 v12.2d, v12.2d, v18.2d\n"
"uzp1 v13.2d, v13.2d, v19.2d\n"
"tbz %x[flags], #1, 29f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v22.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v22.4s }, [x21]\n"
"ld1r { v21.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v22.4s\n"
"fmin v9.4s, v9.4s, v22.4s\n"
@@ -489,27 +487,27 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"bgt 2b\n"
"b 174f\n"
"44:" // Height 2
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"45:" // Height 2: Column loop
"cbz x10, 46f\n"
"ldr q8, [x10, #0x0]\n"
"ldr q9, [x10, #0x10]\n"
- "ldr q10, [x10, #0x20]\n"
- "ldr q11, [x10, #0x30]\n"
- "ldr q12, [x10, #0x40]\n"
- "ldr q13, [x10, #0x50]\n"
- "add x10, x10, #0x60\n"
"zip2 v14.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
"zip2 v15.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
"zip2 v16.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
"zip2 v17.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x10, x10, #0x60\n"
"zip2 v18.2d, v12.2d, v12.2d\n"
"zip1 v12.2d, v12.2d, v12.2d\n"
"zip2 v19.2d, v13.2d, v13.2d\n"
@@ -519,117 +517,117 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"tbz %x[flags], #0, 61f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x9, #0x18\n"
- "add x24, x27, x20, LSL #2\n"
+ "add x23, x27, x20, LSL #2\n"
"bge 59f\n"
"tbz x9, #4, 50f\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
"ld1 { v11.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v12.4s }, [x27], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
"tbz x9, #2, 48f\n"
"ld1 { v13.4s }, [x27], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
"tbz x9, #1, 47f\n"
"ldr d20, [x27], #0x8\n"
- "ldr d19, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"mov x20, #0x58\n"
"tbz x9, #0, 58f\n"
"ld1 { v20.s }[2], [x27]\n"
- "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
"b 58f\n"
"47:" // Height 2: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x9, #0, 58f\n"
"ldr s20, [x27, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
"b 58f\n"
"48:" // Height 2: Partial accumulate: partial_2_16
"tbz x9, #1, 49f\n"
"ldr d13, [x27], #0x8\n"
- "ldr d18, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"mov x20, #0x48\n"
"tbz x9, #0, 58f\n"
"ld1 { v13.s }[2], [x27]\n"
- "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
"b 58f\n"
"49:" // Height 2: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x9, #0, 58f\n"
"ldr s13, [x27, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
"b 58f\n"
"50:" // Height 2: Partial accumulate: partial_8_0
"tbz x9, #3, 54f\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
"tbz x9, #2, 52f\n"
"ld1 { v11.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
"tbz x9, #1, 51f\n"
"ldr d12, [x27], #0x8\n"
- "ldr d17, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
"mov x20, #0x38\n"
"tbz x9, #0, 58f\n"
"ld1 { v12.s }[2], [x27]\n"
- "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x9, #0, 58f\n"
"ldr s12, [x27, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
"b 58f\n"
"52:" // Height 2: Partial accumulate: partial_2_8
"tbz x9, #1, 53f\n"
"ldr d11, [x27], #0x8\n"
- "ldr d16, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"mov x20, #0x28\n"
"tbz x9, #0, 58f\n"
"ld1 { v11.s }[2], [x27]\n"
- "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x9, #0, 58f\n"
"ldr s11, [x27, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
"b 58f\n"
"54:" // Height 2: Partial accumulate: partial_4_0
"tbz x9, #2, 56f\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
"tbz x9, #1, 55f\n"
"ldr d10, [x27], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
"mov x20, #0x18\n"
"tbz x9, #0, 58f\n"
"ld1 { v10.s }[2], [x27]\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x9, #0, 58f\n"
"ldr s10, [x27, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
"b 58f\n"
"56:" // Height 2: Partial accumulate: partial_2_0
"tbz x9, #1, 57f\n"
"ldr d9, [x27], #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
"mov x20, #0x8\n"
"tbz x9, #0, 58f\n"
"ld1 { v9.s }[2], [x27]\n"
- "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial accumulate: partial_1_0
"ldr s9, [x27, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
"mov x20, #0x0\n"
"58:" // Height 2: Partial accumulate: Done
"sub x27, x27, x20\n"
@@ -641,12 +639,12 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"ldr q12, [x27, #0x30]\n"
"ldr q13, [x27, #0x40]\n"
"ldr q20, [x27, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
"60:" // Height 2: MMLA fixup
"zip1 v8.2d, v9.2d, v14.2d\n"
"zip2 v14.2d, v9.2d, v14.2d\n"
@@ -678,8 +676,8 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"mov x26, #0x0\n"
"63:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 64f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -706,44 +704,41 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"blt 67f\n"
"66:" // Height 2: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x25, x25, #0x4\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "cmp x25, #0x8\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ld1 { v1.4s }, [x23], #0x10\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q29, [x28, #0x40]\n"
+ "ldr q3, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
"ldr q23, [x28, #0x50]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
"ldr q22, [x28, #0x60]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
"ldr q21, [x28, #0x70]\n"
- ".inst 0x6e5dec0a // bfmmla v10.4s, v0.8h, v29.8h\n"
- "ldr q30, [x28, #0x80]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ "ldr q1, [x28, #0x80]\n"
".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
"ldr q23, [x28, #0x90]\n"
".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
"ldr q22, [x28, #0xa0]\n"
".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
"ldr q21, [x28, #0xb0]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x8\n"
"add x28, x28, #0xc0\n"
- ".inst 0x6e5eec0c // bfmmla v12.4s, v0.8h, v30.8h\n"
+ ".inst 0x6e41ec0c // bfmmla v12.4s, v0.8h, v1.8h\n"
"ldr q4, [x28, #0x0]\n"
".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
"ldr q5, [x28, #0x10]\n"
".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
"ldr q6, [x28, #0x20]\n"
".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
- "ld1 { v0.4s }, [x24], #0x10\n"
"ldr q7, [x28, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
"bge 66b\n"
"67:" // Height 2: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x25, x25, #0x4\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
"ldr q24, [x28, #0x40]\n"
@@ -761,11 +756,14 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"ldr q22, [x28, #0xa0]\n"
".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
"ldr q21, [x28, #0xb0]\n"
- "add x28, x28, #0xc0\n"
+ "sub x25, x25, #0x4\n"
".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x28, x28, #0xc0\n"
"68:" // Height 2: Multiply loop: Main loop skip
"cbz x25, 71f\n"
"cbz x25, 71f\n"
@@ -783,55 +781,55 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"ldr q24, [x28, #0x0]\n"
"ldr q23, [x28, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
"ldr q22, [x28, #0x20]\n"
"ldr q21, [x28, #0x30]\n"
- ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x6e58ec08 // bfmmla v8.4s, v0.8h, v24.8h\n"
- "ldr q24, [x28, #0x40]\n"
".inst 0x6e57ec0e // bfmmla v14.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x28, #0x40]\n"
"ldr q23, [x28, #0x50]\n"
".inst 0x6e56ec09 // bfmmla v9.4s, v0.8h, v22.8h\n"
- "ldr q22, [x28, #0x60]\n"
".inst 0x6e55ec0f // bfmmla v15.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x28, #0x60]\n"
"ldr q21, [x28, #0x70]\n"
".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
- "ldr q24, [x28, #0x80]\n"
".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x28, #0x80]\n"
"ldr q23, [x28, #0x90]\n"
".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
- "ldr q22, [x28, #0xa0]\n"
".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
"ldr q21, [x28, #0xb0]\n"
- "add x28, x28, #0xc0\n"
".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x28, x28, #0xc0\n"
"71:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 63b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
"uzp1 v4.2d, v8.2d, v14.2d\n"
"uzp2 v8.2d, v8.2d, v14.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
"uzp1 v14.2d, v9.2d, v15.2d\n"
"uzp2 v9.2d, v9.2d, v15.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v15.2d, v10.2d, v16.2d\n"
"uzp2 v10.2d, v10.2d, v16.2d\n"
- "add x24, x27, x20, LSL #2\n"
"uzp1 v16.2d, v11.2d, v17.2d\n"
"uzp2 v11.2d, v11.2d, v17.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v17.2d, v12.2d, v18.2d\n"
"uzp2 v12.2d, v12.2d, v18.2d\n"
"uzp1 v18.2d, v13.2d, v19.2d\n"
"uzp2 v13.2d, v13.2d, v19.2d\n"
"tbz %x[flags], #1, 72f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v22.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v22.4s }, [x21]\n"
"ld1r { v21.4s }, [x20]\n"
"fmin v4.4s, v4.4s, v22.4s\n"
"fmin v14.4s, v14.4s, v22.4s\n"
@@ -865,99 +863,99 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"st1 { v14.4s }, [x27], #0x10\n"
"st1 { v15.4s }, [x27], #0x10\n"
"st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v11.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
"tbz x9, #2, 74f\n"
"st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
"tbz x9, #1, 73f\n"
"str d18, [x27], #0x8\n"
- "str d13, [x24], #0x8\n"
+ "str d13, [x23], #0x8\n"
"tbz x9, #0, 84f\n"
"st1 { v18.s }[2], [x27]\n"
- "st1 { v13.s }[2], [x24]\n"
+ "st1 { v13.s }[2], [x23]\n"
"b 84f\n"
"73:" // Height 2: Partial direct writeback: partial_1_20
"tbz x9, #0, 84f\n"
"str s18, [x27, #0x0]\n"
- "str s13, [x24, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
"b 84f\n"
"74:" // Height 2: Partial direct writeback: partial_2_16
"tbz x9, #1, 75f\n"
"str d17, [x27], #0x8\n"
- "str d12, [x24], #0x8\n"
+ "str d12, [x23], #0x8\n"
"tbz x9, #0, 84f\n"
"st1 { v17.s }[2], [x27]\n"
- "st1 { v12.s }[2], [x24]\n"
+ "st1 { v12.s }[2], [x23]\n"
"b 84f\n"
"75:" // Height 2: Partial direct writeback: partial_1_16
"tbz x9, #0, 84f\n"
"str s17, [x27, #0x0]\n"
- "str s12, [x24, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
"b 84f\n"
"76:" // Height 2: Partial direct writeback: partial_8_0
"tbz x9, #3, 80f\n"
"st1 { v4.4s }, [x27], #0x10\n"
"st1 { v14.4s }, [x27], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
"tbz x9, #2, 78f\n"
"st1 { v15.4s }, [x27], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
"tbz x9, #1, 77f\n"
"str d16, [x27], #0x8\n"
- "str d11, [x24], #0x8\n"
+ "str d11, [x23], #0x8\n"
"tbz x9, #0, 84f\n"
"st1 { v16.s }[2], [x27]\n"
- "st1 { v11.s }[2], [x24]\n"
+ "st1 { v11.s }[2], [x23]\n"
"b 84f\n"
"77:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 84f\n"
"str s16, [x27, #0x0]\n"
- "str s11, [x24, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
"b 84f\n"
"78:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 79f\n"
"str d15, [x27], #0x8\n"
- "str d10, [x24], #0x8\n"
+ "str d10, [x23], #0x8\n"
"tbz x9, #0, 84f\n"
"st1 { v15.s }[2], [x27]\n"
- "st1 { v10.s }[2], [x24]\n"
+ "st1 { v10.s }[2], [x23]\n"
"b 84f\n"
"79:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 84f\n"
"str s15, [x27, #0x0]\n"
- "str s10, [x24, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
"b 84f\n"
"80:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 82f\n"
"st1 { v4.4s }, [x27], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
"tbz x9, #1, 81f\n"
"str d14, [x27], #0x8\n"
- "str d9, [x24], #0x8\n"
+ "str d9, [x23], #0x8\n"
"tbz x9, #0, 84f\n"
"st1 { v14.s }[2], [x27]\n"
- "st1 { v9.s }[2], [x24]\n"
+ "st1 { v9.s }[2], [x23]\n"
"b 84f\n"
"81:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 84f\n"
"str s14, [x27, #0x0]\n"
- "str s9, [x24, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
"b 84f\n"
"82:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 83f\n"
"str d4, [x27], #0x8\n"
- "str d8, [x24], #0x8\n"
+ "str d8, [x23], #0x8\n"
"tbz x9, #0, 84f\n"
"st1 { v4.s }[2], [x27]\n"
- "st1 { v8.s }[2], [x24]\n"
+ "st1 { v8.s }[2], [x23]\n"
"b 84f\n"
"83:" // Height 2: Partial direct writeback: partial_1_0
"str s4, [x27, #0x0]\n"
- "str s8, [x24, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
"84:" // Height 2: Partial direct writeback: Done
"b 86f\n"
"85:" // Height 2: Full writeback
@@ -968,38 +966,38 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"str q17, [x27, #0x40]\n"
"str q18, [x27, #0x50]\n"
"add x27, x27, #0x60\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "str q12, [x24, #0x40]\n"
- "str q13, [x24, #0x50]\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
"86:" // Height 2: Writeback done
"subs x9, x9, #0x18\n"
"bgt 45b\n"
"b 174f\n"
"87:" // Height 3
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"88:" // Height 3: Column loop
"cbz x10, 89f\n"
"ldr q8, [x10, #0x0]\n"
"ldr q9, [x10, #0x10]\n"
- "ldr q10, [x10, #0x20]\n"
- "ldr q11, [x10, #0x30]\n"
- "ldr q12, [x10, #0x40]\n"
- "ldr q13, [x10, #0x50]\n"
- "add x10, x10, #0x60\n"
"zip2 v14.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
"zip2 v15.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
"zip2 v16.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
"zip2 v17.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x10, x10, #0x60\n"
"zip2 v18.2d, v12.2d, v12.2d\n"
"zip1 v12.2d, v12.2d, v12.2d\n"
"zip2 v19.2d, v13.2d, v13.2d\n"
@@ -1020,147 +1018,147 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"89:" // Height 3: no bias
"tbz %x[flags], #0, 104f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
"cmp x9, #0x18\n"
- "add x24, x27, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 102f\n"
"tbz x9, #4, 93f\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
"ld1 { v11.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
"ld1 { v12.4s }, [x27], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"tbz x9, #2, 91f\n"
"ld1 { v13.4s }, [x27], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x9, #1, 90f\n"
"ldr d20, [x27], #0x8\n"
- "ldr d19, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"mov x20, #0x58\n"
- "ldr d4, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
"tbz x9, #0, 101f\n"
"ld1 { v20.s }[2], [x27]\n"
- "ld1 { v19.s }[2], [x24]\n"
- "ld1 { v4.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
"b 101f\n"
"90:" // Height 3: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x9, #0, 101f\n"
"ldr s20, [x27, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
- "ldr s4, [x23, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
"b 101f\n"
"91:" // Height 3: Partial accumulate: partial_2_16
"tbz x9, #1, 92f\n"
"ldr d13, [x27], #0x8\n"
- "ldr d18, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"mov x20, #0x48\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x9, #0, 101f\n"
"ld1 { v13.s }[2], [x27]\n"
- "ld1 { v18.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 101f\n"
"92:" // Height 3: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x9, #0, 101f\n"
"ldr s13, [x27, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"b 101f\n"
"93:" // Height 3: Partial accumulate: partial_8_0
"tbz x9, #3, 97f\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
"tbz x9, #2, 95f\n"
"ld1 { v11.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
"tbz x9, #1, 94f\n"
"ldr d12, [x27], #0x8\n"
- "ldr d17, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x9, #0, 101f\n"
"ld1 { v12.s }[2], [x27]\n"
- "ld1 { v17.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
"b 101f\n"
"94:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x9, #0, 101f\n"
"ldr s12, [x27, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
"b 101f\n"
"95:" // Height 3: Partial accumulate: partial_2_8
"tbz x9, #1, 96f\n"
"ldr d11, [x27], #0x8\n"
- "ldr d16, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"mov x20, #0x28\n"
- "ldr d23, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
"tbz x9, #0, 101f\n"
"ld1 { v11.s }[2], [x27]\n"
- "ld1 { v16.s }[2], [x24]\n"
- "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
"b 101f\n"
"96:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x9, #0, 101f\n"
"ldr s11, [x27, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
- "ldr s23, [x23, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
"b 101f\n"
"97:" // Height 3: Partial accumulate: partial_4_0
"tbz x9, #2, 99f\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
"tbz x9, #1, 98f\n"
"ldr d10, [x27], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
"mov x20, #0x18\n"
- "ldr d22, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
"tbz x9, #0, 101f\n"
"ld1 { v10.s }[2], [x27]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
"b 101f\n"
"98:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x9, #0, 101f\n"
"ldr s10, [x27, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
"b 101f\n"
"99:" // Height 3: Partial accumulate: partial_2_0
"tbz x9, #1, 100f\n"
"ldr d9, [x27], #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
"mov x20, #0x8\n"
- "ldr d21, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
"tbz x9, #0, 101f\n"
"ld1 { v9.s }[2], [x27]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
"b 101f\n"
"100:" // Height 3: Partial accumulate: partial_1_0
"ldr s9, [x27, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s21, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
"101:" // Height 3: Partial accumulate: Done
"sub x27, x27, x20\n"
"b 103f\n"
@@ -1171,18 +1169,18 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"ldr q12, [x27, #0x30]\n"
"ldr q13, [x27, #0x40]\n"
"ldr q20, [x27, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
- "ldr q21, [x23, #0x0]\n"
- "ldr q22, [x23, #0x10]\n"
- "ldr q23, [x23, #0x20]\n"
- "ldr q24, [x23, #0x30]\n"
- "ldr q25, [x23, #0x40]\n"
- "ldr q4, [x23, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q21, [x22, #0x0]\n"
+ "ldr q22, [x22, #0x10]\n"
+ "ldr q23, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q25, [x22, #0x40]\n"
+ "ldr q4, [x22, #0x50]\n"
"103:" // Height 3: MMLA fixup
"zip1 v8.2d, v9.2d, v14.2d\n"
"zip2 v14.2d, v9.2d, v14.2d\n"
@@ -1238,8 +1236,8 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"mov x26, #0x0\n"
"106:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 107f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1270,33 +1268,33 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"blt 110f\n"
"109:" // Height 3: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "sub x25, x25, #0x4\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x25, #0x8\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ld1 { v1.4s }, [x23], #0x10\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"ldr q4, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
"ldr q5, [x28, #0x50]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
"ldr q6, [x28, #0x60]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
"ldr q3, [x28, #0x70]\n"
".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
"ldr q4, [x28, #0x80]\n"
".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
"ldr q5, [x28, #0x90]\n"
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
"ldr q6, [x28, #0xa0]\n"
".inst 0x6e43ec11 // bfmmla v17.4s, v0.8h, v3.8h\n"
@@ -1320,25 +1318,25 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"bge 109b\n"
"110:" // Height 3: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "sub x25, x25, #0x4\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"ldr q3, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
"ldr q4, [x28, #0x50]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
"ldr q6, [x28, #0x60]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
"ldr q1, [x28, #0x70]\n"
".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
"ldr q5, [x28, #0x80]\n"
".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
@@ -1379,21 +1377,21 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"ldr q5, [x28, #0x0]\n"
"ldr q4, [x28, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "ldr q3, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
+ "ldr q3, [x28, #0x20]\n"
+ "ldr q1, [x28, #0x30]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
"ldr q5, [x28, #0x40]\n"
".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
"ldr q4, [x28, #0x50]\n"
".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n"
"ldr q3, [x28, #0x60]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5b // bfmmla v27.4s, v2.8h, v1.8h\n"
"ldr q1, [x28, #0x70]\n"
".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
@@ -1422,19 +1420,19 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"cmp x26, x20\n"
"bne 106b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp1 v4.2d, v8.2d, v14.2d\n"
"uzp2 v8.2d, v8.2d, v14.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
"uzp1 v14.2d, v9.2d, v15.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v9.2d, v9.2d, v15.2d\n"
"uzp1 v15.2d, v10.2d, v16.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v10.2d, v10.2d, v16.2d\n"
- "add x24, x27, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v16.2d, v11.2d, v17.2d\n"
"uzp2 v11.2d, v11.2d, v17.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v17.2d, v12.2d, v18.2d\n"
"uzp2 v12.2d, v12.2d, v18.2d\n"
"uzp1 v18.2d, v13.2d, v19.2d\n"
@@ -1446,9 +1444,9 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"uzp1 v24.2d, v24.2d, v30.2d\n"
"uzp1 v25.2d, v25.2d, v31.2d\n"
"tbz %x[flags], #1, 115f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v4.4s, v4.4s, v1.4s\n"
"fmin v14.4s, v14.4s, v1.4s\n"
@@ -1494,126 +1492,126 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"st1 { v14.4s }, [x27], #0x10\n"
"st1 { v15.4s }, [x27], #0x10\n"
"st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v11.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
"tbz x9, #2, 117f\n"
"st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x9, #1, 116f\n"
"str d18, [x27], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x9, #0, 127f\n"
"st1 { v18.s }[2], [x27]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 127f\n"
"116:" // Height 3: Partial direct writeback: partial_1_20
"tbz x9, #0, 127f\n"
"str s18, [x27, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 127f\n"
"117:" // Height 3: Partial direct writeback: partial_2_16
"tbz x9, #1, 118f\n"
"str d17, [x27], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #0, 127f\n"
"st1 { v17.s }[2], [x27]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 127f\n"
"118:" // Height 3: Partial direct writeback: partial_1_16
"tbz x9, #0, 127f\n"
"str s17, [x27, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"b 127f\n"
"119:" // Height 3: Partial direct writeback: partial_8_0
"tbz x9, #3, 123f\n"
"st1 { v4.4s }, [x27], #0x10\n"
"st1 { v14.4s }, [x27], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
"tbz x9, #2, 121f\n"
"st1 { v15.4s }, [x27], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
"tbz x9, #1, 120f\n"
"str d16, [x27], #0x8\n"
- "str d11, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
"tbz x9, #0, 127f\n"
"st1 { v16.s }[2], [x27]\n"
- "st1 { v11.s }[2], [x24]\n"
- "st1 { v23.s }[2], [x23]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
"b 127f\n"
"120:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 127f\n"
"str s16, [x27, #0x0]\n"
- "str s11, [x24, #0x0]\n"
- "str s23, [x23, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
"b 127f\n"
"121:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 122f\n"
"str d15, [x27], #0x8\n"
- "str d10, [x24], #0x8\n"
- "str d22, [x23], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
"tbz x9, #0, 127f\n"
"st1 { v15.s }[2], [x27]\n"
- "st1 { v10.s }[2], [x24]\n"
- "st1 { v22.s }[2], [x23]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
"b 127f\n"
"122:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 127f\n"
"str s15, [x27, #0x0]\n"
- "str s10, [x24, #0x0]\n"
- "str s22, [x23, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
"b 127f\n"
"123:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 125f\n"
"st1 { v4.4s }, [x27], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
"tbz x9, #1, 124f\n"
"str d14, [x27], #0x8\n"
- "str d9, [x24], #0x8\n"
- "str d21, [x23], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
"tbz x9, #0, 127f\n"
"st1 { v14.s }[2], [x27]\n"
- "st1 { v9.s }[2], [x24]\n"
- "st1 { v21.s }[2], [x23]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
"b 127f\n"
"124:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 127f\n"
"str s14, [x27, #0x0]\n"
- "str s9, [x24, #0x0]\n"
- "str s21, [x23, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
"b 127f\n"
"125:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 126f\n"
"str d4, [x27], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d20, [x23], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
"tbz x9, #0, 127f\n"
"st1 { v4.s }[2], [x27]\n"
- "st1 { v8.s }[2], [x24]\n"
- "st1 { v20.s }[2], [x23]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
"b 127f\n"
"126:" // Height 3: Partial direct writeback: partial_1_0
"str s4, [x27, #0x0]\n"
- "str s8, [x24, #0x0]\n"
- "str s20, [x23, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
"127:" // Height 3: Partial direct writeback: Done
"b 129f\n"
"128:" // Height 3: Full writeback
@@ -1624,48 +1622,47 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"str q17, [x27, #0x40]\n"
"str q18, [x27, #0x50]\n"
"add x27, x27, #0x60\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "str q12, [x24, #0x40]\n"
- "str q13, [x24, #0x50]\n"
- "str q20, [x23, #0x0]\n"
- "str q21, [x23, #0x10]\n"
- "str q22, [x23, #0x20]\n"
- "str q23, [x23, #0x30]\n"
- "str q24, [x23, #0x40]\n"
- "str q25, [x23, #0x50]\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
"129:" // Height 3: Writeback done
"subs x9, x9, #0x18\n"
"bgt 88b\n"
"b 174f\n"
"130:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x10\n"
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"131:" // Height 4: Column loop
"cbz x10, 132f\n"
"ldr q8, [x10, #0x0]\n"
"ldr q9, [x10, #0x10]\n"
- "ldr q10, [x10, #0x20]\n"
- "ldr q11, [x10, #0x30]\n"
- "ldr q12, [x10, #0x40]\n"
- "ldr q13, [x10, #0x50]\n"
- "add x10, x10, #0x60\n"
"zip2 v14.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
"zip2 v15.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
"zip2 v16.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
"zip2 v17.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x10, x10, #0x60\n"
"zip2 v18.2d, v12.2d, v12.2d\n"
"zip1 v12.2d, v12.2d, v12.2d\n"
"zip2 v19.2d, v13.2d, v13.2d\n"
@@ -1686,175 +1683,175 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"132:" // Height 4: no bias
"tbz %x[flags], #0, 147f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x9, #0x18\n"
- "add x24, x27, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x27, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x9, #0x18\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 145f\n"
"tbz x9, #4, 136f\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
- "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
"ld1 { v11.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v12.4s }, [x27], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x9, #2, 134f\n"
"ld1 { v13.4s }, [x27], #0x10\n"
- "ld1 { v18.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x9, #1, 133f\n"
"ldr d20, [x27], #0x8\n"
- "ldr d19, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"mov x20, #0x58\n"
- "ldr d4, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x9, #0, 144f\n"
"ld1 { v20.s }[2], [x27]\n"
- "ld1 { v19.s }[2], [x24]\n"
- "ld1 { v4.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 144f\n"
"133:" // Height 4: Partial accumulate: partial_1_20
"mov x20, #0x50\n"
"tbz x9, #0, 144f\n"
"ldr s20, [x27, #0x0]\n"
- "ldr s19, [x24, #0x0]\n"
- "ldr s4, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 144f\n"
"134:" // Height 4: Partial accumulate: partial_2_16
"tbz x9, #1, 135f\n"
"ldr d13, [x27], #0x8\n"
- "ldr d18, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"mov x20, #0x48\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x9, #0, 144f\n"
"ld1 { v13.s }[2], [x27]\n"
- "ld1 { v18.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 144f\n"
"135:" // Height 4: Partial accumulate: partial_1_16
"mov x20, #0x40\n"
"tbz x9, #0, 144f\n"
"ldr s13, [x27, #0x0]\n"
- "ldr s18, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 144f\n"
"136:" // Height 4: Partial accumulate: partial_8_0
"tbz x9, #3, 140f\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v10.4s }, [x27], #0x10\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
- "ld1 { v22.4s }, [x23], #0x10\n"
- "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
"tbz x9, #2, 138f\n"
"ld1 { v11.4s }, [x27], #0x10\n"
- "ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v23.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x9, #1, 137f\n"
"ldr d12, [x27], #0x8\n"
- "ldr d17, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x9, #0, 144f\n"
"ld1 { v12.s }[2], [x27]\n"
- "ld1 { v17.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 144f\n"
"137:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x9, #0, 144f\n"
"ldr s12, [x27, #0x0]\n"
- "ldr s17, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 144f\n"
"138:" // Height 4: Partial accumulate: partial_2_8
"tbz x9, #1, 139f\n"
"ldr d11, [x27], #0x8\n"
- "ldr d16, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"mov x20, #0x28\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x9, #0, 144f\n"
"ld1 { v11.s }[2], [x27]\n"
- "ld1 { v16.s }[2], [x24]\n"
- "ld1 { v23.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 144f\n"
"139:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x9, #0, 144f\n"
"ldr s11, [x27, #0x0]\n"
- "ldr s16, [x24, #0x0]\n"
- "ldr s23, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"b 144f\n"
"140:" // Height 4: Partial accumulate: partial_4_0
"tbz x9, #2, 142f\n"
"ld1 { v9.4s }, [x27], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v21.4s }, [x23], #0x10\n"
- "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
"tbz x9, #1, 141f\n"
"ldr d10, [x27], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
"mov x20, #0x18\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d27, [x22], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
"tbz x9, #0, 144f\n"
"ld1 { v10.s }[2], [x27]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
- "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
"b 144f\n"
"141:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x9, #0, 144f\n"
"ldr s10, [x27, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
- "ldr s27, [x22, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
"b 144f\n"
"142:" // Height 4: Partial accumulate: partial_2_0
"tbz x9, #1, 143f\n"
"ldr d9, [x27], #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
"mov x20, #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d26, [x22], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
"tbz x9, #0, 144f\n"
"ld1 { v9.s }[2], [x27]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v21.s }[2], [x23]\n"
- "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
"b 144f\n"
"143:" // Height 4: Partial accumulate: partial_1_0
"ldr s9, [x27, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s21, [x23, #0x0]\n"
- "ldr s26, [x22, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
"144:" // Height 4: Partial accumulate: Done
"sub x27, x27, x20\n"
"b 146f\n"
@@ -1865,24 +1862,24 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"ldr q12, [x27, #0x30]\n"
"ldr q13, [x27, #0x40]\n"
"ldr q20, [x27, #0x50]\n"
- "ldr q14, [x24, #0x0]\n"
- "ldr q15, [x24, #0x10]\n"
- "ldr q16, [x24, #0x20]\n"
- "ldr q17, [x24, #0x30]\n"
- "ldr q18, [x24, #0x40]\n"
- "ldr q19, [x24, #0x50]\n"
- "ldr q21, [x23, #0x0]\n"
- "ldr q22, [x23, #0x10]\n"
- "ldr q23, [x23, #0x20]\n"
- "ldr q24, [x23, #0x30]\n"
- "ldr q25, [x23, #0x40]\n"
- "ldr q4, [x23, #0x50]\n"
- "ldr q26, [x22, #0x0]\n"
- "ldr q27, [x22, #0x10]\n"
- "ldr q28, [x22, #0x20]\n"
- "ldr q29, [x22, #0x30]\n"
- "ldr q30, [x22, #0x40]\n"
- "ldr q31, [x22, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q21, [x22, #0x0]\n"
+ "ldr q22, [x22, #0x10]\n"
+ "ldr q23, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q25, [x22, #0x40]\n"
+ "ldr q4, [x22, #0x50]\n"
+ "ldr q26, [x21, #0x0]\n"
+ "ldr q27, [x21, #0x10]\n"
+ "ldr q28, [x21, #0x20]\n"
+ "ldr q29, [x21, #0x30]\n"
+ "ldr q30, [x21, #0x40]\n"
+ "ldr q31, [x21, #0x50]\n"
"146:" // Height 4: MMLA fixup
"zip1 v8.2d, v9.2d, v14.2d\n"
"zip2 v14.2d, v9.2d, v14.2d\n"
@@ -1938,8 +1935,8 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"mov x26, #0x0\n"
"149:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 150f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1976,28 +1973,28 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
"sub x25, x25, #0x4\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"cmp x25, #0x8\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ld1 { v1.4s }, [x23], #0x10\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
- "ld1 { v3.4s }, [x21], #0x10\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
"ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
"ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
"ldr q7, [x28, #0x70]\n"
".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "ld1 { v3.4s }, [x21], #0x10\n"
".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
"ldr q4, [x28, #0x80]\n"
".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
@@ -2030,21 +2027,21 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
"sub x25, x25, #0x4\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"ldr q3, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
"ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
"ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
"ldr q1, [x28, #0x70]\n"
".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
@@ -2135,23 +2132,23 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"cmp x26, x20\n"
"bne 149b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp1 v4.2d, v8.2d, v14.2d\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v14.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
"uzp1 v14.2d, v9.2d, v15.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"uzp2 v9.2d, v9.2d, v15.2d\n"
"uzp1 v15.2d, v10.2d, v16.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v10.2d, v10.2d, v16.2d\n"
- "add x24, x27, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v16.2d, v11.2d, v17.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v11.2d, v11.2d, v17.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v17.2d, v12.2d, v18.2d\n"
"uzp2 v12.2d, v12.2d, v18.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v18.2d, v13.2d, v19.2d\n"
"uzp2 v13.2d, v13.2d, v19.2d\n"
"uzp1 v19.2d, v20.2d, v26.2d\n"
@@ -2167,9 +2164,9 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"uzp1 v30.2d, v25.2d, v31.2d\n"
"uzp2 v25.2d, v25.2d, v31.2d\n"
"tbz %x[flags], #1, 158f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v4.4s, v4.4s, v1.4s\n"
"fmin v14.4s, v14.4s, v1.4s\n"
@@ -2227,153 +2224,153 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"st1 { v14.4s }, [x27], #0x10\n"
"st1 { v15.4s }, [x27], #0x10\n"
"st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v11.4s }, [x24], #0x10\n"
- "st1 { v19.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v27.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
- "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v27.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
"tbz x9, #2, 160f\n"
"st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v29.4s }, [x23], #0x10\n"
- "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
"tbz x9, #1, 159f\n"
"str d18, [x27], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d30, [x23], #0x8\n"
- "str d25, [x22], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
"tbz x9, #0, 170f\n"
"st1 { v18.s }[2], [x27]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v30.s }[2], [x23]\n"
- "st1 { v25.s }[2], [x22]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
"b 170f\n"
"159:" // Height 4: Partial direct writeback: partial_1_20
"tbz x9, #0, 170f\n"
"str s18, [x27, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s30, [x23, #0x0]\n"
- "str s25, [x22, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
"b 170f\n"
"160:" // Height 4: Partial direct writeback: partial_2_16
"tbz x9, #1, 161f\n"
"str d17, [x27], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "str d24, [x22], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
"tbz x9, #0, 170f\n"
"st1 { v17.s }[2], [x27]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
- "st1 { v24.s }[2], [x22]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
"b 170f\n"
"161:" // Height 4: Partial direct writeback: partial_1_16
"tbz x9, #0, 170f\n"
"str s17, [x27, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
- "str s24, [x22, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
"b 170f\n"
"162:" // Height 4: Partial direct writeback: partial_8_0
"tbz x9, #3, 166f\n"
"st1 { v4.4s }, [x27], #0x10\n"
"st1 { v14.4s }, [x27], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v19.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
"tbz x9, #2, 164f\n"
"st1 { v15.4s }, [x27], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v27.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v27.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
"tbz x9, #1, 163f\n"
"str d16, [x27], #0x8\n"
- "str d11, [x24], #0x8\n"
- "str d28, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d28, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
"tbz x9, #0, 170f\n"
"st1 { v16.s }[2], [x27]\n"
- "st1 { v11.s }[2], [x24]\n"
- "st1 { v28.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
"b 170f\n"
"163:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 170f\n"
"str s16, [x27, #0x0]\n"
- "str s11, [x24, #0x0]\n"
- "str s28, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s28, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
"b 170f\n"
"164:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 165f\n"
"str d15, [x27], #0x8\n"
- "str d10, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
"tbz x9, #0, 170f\n"
"st1 { v15.s }[2], [x27]\n"
- "st1 { v10.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
"b 170f\n"
"165:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 170f\n"
"str s15, [x27, #0x0]\n"
- "str s10, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
"b 170f\n"
"166:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 168f\n"
"st1 { v4.4s }, [x27], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v19.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
"tbz x9, #1, 167f\n"
"str d14, [x27], #0x8\n"
- "str d9, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
"tbz x9, #0, 170f\n"
"st1 { v14.s }[2], [x27]\n"
- "st1 { v9.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
"b 170f\n"
"167:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 170f\n"
"str s14, [x27, #0x0]\n"
- "str s9, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
"b 170f\n"
"168:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 169f\n"
"str d4, [x27], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
"tbz x9, #0, 170f\n"
"st1 { v4.s }[2], [x27]\n"
- "st1 { v8.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
"b 170f\n"
"169:" // Height 4: Partial direct writeback: partial_1_0
"str s4, [x27, #0x0]\n"
- "str s8, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
"170:" // Height 4: Partial direct writeback: Done
"b 172f\n"
"171:" // Height 4: Full writeback
@@ -2384,24 +2381,24 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"str q17, [x27, #0x40]\n"
"str q18, [x27, #0x50]\n"
"add x27, x27, #0x60\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "str q12, [x24, #0x40]\n"
- "str q13, [x24, #0x50]\n"
- "str q19, [x23, #0x0]\n"
- "str q26, [x23, #0x10]\n"
- "str q27, [x23, #0x20]\n"
- "str q28, [x23, #0x30]\n"
- "str q29, [x23, #0x40]\n"
- "str q30, [x23, #0x50]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "str q24, [x22, #0x40]\n"
- "str q25, [x22, #0x50]\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
+ "str q19, [x22, #0x0]\n"
+ "str q26, [x22, #0x10]\n"
+ "str q27, [x22, #0x20]\n"
+ "str q28, [x22, #0x30]\n"
+ "str q29, [x22, #0x40]\n"
+ "str q30, [x22, #0x50]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x21, #0x40]\n"
+ "str q25, [x21, #0x50]\n"
"172:" // Height 4: Writeback done
"subs x9, x9, #0x18\n"
"bgt 131b\n"
@@ -2417,8 +2414,8 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"174:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
index 6780b76a3a..191528d7f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
index d4a4f18d2b..9dde5b0b92 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
@@ -48,19 +48,18 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -81,7 +80,6 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -104,23 +102,23 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"cmp %x[M], #0x2\n"
"bgt 71f\n"
"beq 36f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"cbz x12, 3f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"b 15f\n"
@@ -211,8 +209,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"16:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -233,9 +231,6 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"blt 20f\n"
"19:" // Height 1: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "cmp x27, #0x8\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
@@ -248,17 +243,18 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ "add x10, x10, #0x80\n"
"ldr q6, [x10, #0x0]\n"
".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
- "ld1 { v0.4s }, [x26], #0x10\n"
"ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
"bge 19b\n"
"20:" // Height 1: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
@@ -271,9 +267,11 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "sub x27, x27, #0x4\n"
".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x80\n"
"21:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 24f\n"
"cbz x27, 24f\n"
@@ -300,9 +298,9 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -314,9 +312,9 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"uzp1 v10.2d, v10.2d, v14.2d\n"
"uzp1 v11.2d, v11.2d, v15.2d\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v18.4s }, [x21]\n"
"ld1r { v17.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v18.4s\n"
"fmin v9.4s, v9.4s, v18.4s\n"
@@ -386,23 +384,23 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"bgt 2b\n"
"b 212f\n"
"36:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"37:" // Height 2: Column loop
"cbz x12, 38f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"b 50f\n"
@@ -410,75 +408,75 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"tbz %x[flags], #0, 49f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"bge 47f\n"
"tbz x11, #3, 42f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
"tbz x11, #2, 40f\n"
"ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
"tbz x11, #1, 39f\n"
"ldr d16, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
"tbz x11, #0, 46f\n"
"ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x25]\n"
"b 46f\n"
"39:" // Height 2: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 46f\n"
"ldr s16, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
"b 46f\n"
"40:" // Height 2: Partial accumulate: partial_2_8
"tbz x11, #1, 41f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
"tbz x11, #0, 46f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x25]\n"
"b 46f\n"
"41:" // Height 2: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 46f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
"b 46f\n"
"42:" // Height 2: Partial accumulate: partial_4_0
"tbz x11, #2, 44f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"tbz x11, #1, 43f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
"tbz x11, #0, 46f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v13.s }[2], [x25]\n"
"b 46f\n"
"43:" // Height 2: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 46f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
"b 46f\n"
"44:" // Height 2: Partial accumulate: partial_2_0
"tbz x11, #1, 45f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
"tbz x11, #0, 46f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v12.s }[2], [x25]\n"
"b 46f\n"
"45:" // Height 2: Partial accumulate: partial_1_0
"ldr s9, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
"46:" // Height 2: Partial accumulate: Done
"sub x9, x9, x20\n"
@@ -488,10 +486,10 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q10, [x9, #0x10]\n"
"ldr q11, [x9, #0x20]\n"
"ldr q16, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"48:" // Height 2: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -515,8 +513,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"51:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 52f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -541,12 +539,7 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"blt 55f\n"
"54:" // Height 2: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "cmp x27, #0x8\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ld1 { v1.4s }, [x25], #0x10\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
@@ -559,18 +552,20 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
"ldr q17, [x10, #0x70]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
"add x10, x10, #0x80\n"
".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
"ldr q6, [x10, #0x0]\n"
".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
- "ld1 { v0.4s }, [x26], #0x10\n"
"ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
"bge 54b\n"
"55:" // Height 2: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
"ldr q18, [x10, #0x20]\n"
@@ -584,9 +579,12 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "sub x27, x27, #0x4\n"
".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x10, x10, #0x80\n"
"56:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 59f\n"
"cbz x27, 59f\n"
@@ -617,30 +615,30 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
"59:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 51b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"uzp1 v6.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x9, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x26, #0x0]\n"
"tbz %x[flags], #1, 60f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v18.4s }, [x21]\n"
"ld1r { v17.4s }, [x20]\n"
"fmin v6.4s, v6.4s, v18.4s\n"
"fmin v12.4s, v12.4s, v18.4s\n"
@@ -664,63 +662,63 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"tbz x11, #3, 64f\n"
"st1 { v6.4s }, [x9], #0x10\n"
"st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
"tbz x11, #2, 62f\n"
"st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
"tbz x11, #1, 61f\n"
"str d14, [x9], #0x8\n"
- "str d11, [x26], #0x8\n"
+ "str d11, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
"st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x26]\n"
+ "st1 { v11.s }[2], [x25]\n"
"b 68f\n"
"61:" // Height 2: Partial direct writeback: partial_1_12
"tbz x11, #0, 68f\n"
"str s14, [x9, #0x0]\n"
- "str s11, [x26, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
"b 68f\n"
"62:" // Height 2: Partial direct writeback: partial_2_8
"tbz x11, #1, 63f\n"
"str d13, [x9], #0x8\n"
- "str d10, [x26], #0x8\n"
+ "str d10, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
"st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x26]\n"
+ "st1 { v10.s }[2], [x25]\n"
"b 68f\n"
"63:" // Height 2: Partial direct writeback: partial_1_8
"tbz x11, #0, 68f\n"
"str s13, [x9, #0x0]\n"
- "str s10, [x26, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
"b 68f\n"
"64:" // Height 2: Partial direct writeback: partial_4_0
"tbz x11, #2, 66f\n"
"st1 { v6.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
"tbz x11, #1, 65f\n"
"str d12, [x9], #0x8\n"
- "str d9, [x26], #0x8\n"
+ "str d9, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
"st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x26]\n"
+ "st1 { v9.s }[2], [x25]\n"
"b 68f\n"
"65:" // Height 2: Partial direct writeback: partial_1_4
"tbz x11, #0, 68f\n"
"str s12, [x9, #0x0]\n"
- "str s9, [x26, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
"b 68f\n"
"66:" // Height 2: Partial direct writeback: partial_2_0
"tbz x11, #1, 67f\n"
"str d6, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
+ "str d8, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
"st1 { v6.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x26]\n"
+ "st1 { v8.s }[2], [x25]\n"
"b 68f\n"
"67:" // Height 2: Partial direct writeback: partial_1_0
"str s6, [x9, #0x0]\n"
- "str s8, [x26, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
"68:" // Height 2: Partial direct writeback: Done
"b 70f\n"
"69:" // Height 2: Full writeback
@@ -729,32 +727,32 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"str q13, [x9, #0x20]\n"
"str q14, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
"70:" // Height 2: Writeback done
"subs x11, x11, #0x10\n"
"bgt 37b\n"
"b 212f\n"
"71:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"72:" // Height 3: Column loop
"cbz x12, 73f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -769,94 +767,94 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"73:" // Height 3: no bias
"tbz %x[flags], #0, 84f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 82f\n"
"tbz x11, #3, 77f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #2, 75f\n"
"ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
"tbz x11, #1, 74f\n"
"ldr d16, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
"ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
"b 81f\n"
"74:" // Height 3: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 81f\n"
"ldr s16, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
"b 81f\n"
"75:" // Height 3: Partial accumulate: partial_2_8
"tbz x11, #1, 76f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
"b 81f\n"
"76:" // Height 3: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 81f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
"b 81f\n"
"77:" // Height 3: Partial accumulate: partial_4_0
"tbz x11, #2, 79f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #1, 78f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
"b 81f\n"
"78:" // Height 3: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 81f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
"b 81f\n"
"79:" // Height 3: Partial accumulate: partial_2_0
"tbz x11, #1, 80f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 81f\n"
"80:" // Height 3: Partial accumulate: partial_1_0
"ldr s9, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
"81:" // Height 3: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 83f\n"
@@ -865,14 +863,14 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q10, [x9, #0x10]\n"
"ldr q11, [x9, #0x20]\n"
"ldr q16, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
"83:" // Height 3: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -912,8 +910,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"86:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 87f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -942,33 +940,33 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"blt 90f\n"
"89:" // Height 3: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "cmp x27, #0x8\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ld1 { v1.4s }, [x25], #0x10\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
"ldr q26, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ "cmp x27, #0x8\n"
".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
"ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
"ldr q6, [x10, #0x0]\n"
@@ -980,25 +978,25 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"bge 89b\n"
"90:" // Height 3: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
"ldr q26, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
@@ -1029,13 +1027,13 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q26, [x10, #0x0]\n"
"ldr q25, [x10, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- ".inst 0x6e5aec50 // bfmmla v16.4s, v2.8h, v26.8h\n"
- ".inst 0x6e59ec54 // bfmmla v20.4s, v2.8h, v25.8h\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x6e5aec08 // bfmmla v8.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec50 // bfmmla v16.4s, v2.8h, v26.8h\n"
"ldr q26, [x10, #0x20]\n"
".inst 0x6e59ec0c // bfmmla v12.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec54 // bfmmla v20.4s, v2.8h, v25.8h\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
@@ -1060,27 +1058,27 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 86b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v6.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"uzp1 v16.2d, v16.2d, v20.2d\n"
"uzp1 v17.2d, v17.2d, v21.2d\n"
"uzp1 v18.2d, v18.2d, v22.2d\n"
"uzp1 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 95f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v26.4s }, [x21]\n"
"ld1r { v25.4s }, [x20]\n"
"fmin v6.4s, v6.4s, v26.4s\n"
"fmin v12.4s, v12.4s, v26.4s\n"
@@ -1112,79 +1110,79 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"tbz x11, #3, 99f\n"
"st1 { v6.4s }, [x9], #0x10\n"
"st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
- "st1 { v17.4s }, [x25], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #2, 97f\n"
"st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v18.4s }, [x25], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #1, 96f\n"
"str d14, [x9], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d19, [x25], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
"st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v19.s }[2], [x25]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
"b 103f\n"
"96:" // Height 3: Partial direct writeback: partial_1_12
"tbz x11, #0, 103f\n"
"str s14, [x9, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s19, [x25, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
"b 103f\n"
"97:" // Height 3: Partial direct writeback: partial_2_8
"tbz x11, #1, 98f\n"
"str d13, [x9], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d18, [x25], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
"st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v18.s }[2], [x25]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
"b 103f\n"
"98:" // Height 3: Partial direct writeback: partial_1_8
"tbz x11, #0, 103f\n"
"str s13, [x9, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s18, [x25, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
"b 103f\n"
"99:" // Height 3: Partial direct writeback: partial_4_0
"tbz x11, #2, 101f\n"
"st1 { v6.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v16.4s }, [x25], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
"tbz x11, #1, 100f\n"
"str d12, [x9], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d17, [x25], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
"st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v17.s }[2], [x25]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
"b 103f\n"
"100:" // Height 3: Partial direct writeback: partial_1_4
"tbz x11, #0, 103f\n"
"str s12, [x9, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s17, [x25, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
"b 103f\n"
"101:" // Height 3: Partial direct writeback: partial_2_0
"tbz x11, #1, 102f\n"
"str d6, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
"st1 { v6.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v16.s }[2], [x25]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
"b 103f\n"
"102:" // Height 3: Partial direct writeback: partial_1_0
"str s6, [x9, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s16, [x25, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
"103:" // Height 3: Partial direct writeback: Done
"b 105f\n"
"104:" // Height 3: Full writeback
@@ -1193,36 +1191,36 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"str q13, [x9, #0x20]\n"
"str q14, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q16, [x25, #0x0]\n"
- "str q17, [x25, #0x10]\n"
- "str q18, [x25, #0x20]\n"
- "str q19, [x25, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"105:" // Height 3: Writeback done
"subs x11, x11, #0x10\n"
"bgt 72b\n"
"b 212f\n"
"106:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"107:" // Height 4: Column loop
"cbz x12, 108f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -1237,111 +1235,111 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"108:" // Height 4: no bias
"tbz %x[flags], #0, 119f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 117f\n"
"tbz x11, #3, 112f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
"tbz x11, #2, 110f\n"
"ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
"tbz x11, #1, 109f\n"
"ldr d16, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
"ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
"b 116f\n"
"109:" // Height 4: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 116f\n"
"ldr s16, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
"b 116f\n"
"110:" // Height 4: Partial accumulate: partial_2_8
"tbz x11, #1, 111f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
"b 116f\n"
"111:" // Height 4: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 116f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
"b 116f\n"
"112:" // Height 4: Partial accumulate: partial_4_0
"tbz x11, #2, 114f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"tbz x11, #1, 113f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
"b 116f\n"
"113:" // Height 4: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 116f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
"b 116f\n"
"114:" // Height 4: Partial accumulate: partial_2_0
"tbz x11, #1, 115f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
"b 116f\n"
"115:" // Height 4: Partial accumulate: partial_1_0
"ldr s9, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
"116:" // Height 4: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 118f\n"
@@ -1350,18 +1348,18 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q10, [x9, #0x10]\n"
"ldr q11, [x9, #0x20]\n"
"ldr q16, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"118:" // Height 4: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -1401,8 +1399,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"121:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 122f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1437,28 +1435,28 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
"sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"cmp x27, #0x8\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ld1 { v1.4s }, [x25], #0x10\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
- "ld1 { v3.4s }, [x23], #0x10\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
@@ -1479,18 +1477,18 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
"sub x27, x27, #0x4\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
@@ -1551,8 +1549,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
"ldr q25, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ "add x10, x10, #0x80\n"
".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
@@ -1562,23 +1560,23 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 121b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v6.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x26, #0x0]\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
@@ -1586,9 +1584,9 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 130f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v26.4s }, [x21]\n"
"ld1r { v25.4s }, [x20]\n"
"fmin v6.4s, v6.4s, v26.4s\n"
"fmin v12.4s, v12.4s, v26.4s\n"
@@ -1628,95 +1626,95 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"tbz x11, #3, 134f\n"
"st1 { v6.4s }, [x9], #0x10\n"
"st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
"tbz x11, #2, 132f\n"
"st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
"tbz x11, #1, 131f\n"
"str d14, [x9], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d19, [x24], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
"st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v19.s }[2], [x24]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
"b 138f\n"
"131:" // Height 4: Partial direct writeback: partial_1_12
"tbz x11, #0, 138f\n"
"str s14, [x9, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s19, [x24, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
"b 138f\n"
"132:" // Height 4: Partial direct writeback: partial_2_8
"tbz x11, #1, 133f\n"
"str d13, [x9], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d18, [x24], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
"st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v18.s }[2], [x24]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
"b 138f\n"
"133:" // Height 4: Partial direct writeback: partial_1_8
"tbz x11, #0, 138f\n"
"str s13, [x9, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s18, [x24, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
"b 138f\n"
"134:" // Height 4: Partial direct writeback: partial_4_0
"tbz x11, #2, 136f\n"
"st1 { v6.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
"tbz x11, #1, 135f\n"
"str d12, [x9], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d17, [x24], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
"st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v17.s }[2], [x24]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
"b 138f\n"
"135:" // Height 4: Partial direct writeback: partial_1_4
"tbz x11, #0, 138f\n"
"str s12, [x9, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s17, [x24, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
"b 138f\n"
"136:" // Height 4: Partial direct writeback: partial_2_0
"tbz x11, #1, 137f\n"
"str d6, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
"st1 { v6.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v15.s }[2], [x25]\n"
- "st1 { v16.s }[2], [x24]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
"b 138f\n"
"137:" // Height 4: Partial direct writeback: partial_1_0
"str s6, [x9, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s15, [x25, #0x0]\n"
- "str s16, [x24, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
"138:" // Height 4: Partial direct writeback: Done
"b 140f\n"
"139:" // Height 4: Full writeback
@@ -1725,40 +1723,40 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"str q13, [x9, #0x20]\n"
"str q14, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q15, [x25, #0x0]\n"
- "str q20, [x25, #0x10]\n"
- "str q21, [x25, #0x20]\n"
- "str q22, [x25, #0x30]\n"
- "str q16, [x24, #0x0]\n"
- "str q17, [x24, #0x10]\n"
- "str q18, [x24, #0x20]\n"
- "str q19, [x24, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
"140:" // Height 4: Writeback done
"subs x11, x11, #0x10\n"
"bgt 107b\n"
"b 212f\n"
"141:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"142:" // Height 5: Column loop
"cbz x12, 143f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -1781,128 +1779,128 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"143:" // Height 5: no bias
"tbz %x[flags], #0, 154f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 152f\n"
"tbz x11, #3, 147f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #2, 145f\n"
"ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v27.4s }, [x23], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
"tbz x11, #1, 144f\n"
"ldr d16, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d6, [x23], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
"ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v6.s }[2], [x23]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
"b 151f\n"
"144:" // Height 5: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 151f\n"
"ldr s16, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s6, [x23, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
"b 151f\n"
"145:" // Height 5: Partial accumulate: partial_2_8
"tbz x11, #1, 146f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
"b 151f\n"
"146:" // Height 5: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 151f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
"b 151f\n"
"147:" // Height 5: Partial accumulate: partial_4_0
"tbz x11, #2, 149f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #1, 148f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
"b 151f\n"
"148:" // Height 5: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 151f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
"b 151f\n"
"149:" // Height 5: Partial accumulate: partial_2_0
"tbz x11, #1, 150f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 151f\n"
"150:" // Height 5: Partial accumulate: partial_1_0
"ldr s9, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"151:" // Height 5: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 153f\n"
@@ -1911,22 +1909,22 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q10, [x9, #0x10]\n"
"ldr q11, [x9, #0x20]\n"
"ldr q16, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q25, [x23, #0x0]\n"
- "ldr q26, [x23, #0x10]\n"
- "ldr q27, [x23, #0x20]\n"
- "ldr q6, [x23, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
"153:" // Height 5: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -1982,8 +1980,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"156:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 157f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2022,31 +2020,31 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
"sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
"cmp x27, #0x8\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ld1 { v1.4s }, [x25], #0x10\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
- "ld1 { v3.4s }, [x23], #0x10\n"
- ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
"ldr q5, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
"ldr q6, [x10, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e45ec9d // bfmmla v29.4s, v4.8h, v5.8h\n"
"ldr q5, [x10, #0x50]\n"
".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
@@ -2075,22 +2073,22 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
"sub x27, x27, #0x4\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
- ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
"ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
"ldr q1, [x10, #0x30]\n"
".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
"ldr q3, [x10, #0x40]\n"
@@ -2140,16 +2138,16 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q5, [x10, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec9c // bfmmla v28.4s, v4.8h, v5.8h\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
"ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec9c // bfmmla v28.4s, v4.8h, v5.8h\n"
"ldr q1, [x10, #0x30]\n"
".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
@@ -2167,8 +2165,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
"ldr q1, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ "add x10, x10, #0x80\n"
".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
@@ -2180,27 +2178,27 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 156b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v6.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
@@ -2210,9 +2208,9 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"uzp1 v26.2d, v26.2d, v30.2d\n"
"uzp1 v27.2d, v27.2d, v31.2d\n"
"tbz %x[flags], #1, 165f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v6.4s, v6.4s, v1.4s\n"
"fmin v12.4s, v12.4s, v1.4s\n"
@@ -2260,111 +2258,111 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"tbz x11, #3, 169f\n"
"st1 { v6.4s }, [x9], #0x10\n"
"st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #2, 167f\n"
"st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #1, 166f\n"
"str d14, [x9], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d19, [x24], #0x8\n"
- "str d27, [x23], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
"st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v19.s }[2], [x24]\n"
- "st1 { v27.s }[2], [x23]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
"b 173f\n"
"166:" // Height 5: Partial direct writeback: partial_1_12
"tbz x11, #0, 173f\n"
"str s14, [x9, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s19, [x24, #0x0]\n"
- "str s27, [x23, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
"b 173f\n"
"167:" // Height 5: Partial direct writeback: partial_2_8
"tbz x11, #1, 168f\n"
"str d13, [x9], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d18, [x24], #0x8\n"
- "str d26, [x23], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
"st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v18.s }[2], [x24]\n"
- "st1 { v26.s }[2], [x23]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
"b 173f\n"
"168:" // Height 5: Partial direct writeback: partial_1_8
"tbz x11, #0, 173f\n"
"str s13, [x9, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s18, [x24, #0x0]\n"
- "str s26, [x23, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
"b 173f\n"
"169:" // Height 5: Partial direct writeback: partial_4_0
"tbz x11, #2, 171f\n"
"st1 { v6.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x11, #1, 170f\n"
"str d12, [x9], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d17, [x24], #0x8\n"
- "str d25, [x23], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
"st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v17.s }[2], [x24]\n"
- "st1 { v25.s }[2], [x23]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 173f\n"
"170:" // Height 5: Partial direct writeback: partial_1_4
"tbz x11, #0, 173f\n"
"str s12, [x9, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s17, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 173f\n"
"171:" // Height 5: Partial direct writeback: partial_2_0
"tbz x11, #1, 172f\n"
"str d6, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
"st1 { v6.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v15.s }[2], [x25]\n"
- "st1 { v16.s }[2], [x24]\n"
- "st1 { v24.s }[2], [x23]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 173f\n"
"172:" // Height 5: Partial direct writeback: partial_1_0
"str s6, [x9, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s15, [x25, #0x0]\n"
- "str s16, [x24, #0x0]\n"
- "str s24, [x23, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"173:" // Height 5: Partial direct writeback: Done
"b 175f\n"
"174:" // Height 5: Full writeback
@@ -2373,48 +2371,47 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"str q13, [x9, #0x20]\n"
"str q14, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q15, [x25, #0x0]\n"
- "str q20, [x25, #0x10]\n"
- "str q21, [x25, #0x20]\n"
- "str q22, [x25, #0x30]\n"
- "str q16, [x24, #0x0]\n"
- "str q17, [x24, #0x10]\n"
- "str q18, [x24, #0x20]\n"
- "str q19, [x24, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"175:" // Height 5: Writeback done
"subs x11, x11, #0x10\n"
"bgt 142b\n"
"b 212f\n"
"176:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"177:" // Height 6: Column loop
"cbz x12, 178f\n"
"ldr q8, [x12, #0x0]\n"
"ldr q9, [x12, #0x10]\n"
- "ldr q10, [x12, #0x20]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"zip2 v12.2d, v8.2d, v8.2d\n"
"zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"zip2 v13.2d, v9.2d, v9.2d\n"
"zip1 v9.2d, v9.2d, v9.2d\n"
"zip2 v14.2d, v10.2d, v10.2d\n"
"zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
"zip2 v15.2d, v11.2d, v11.2d\n"
"zip1 v11.2d, v11.2d, v11.2d\n"
"mov v16.16b, v8.16b\n"
@@ -2437,145 +2434,145 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"178:" // Height 6: no bias
"tbz %x[flags], #0, 189f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 187f\n"
"tbz x11, #3, 182f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x26], #0x10\n"
- "ld1 { v18.4s }, [x25], #0x10\n"
- "ld1 { v21.4s }, [x24], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x22], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x11, #2, 180f\n"
"ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x26], #0x10\n"
- "ld1 { v19.4s }, [x25], #0x10\n"
- "ld1 { v22.4s }, [x24], #0x10\n"
- "ld1 { v27.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x22], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x11, #1, 179f\n"
"ldr d16, [x9], #0x8\n"
- "ldr d15, [x26], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"mov x20, #0x38\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d6, [x23], #0x8\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
"ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v6.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 186f\n"
"179:" // Height 6: Partial accumulate: partial_1_12
"mov x20, #0x30\n"
"tbz x11, #0, 186f\n"
"ldr s16, [x9, #0x0]\n"
- "ldr s15, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s6, [x23, #0x0]\n"
- "ldr s31, [x22, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 186f\n"
"180:" // Height 6: Partial accumulate: partial_2_8
"tbz x11, #1, 181f\n"
"ldr d11, [x9], #0x8\n"
- "ldr d14, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"mov x20, #0x28\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
"ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x26]\n"
- "ld1 { v19.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 186f\n"
"181:" // Height 6: Partial accumulate: partial_1_8
"mov x20, #0x20\n"
"tbz x11, #0, 186f\n"
"ldr s11, [x9, #0x0]\n"
- "ldr s14, [x26, #0x0]\n"
- "ldr s19, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 186f\n"
"182:" // Height 6: Partial accumulate: partial_4_0
"tbz x11, #2, 184f\n"
"ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x26], #0x10\n"
- "ld1 { v17.4s }, [x25], #0x10\n"
- "ld1 { v20.4s }, [x24], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x11, #1, 183f\n"
"ldr d10, [x9], #0x8\n"
- "ldr d13, [x26], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"mov x20, #0x18\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
"ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 186f\n"
"183:" // Height 6: Partial accumulate: partial_1_4
"mov x20, #0x10\n"
"tbz x11, #0, 186f\n"
"ldr s10, [x9, #0x0]\n"
- "ldr s13, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 186f\n"
"184:" // Height 6: Partial accumulate: partial_2_0
"tbz x11, #1, 185f\n"
"ldr d9, [x9], #0x8\n"
- "ldr d12, [x26], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"mov x20, #0x8\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
"ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x26]\n"
- "ld1 { v17.s }[2], [x25]\n"
- "ld1 { v20.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 186f\n"
"185:" // Height 6: Partial accumulate: partial_1_0
"ldr s9, [x9, #0x0]\n"
- "ldr s12, [x26, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
"mov x20, #0x0\n"
- "ldr s17, [x25, #0x0]\n"
- "ldr s20, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s28, [x22, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"186:" // Height 6: Partial accumulate: Done
"sub x9, x9, x20\n"
"b 188f\n"
@@ -2584,26 +2581,26 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr q10, [x9, #0x10]\n"
"ldr q11, [x9, #0x20]\n"
"ldr q16, [x9, #0x30]\n"
- "ldr q12, [x26, #0x0]\n"
- "ldr q13, [x26, #0x10]\n"
- "ldr q14, [x26, #0x20]\n"
- "ldr q15, [x26, #0x30]\n"
- "ldr q17, [x25, #0x0]\n"
- "ldr q18, [x25, #0x10]\n"
- "ldr q19, [x25, #0x20]\n"
- "ldr q24, [x25, #0x30]\n"
- "ldr q20, [x24, #0x0]\n"
- "ldr q21, [x24, #0x10]\n"
- "ldr q22, [x24, #0x20]\n"
- "ldr q23, [x24, #0x30]\n"
- "ldr q25, [x23, #0x0]\n"
- "ldr q26, [x23, #0x10]\n"
- "ldr q27, [x23, #0x20]\n"
- "ldr q6, [x23, #0x30]\n"
- "ldr q28, [x22, #0x0]\n"
- "ldr q29, [x22, #0x10]\n"
- "ldr q30, [x22, #0x20]\n"
- "ldr q31, [x22, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"188:" // Height 6: MMLA fixup
"zip1 v8.2d, v9.2d, v12.2d\n"
"zip2 v12.2d, v9.2d, v12.2d\n"
@@ -2659,8 +2656,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"mov x28, #0x0\n"
"191:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 192f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2703,36 +2700,36 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
"sub x27, x27, #0x4\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
"cmp x27, #0x8\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
- "ld1 { v3.4s }, [x23], #0x10\n"
".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ld1 { v5.4s }, [x21], #0x10\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q5, [x10, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec9d // bfmmla v29.4s, v4.8h, v5.8h\n"
"ldr q7, [x10, #0x50]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "ld1 { v5.4s }, [x21], #0x10\n"
".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
@@ -2760,24 +2757,24 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"sub x27, x27, #0x4\n"
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
"ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
"ldr q1, [x10, #0x30]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
"ldr q3, [x10, #0x40]\n"
".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
@@ -2831,13 +2828,13 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
"ldr q1, [x10, #0x30]\n"
@@ -2870,31 +2867,31 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"cmp x28, x20\n"
"bne 191b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 v6.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"uzp1 v23.2d, v24.2d, v28.2d\n"
@@ -2906,9 +2903,9 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"uzp1 v30.2d, v27.2d, v31.2d\n"
"uzp2 v27.2d, v27.2d, v31.2d\n"
"tbz %x[flags], #1, 200f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x21]\n"
"ld1r { v0.4s }, [x20]\n"
"fmin v6.4s, v6.4s, v1.4s\n"
"fmin v12.4s, v12.4s, v1.4s\n"
@@ -2964,127 +2961,127 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"tbz x11, #3, 204f\n"
"st1 { v6.4s }, [x9], #0x10\n"
"st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v9.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v17.4s }, [x24], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x23], #0x10\n"
- "st1 { v24.4s }, [x22], #0x10\n"
- "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
"tbz x11, #2, 202f\n"
"st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x26], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v18.4s }, [x24], #0x10\n"
- "st1 { v29.4s }, [x23], #0x10\n"
- "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
"tbz x11, #1, 201f\n"
"str d14, [x9], #0x8\n"
- "str d11, [x26], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d19, [x24], #0x8\n"
- "str d30, [x23], #0x8\n"
- "str d27, [x22], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
"st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x26]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v19.s }[2], [x24]\n"
- "st1 { v30.s }[2], [x23]\n"
- "st1 { v27.s }[2], [x22]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
"b 208f\n"
"201:" // Height 6: Partial direct writeback: partial_1_12
"tbz x11, #0, 208f\n"
"str s14, [x9, #0x0]\n"
- "str s11, [x26, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s19, [x24, #0x0]\n"
- "str s30, [x23, #0x0]\n"
- "str s27, [x22, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
"b 208f\n"
"202:" // Height 6: Partial direct writeback: partial_2_8
"tbz x11, #1, 203f\n"
"str d13, [x9], #0x8\n"
- "str d10, [x26], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d18, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "str d26, [x22], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
"st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x26]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v18.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
- "st1 { v26.s }[2], [x22]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
"b 208f\n"
"203:" // Height 6: Partial direct writeback: partial_1_8
"tbz x11, #0, 208f\n"
"str s13, [x9, #0x0]\n"
- "str s10, [x26, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s18, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
- "str s26, [x22, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
"b 208f\n"
"204:" // Height 6: Partial direct writeback: partial_4_0
"tbz x11, #2, 206f\n"
"st1 { v6.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x26], #0x10\n"
- "st1 { v15.4s }, [x25], #0x10\n"
- "st1 { v16.4s }, [x24], #0x10\n"
- "st1 { v23.4s }, [x23], #0x10\n"
- "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
"tbz x11, #1, 205f\n"
"str d12, [x9], #0x8\n"
- "str d9, [x26], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d17, [x24], #0x8\n"
- "str d28, [x23], #0x8\n"
- "str d25, [x22], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d28, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
"st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x26]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v17.s }[2], [x24]\n"
- "st1 { v28.s }[2], [x23]\n"
- "st1 { v25.s }[2], [x22]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
"b 208f\n"
"205:" // Height 6: Partial direct writeback: partial_1_4
"tbz x11, #0, 208f\n"
"str s12, [x9, #0x0]\n"
- "str s9, [x26, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s17, [x24, #0x0]\n"
- "str s28, [x23, #0x0]\n"
- "str s25, [x22, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s28, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
"b 208f\n"
"206:" // Height 6: Partial direct writeback: partial_2_0
"tbz x11, #1, 207f\n"
"str d6, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
- "str d24, [x22], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
"st1 { v6.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x26]\n"
- "st1 { v15.s }[2], [x25]\n"
- "st1 { v16.s }[2], [x24]\n"
- "st1 { v23.s }[2], [x23]\n"
- "st1 { v24.s }[2], [x22]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
"b 208f\n"
"207:" // Height 6: Partial direct writeback: partial_1_0
"str s6, [x9, #0x0]\n"
- "str s8, [x26, #0x0]\n"
- "str s15, [x25, #0x0]\n"
- "str s16, [x24, #0x0]\n"
- "str s23, [x23, #0x0]\n"
- "str s24, [x22, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
"208:" // Height 6: Partial direct writeback: Done
"b 210f\n"
"209:" // Height 6: Full writeback
@@ -3093,26 +3090,26 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"str q13, [x9, #0x20]\n"
"str q14, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "str q8, [x26, #0x0]\n"
- "str q9, [x26, #0x10]\n"
- "str q10, [x26, #0x20]\n"
- "str q11, [x26, #0x30]\n"
- "str q15, [x25, #0x0]\n"
- "str q20, [x25, #0x10]\n"
- "str q21, [x25, #0x20]\n"
- "str q22, [x25, #0x30]\n"
- "str q16, [x24, #0x0]\n"
- "str q17, [x24, #0x10]\n"
- "str q18, [x24, #0x20]\n"
- "str q19, [x24, #0x30]\n"
- "str q23, [x23, #0x0]\n"
- "str q28, [x23, #0x10]\n"
- "str q29, [x23, #0x20]\n"
- "str q30, [x23, #0x30]\n"
- "str q24, [x22, #0x0]\n"
- "str q25, [x22, #0x10]\n"
- "str q26, [x22, #0x20]\n"
- "str q27, [x22, #0x30]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q28, [x22, #0x10]\n"
+ "str q29, [x22, #0x20]\n"
+ "str q30, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
"210:" // Height 6: Writeback done
"subs x11, x11, #0x10\n"
"bgt 177b\n"
@@ -3128,8 +3125,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"212:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index becd43516f..857de3c6d0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -71,7 +71,7 @@ public:
return false;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 16, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
index 4d23660942..6ef2fd5d34 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
@@ -45,18 +45,18 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -84,133 +84,133 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"cmp %x[M], #0x2\n"
"bgt 61f\n"
"beq 31f\n"
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v15.16b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"3:" // Height 1: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 6f\n"
+ "ldr x9, [x20, #0x0]\n"
+ "cbnz x11, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
+ "mov x9, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 11f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d21, [x14, #0x70]\n"
- "ldr x20, [x14, #0x78]\n"
+ "ldr d21, [x12, #0x70]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr d20, [x14, #0x80]\n"
+ "ldr d20, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr d26, [x14, #0x90]\n"
+ "ldr d26, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr d25, [x14, #0xa0]\n"
+ "ldr d25, [x12, #0xa0]\n"
"mov v21.d[1], x20\n"
- "ldr x20, [x14, #0x88]\n"
+ "ldr x20, [x12, #0x88]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr d24, [x14, #0xb0]\n"
+ "ldr d24, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr d23, [x14, #0xc0]\n"
+ "ldr d23, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr d22, [x14, #0xd0]\n"
+ "ldr d22, [x12, #0xd0]\n"
".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n"
- "ldr d21, [x14, #0xe0]\n"
+ "ldr d21, [x12, #0xe0]\n"
"mov v20.d[1], x20\n"
- "ldr x22, [x14, #0x98]\n"
- "add x10, x10, #0x10\n"
- "ldr x21, [x14, #0xa8]\n"
+ "ldr x20, [x12, #0x98]\n"
+ "mov v26.d[1], x20\n"
+ "ldr x20, [x12, #0xa8]\n"
+ "mov v25.d[1], x20\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "mov v24.d[1], x20\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
- "ldr d20, [x14, #0xf0]\n"
- "ldr x20, [x14, #0xb8]\n"
- "mov v26.d[1], x22\n"
- "mov v25.d[1], x21\n"
- "ldr x23, [x14, #0xc8]\n"
- "ldr x22, [x14, #0xd8]\n"
+ "ldr d20, [x12, #0xf0]\n"
".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
- "mov v24.d[1], x20\n"
- "ldr x21, [x14, #0xe8]\n"
- "ldr x20, [x14, #0xf8]\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
+ "ldr x20, [x12, #0xf8]\n"
"mov v23.d[1], x23\n"
"mov v22.d[1], x22\n"
- "add x14, x14, #0x100\n"
+ "add x9, x9, #0x10\n"
"mov v21.d[1], x21\n"
- ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
+ "add x12, x12, #0x100\n"
"mov v20.d[1], x20\n"
+ ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q4, [x14, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q4, [x12, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q21, [x14, #0x70]\n"
+ "ldr q21, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q20, [x14, #0x80]\n"
+ "ldr q20, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q26, [x14, #0x90]\n"
+ "ldr q26, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q25, [x14, #0xa0]\n"
+ "ldr q25, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q24, [x14, #0xb0]\n"
+ "ldr q24, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q23, [x14, #0xc0]\n"
+ "ldr q23, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q22, [x14, #0xd0]\n"
+ "ldr q22, [x12, #0xd0]\n"
".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n"
- "ldr q21, [x14, #0xe0]\n"
+ "ldr q21, [x12, #0xe0]\n"
".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
- "ldr q20, [x14, #0xf0]\n"
+ "ldr q20, [x12, #0xf0]\n"
".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
@@ -218,54 +218,54 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"tbnz %x[flags], #31, 10f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"10:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"11:" // Height 1: Multiply loop: Main loop skip
- "cbz x11, 18f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 18f\n"
+ "cmp x10, #0x4\n"
"blt 14f\n"
"12:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
"tbnz %x[flags], #31, 13f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q23, [x14, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q22, [x14, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q21, [x14, #0x20]\n"
- "ldr q20, [x14, #0x30]\n"
- ".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q22, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q21, [x12, #0x20]\n"
+ ".inst 0x4f80e290 // sdot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
- "cbz x11, 18f\n"
- "tbz x11, #1, 15f\n"
- "ldr h0, [x10], #0x2\n"
- "tbz x11, #0, 16f\n"
- "ld1 { v0.b }[2], [x10]\n"
+ "cbz x10, 18f\n"
+ "tbz x10, #1, 15f\n"
+ "ldr h0, [x9], #0x2\n"
+ "tbz x10, #0, 16f\n"
+ "ld1 { v0.b }[2], [x9]\n"
"b 16f\n"
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
"16:" // Height 1: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 17f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q23, [x14, #0x0]\n"
- "ldr q22, [x14, #0x10]\n"
- "ldr q21, [x14, #0x20]\n"
- "ldr q20, [x14, #0x30]\n"
- ".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n"
- ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
- ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x0]\n"
+ ".inst 0x4f80e290 // sdot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x10]\n"
+ ".inst 0x4f80e291 // sdot v17.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x20]\n"
+ ".inst 0x4f80e292 // sdot v18.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
"18:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 4b\n"
"prfm pstl1keep, [x13, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
@@ -276,28 +276,28 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"addp v11.4s, v11.4s, v11.4s\n"
"mul v11.4s, v11.4s, v20.4s\n"
"19:" // Height 1: skip row sum fixup
- "ldr q24, [x16, #0x0]\n"
+ "ldr q23, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q23, [x16, #0x10]\n"
+ "ldr q22, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q22, [x16, #0x20]\n"
+ "ldr q21, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q21, [x16, #0x30]\n"
+ "ldr q20, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
+ "add v16.4s, v16.4s, v23.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v20.4s }, [x20]\n"
- "add v16.4s, v16.4s, v24.4s\n"
- "add v17.4s, v17.4s, v23.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"ld1r { v0.4s }, [x20]\n"
- "add v18.4s, v18.4s, v22.4s\n"
- "add v19.4s, v19.4s, v21.4s\n"
- "add x16, x16, #0x40\n"
"sqrdmulh v16.4s, v16.4s, v20.4s\n"
"sqrdmulh v17.4s, v17.4s, v20.4s\n"
"sqrdmulh v18.4s, v18.4s, v20.4s\n"
"sqrdmulh v19.4s, v19.4s, v20.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 20f\n"
"and v23.16b, v16.16b, v0.16b\n"
"and v22.16b, v17.16b, v0.16b\n"
@@ -317,67 +317,67 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v22.4s }, [x20]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v20.4s\n"
+ "add v17.4s, v17.4s, v20.4s\n"
+ "add v18.4s, v18.4s, v20.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v21.4s }, [x21]\n"
- "cmp x15, #0x10\n"
"ld1r { v20.4s }, [x20]\n"
- "add v16.4s, v16.4s, v22.4s\n"
- "add v17.4s, v17.4s, v22.4s\n"
- "add v18.4s, v18.4s, v22.4s\n"
- "add v19.4s, v19.4s, v22.4s\n"
- "smin v16.4s, v16.4s, v21.4s\n"
- "smin v17.4s, v17.4s, v21.4s\n"
- "smin v18.4s, v18.4s, v21.4s\n"
- "smin v19.4s, v19.4s, v21.4s\n"
"smax v16.4s, v16.4s, v20.4s\n"
"smax v17.4s, v17.4s, v20.4s\n"
"smax v18.4s, v18.4s, v20.4s\n"
"smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 29f\n"
- "tbz x15, #3, 24f\n"
+ "tbz x14, #3, 24f\n"
"str d16, [x13], #0x8\n"
- "tbz x15, #2, 22f\n"
+ "tbz x14, #2, 22f\n"
"st1 { v16.s }[2], [x13], #0x4\n"
- "tbz x15, #1, 21f\n"
+ "tbz x14, #1, 21f\n"
"st1 { v16.h }[6], [x13], #0x2\n"
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[14], [x13]\n"
"b 28f\n"
"21:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[12], [x13]\n"
"b 28f\n"
"22:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x15, #1, 23f\n"
+ "tbz x14, #1, 23f\n"
"st1 { v16.h }[4], [x13], #0x2\n"
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[10], [x13]\n"
"b 28f\n"
"23:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[8], [x13]\n"
"b 28f\n"
"24:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x15, #2, 26f\n"
+ "tbz x14, #2, 26f\n"
"str s16, [x13], #0x4\n"
- "tbz x15, #1, 25f\n"
+ "tbz x14, #1, 25f\n"
"st1 { v16.h }[2], [x13], #0x2\n"
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[6], [x13]\n"
"b 28f\n"
"25:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[4], [x13]\n"
"b 28f\n"
"26:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x15, #1, 27f\n"
+ "tbz x14, #1, 27f\n"
"str h16, [x13], #0x2\n"
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[2], [x13]\n"
"b 28f\n"
"27:" // Height 1: Partial direct writeback: partial_1_0
@@ -388,18 +388,18 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"str q16, [x13, #0x0]\n"
"add x13, x13, #0x10\n"
"30:" // Height 1: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 2b\n"
"b 122f\n"
"31:" // Height 2
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v15.16b, #0x1\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -410,80 +410,80 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"33:" // Height 2: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
"tbz %x[flags], #3, 35f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "cbnz x12, 36f\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x11, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
"add x9, x9, x20\n"
+ "add x28, x28, x20\n"
"b 36f\n"
"35:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x9, x10, x21\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
"36:" // Height 2: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 41f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 39f\n"
"37:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr d25, [x14, #0x70]\n"
+ "ldr d25, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr x23, [x14, #0x88]\n"
+ "mov v25.d[1], x20\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr d24, [x14, #0x80]\n"
+ "ldr d24, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "mov v25.d[1], x20\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr d30, [x14, #0x90]\n"
+ "ldr d30, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr x22, [x14, #0x98]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr d29, [x14, #0xa0]\n"
- "ldr x21, [x14, #0xa8]\n"
+ "ldr d29, [x12, #0xa0]\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr d28, [x14, #0xb0]\n"
- "ldr x20, [x14, #0xb8]\n"
+ "ldr d28, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr d27, [x14, #0xc0]\n"
+ "ldr d27, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
"mov v24.d[1], x23\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr d26, [x14, #0xd0]\n"
+ "ldr d26, [x12, #0xd0]\n"
".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n"
"mov v30.d[1], x22\n"
".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n"
- "ldr d25, [x14, #0xe0]\n"
+ "ldr d25, [x12, #0xe0]\n"
"mov v29.d[1], x21\n"
- "ldr x23, [x14, #0xc8]\n"
+ "ldr x23, [x12, #0xc8]\n"
"mov v28.d[1], x20\n"
- "ldr x22, [x14, #0xd8]\n"
- "ldr x21, [x14, #0xe8]\n"
+ "ldr x22, [x12, #0xd8]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n"
".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n"
- "ldr d24, [x14, #0xf0]\n"
- "ldr x20, [x14, #0xf8]\n"
+ "ldr d24, [x12, #0xf0]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
"mov v27.d[1], x23\n"
@@ -494,9 +494,9 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n"
"mov v24.d[1], x20\n"
".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n"
- "add x10, x10, #0x10\n"
"add x9, x9, #0x10\n"
- "add x14, x14, #0x100\n"
+ "add x28, x28, #0x10\n"
+ "add x12, x12, #0x100\n"
".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n"
".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n"
".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n"
@@ -509,53 +509,53 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"38:" // Height 2: Multiply loop: unique 5: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"bge 37b\n"
"39:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q25, [x14, #0x70]\n"
+ "ldr q25, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q24, [x14, #0x80]\n"
+ "ldr q24, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q30, [x14, #0x90]\n"
+ "ldr q30, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q29, [x14, #0xa0]\n"
+ "ldr q29, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q28, [x14, #0xb0]\n"
+ "ldr q28, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q27, [x14, #0xc0]\n"
+ "ldr q27, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q26, [x14, #0xd0]\n"
+ "ldr q26, [x12, #0xd0]\n"
".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n"
".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n"
- "ldr q25, [x14, #0xe0]\n"
+ "ldr q25, [x12, #0xe0]\n"
".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n"
".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n"
- "ldr q24, [x14, #0xf0]\n"
+ "ldr q24, [x12, #0xf0]\n"
".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n"
@@ -573,29 +573,29 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"40:" // Height 2: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
"prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"41:" // Height 2: Multiply loop: Main loop skip
- "cbz x11, 48f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 48f\n"
+ "cmp x10, #0x4\n"
"blt 44f\n"
"42:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
"tbnz %x[flags], #31, 43f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q27, [x14, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q26, [x14, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q25, [x14, #0x20]\n"
- "ldr q24, [x14, #0x30]\n"
+ "ldr q27, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q26, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q25, [x12, #0x20]\n"
".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
@@ -603,44 +603,44 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"bge 42b\n"
"44:" // Height 2: Multiply loop: Skip odd blocks
- "cbz x11, 48f\n"
- "tbz x11, #1, 45f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "tbz x11, #0, 46f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x9]\n"
+ "cbz x10, 48f\n"
+ "tbz x10, #1, 45f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x10, #0, 46f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
"b 46f\n"
"45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
"46:" // Height 2: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 47f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q27, [x14, #0x0]\n"
- "ldr q26, [x14, #0x10]\n"
- "ldr q25, [x14, #0x20]\n"
- "ldr q24, [x14, #0x30]\n"
- ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n"
- ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
+ "ldr q24, [x12, #0x0]\n"
+ ".inst 0x4f80e310 // sdot v16.4s, v24.16b, v0.4b[0]\n"
+ "ldr q26, [x12, #0x10]\n"
+ ".inst 0x4f81e314 // sdot v20.4s, v24.16b, v1.4b[0]\n"
+ "ldr q25, [x12, #0x20]\n"
".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 34b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20\n"
"prfm pstl1keep, [x13, #0x0]\n"
- "add x24, x13, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
@@ -652,28 +652,28 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"mul v11.4s, v11.4s, v24.4s\n"
"mul v12.4s, v12.4s, v24.4s\n"
"49:" // Height 2: skip row sum fixup
- "ldr q28, [x16, #0x0]\n"
+ "ldr q27, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q27, [x16, #0x10]\n"
+ "ldr q26, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q26, [x16, #0x20]\n"
+ "ldr q25, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q25, [x16, #0x30]\n"
+ "ldr q24, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add v16.4s, v16.4s, v28.4s\n"
- "add v17.4s, v17.4s, v27.4s\n"
- "add v18.4s, v18.4s, v26.4s\n"
+ "add v16.4s, v16.4s, v27.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"ld1r { v24.4s }, [x20]\n"
- "add v19.4s, v19.4s, v25.4s\n"
- "add v20.4s, v20.4s, v28.4s\n"
- "add v21.4s, v21.4s, v27.4s\n"
- "add v22.4s, v22.4s, v26.4s\n"
- "add v23.4s, v23.4s, v25.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"ld1r { v0.4s }, [x20]\n"
@@ -685,31 +685,31 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"sqrdmulh v21.4s, v21.4s, v24.4s\n"
"sqrdmulh v22.4s, v22.4s, v24.4s\n"
"sqrdmulh v23.4s, v23.4s, v24.4s\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 50f\n"
"and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
"and v30.16b, v17.16b, v0.16b\n"
"and v29.16b, v18.16b, v0.16b\n"
"and v28.16b, v19.16b, v0.16b\n"
"and v27.16b, v20.16b, v0.16b\n"
"and v26.16b, v21.16b, v0.16b\n"
"and v25.16b, v22.16b, v0.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v24.16b, v23.16b, v0.16b\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v27.4s, v27.4s, #0x1f\n"
"sshr v26.4s, v26.4s, #0x1f\n"
"sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v24.4s\n"
- "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v17.4s, v17.4s, v30.4s\n"
"sqadd v18.4s, v18.4s, v29.4s\n"
"sqadd v19.4s, v19.4s, v28.4s\n"
"sqadd v20.4s, v20.4s, v27.4s\n"
"sqadd v21.4s, v21.4s, v26.4s\n"
"sqadd v22.4s, v22.4s, v25.4s\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
@@ -721,28 +721,27 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x20]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v24.4s\n"
+ "add v18.4s, v18.4s, v24.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v24.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v24.4s\n"
+ "smin v18.4s, v18.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v24.4s\n"
+ "smin v22.4s, v22.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v24.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v25.4s }, [x21]\n"
- "cmp x15, #0x10\n"
"ld1r { v24.4s }, [x20]\n"
- "add v16.4s, v16.4s, v26.4s\n"
- "add v17.4s, v17.4s, v26.4s\n"
- "add v18.4s, v18.4s, v26.4s\n"
- "add v19.4s, v19.4s, v26.4s\n"
- "add v20.4s, v20.4s, v26.4s\n"
- "add v21.4s, v21.4s, v26.4s\n"
- "add v22.4s, v22.4s, v26.4s\n"
- "add v23.4s, v23.4s, v26.4s\n"
- "smin v16.4s, v16.4s, v25.4s\n"
- "smin v17.4s, v17.4s, v25.4s\n"
- "smin v18.4s, v18.4s, v25.4s\n"
- "smin v19.4s, v19.4s, v25.4s\n"
- "smin v20.4s, v20.4s, v25.4s\n"
- "smin v21.4s, v21.4s, v25.4s\n"
- "smin v22.4s, v22.4s, v25.4s\n"
- "smin v23.4s, v23.4s, v25.4s\n"
"smax v16.4s, v16.4s, v24.4s\n"
"smax v17.4s, v17.4s, v24.4s\n"
"smax v18.4s, v18.4s, v24.4s\n"
@@ -755,87 +754,88 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
"uzp1 v17.8h, v22.8h, v23.8h\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v18.16b\n"
"uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 59f\n"
- "tbz x15, #3, 54f\n"
+ "tbz x14, #3, 54f\n"
"str d16, [x13], #0x8\n"
- "str d20, [x24], #0x8\n"
- "tbz x15, #2, 52f\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x14, #2, 52f\n"
"st1 { v16.s }[2], [x13], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "tbz x15, #1, 51f\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x14, #1, 51f\n"
"st1 { v16.h }[6], [x13], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "tbz x15, #0, 58f\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[14], [x13]\n"
- "st1 { v20.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x15, #0, 58f\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[12], [x13]\n"
- "st1 { v20.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x15, #1, 53f\n"
+ "tbz x14, #1, 53f\n"
"st1 { v16.h }[4], [x13], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "tbz x15, #0, 58f\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[10], [x13]\n"
- "st1 { v20.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x15, #0, 58f\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[8], [x13]\n"
- "st1 { v20.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x15, #2, 56f\n"
+ "tbz x14, #2, 56f\n"
"str s16, [x13], #0x4\n"
- "str s20, [x24], #0x4\n"
- "tbz x15, #1, 55f\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x14, #1, 55f\n"
"st1 { v16.h }[2], [x13], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "tbz x15, #0, 58f\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[6], [x13]\n"
- "st1 { v20.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x15, #0, 58f\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[4], [x13]\n"
- "st1 { v20.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x15, #1, 57f\n"
+ "tbz x14, #1, 57f\n"
"str h16, [x13], #0x2\n"
- "str h20, [x24], #0x2\n"
- "tbz x15, #0, 58f\n"
+ "str h20, [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[2], [x13]\n"
- "st1 { v20.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
"str b16, [x13, #0x0]\n"
- "str b20, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
"str q16, [x13, #0x0]\n"
"add x13, x13, #0x10\n"
- "str q20, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 32b\n"
"b 122f\n"
"61:" // Height 3
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v15.16b, #0x1\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -850,105 +850,105 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"63:" // Height 3: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
"tbz %x[flags], #3, 65f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x28, [x20, #0x10]\n"
- "cbnz x12, 66f\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "cbnz x11, 66f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
"add x9, x9, x20\n"
"add x28, x28, x20\n"
+ "add x27, x27, x20\n"
"b 66f\n"
"65:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x9, x10, x21\n"
+ "mov x9, %x[input_ptr]\n"
"add x28, x9, x21\n"
+ "add x27, x28, x21\n"
"66:" // Height 3: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 71f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x28, #0x0]\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 69f\n"
"67:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x23, [x14, #0x88]\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr d29, [x14, #0x70]\n"
+ "ldr d29, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr x22, [x14, #0x98]\n"
+ "mov v29.d[1], x20\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr x21, [x14, #0xa8]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr d28, [x14, #0x80]\n"
+ "ldr d28, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "mov v29.d[1], x20\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x14, #0xb8]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr d5, [x14, #0x90]\n"
+ "ldr d5, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
"mov v28.d[1], x23\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr x23, [x14, #0xc8]\n"
+ "mov v5.d[1], x22\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr d4, [x14, #0xa0]\n"
+ "ldr d4, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v5.d[1], x22\n"
+ "mov v4.d[1], x21\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr x22, [x14, #0xd8]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr d3, [x14, #0xb0]\n"
+ "ldr d3, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v4.d[1], x21\n"
+ "mov v3.d[1], x20\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr x21, [x14, #0xe8]\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr d31, [x14, #0xc0]\n"
+ "ldr d31, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "mov v3.d[1], x20\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr x20, [x14, #0xf8]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr d30, [x14, #0xd0]\n"
+ "ldr d30, [x12, #0xd0]\n"
".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n"
"mov v31.d[1], x23\n"
".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n"
- "add x10, x10, #0x10\n"
+ "mov v30.d[1], x22\n"
".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n"
- "ldr d29, [x14, #0xe0]\n"
+ "ldr d29, [x12, #0xe0]\n"
".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
- "mov v30.d[1], x22\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n"
"add x9, x9, #0x10\n"
".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n"
- "ldr d28, [x14, #0xf0]\n"
+ "ldr d28, [x12, #0xf0]\n"
".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n"
- "mov v29.d[1], x21\n"
+ "mov v28.d[1], x20\n"
".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n"
"add x28, x28, #0x10\n"
".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n"
- "mov v28.d[1], x20\n"
+ "add x27, x27, #0x10\n"
".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n"
".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n"
".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n"
@@ -971,65 +971,65 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"68:" // Height 3: Multiply loop: unique 9: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q2, [x28, #0x0]\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"prfm pldl1keep, [x9, #0x80]\n"
"prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 67b\n"
"69:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q29, [x14, #0x70]\n"
+ "ldr q29, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "add x9, x9, #0x10\n"
- ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
"add x28, x28, #0x10\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "add x27, x27, #0x10\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q28, [x14, #0x80]\n"
+ "ldr q28, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q5, [x14, #0x90]\n"
+ "ldr q5, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q4, [x14, #0xa0]\n"
+ "ldr q4, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q3, [x14, #0xb0]\n"
+ "ldr q3, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q31, [x14, #0xc0]\n"
+ "ldr q31, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q30, [x14, #0xd0]\n"
+ "ldr q30, [x12, #0xd0]\n"
".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n"
".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n"
".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n"
- "ldr q29, [x14, #0xe0]\n"
+ "ldr q29, [x12, #0xe0]\n"
".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n"
".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n"
- "ldr q28, [x14, #0xf0]\n"
+ "ldr q28, [x12, #0xf0]\n"
".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n"
".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n"
".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n"
@@ -1055,32 +1055,32 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"70:" // Height 3: Multiply loop: unique 10: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
"prfm pldl1keep, [x9, #0x80]\n"
"prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"71:" // Height 3: Multiply loop: Main loop skip
- "cbz x11, 78f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 78f\n"
+ "cmp x10, #0x4\n"
"blt 74f\n"
"72:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x28], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
"tbnz %x[flags], #31, 73f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q31, [x14, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q30, [x14, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q29, [x14, #0x20]\n"
- "ldr q28, [x14, #0x30]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q30, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q29, [x12, #0x20]\n"
".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
@@ -1092,36 +1092,36 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n"
"bge 72b\n"
"74:" // Height 3: Multiply loop: Skip odd blocks
- "cbz x11, 78f\n"
- "tbz x11, #1, 75f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x28], #0x2\n"
- "tbz x11, #0, 76f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x28]\n"
+ "cbz x10, 78f\n"
+ "tbz x10, #1, 75f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "tbz x10, #0, 76f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
"b 76f\n"
"75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x28, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
"76:" // Height 3: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 77f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q31, [x14, #0x0]\n"
- "ldr q30, [x14, #0x10]\n"
- "ldr q29, [x14, #0x20]\n"
- "ldr q28, [x14, #0x30]\n"
- ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n"
- ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
- ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x0]\n"
+ ".inst 0x4f80e390 // sdot v16.4s, v28.16b, v0.4b[0]\n"
+ "ldr q30, [x12, #0x10]\n"
+ ".inst 0x4f81e394 // sdot v20.4s, v28.16b, v1.4b[0]\n"
+ "ldr q29, [x12, #0x20]\n"
+ ".inst 0x4f82e398 // sdot v24.4s, v28.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n"
@@ -1131,15 +1131,15 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n"
"78:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 64b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x13, #0x0]\n"
- "add x24, x13, x20\n"
- "add x23, x24, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
@@ -1154,13 +1154,13 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"mul v12.4s, v12.4s, v28.4s\n"
"mul v13.4s, v13.4s, v28.4s\n"
"79:" // Height 3: skip row sum fixup
- "ldr q31, [x16, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q30, [x16, #0x10]\n"
+ "ldr q30, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q29, [x16, #0x20]\n"
+ "ldr q29, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q28, [x16, #0x30]\n"
+ "ldr q28, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1183,11 +1183,10 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"add v26.4s, v26.4s, v29.4s\n"
"add v27.4s, v27.4s, v28.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v28.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"ld1r { v0.4s }, [x20]\n"
- "add x16, x16, #0x40\n"
"sqrdmulh v16.4s, v16.4s, v28.4s\n"
"sqrdmulh v17.4s, v17.4s, v28.4s\n"
"sqrdmulh v18.4s, v18.4s, v28.4s\n"
@@ -1200,38 +1199,39 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 80f\n"
"and v1.16b, v16.16b, v0.16b\n"
"and v31.16b, v17.16b, v0.16b\n"
"and v30.16b, v18.16b, v0.16b\n"
"and v29.16b, v19.16b, v0.16b\n"
"and v28.16b, v20.16b, v0.16b\n"
- "and v3.16b, v21.16b, v0.16b\n"
- "and v2.16b, v22.16b, v0.16b\n"
"sshr v1.4s, v1.4s, #0x1f\n"
"sshr v31.4s, v31.4s, #0x1f\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v1.4s\n"
"sqadd v17.4s, v17.4s, v31.4s\n"
"sqadd v18.4s, v18.4s, v30.4s\n"
"sqadd v19.4s, v19.4s, v29.4s\n"
"sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
"and v1.16b, v23.16b, v0.16b\n"
"and v31.16b, v24.16b, v0.16b\n"
"and v30.16b, v25.16b, v0.16b\n"
"and v29.16b, v26.16b, v0.16b\n"
"and v28.16b, v27.16b, v0.16b\n"
- "sqadd v21.4s, v21.4s, v3.4s\n"
- "sqadd v22.4s, v22.4s, v2.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
"sshr v31.4s, v31.4s, #0x1f\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v3.4s\n"
+ "sqadd v22.4s, v22.4s, v2.4s\n"
"sqadd v23.4s, v23.4s, v1.4s\n"
"sqadd v24.4s, v24.4s, v31.4s\n"
"sqadd v25.4s, v25.4s, v30.4s\n"
@@ -1251,36 +1251,35 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v30.4s }, [x20]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v28.4s\n"
+ "add v18.4s, v18.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v23.4s, v23.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v29.4s }, [x21]\n"
- "cmp x15, #0x10\n"
"ld1r { v28.4s }, [x20]\n"
- "add v16.4s, v16.4s, v30.4s\n"
- "add v17.4s, v17.4s, v30.4s\n"
- "add v18.4s, v18.4s, v30.4s\n"
- "add v19.4s, v19.4s, v30.4s\n"
- "add v20.4s, v20.4s, v30.4s\n"
- "add v21.4s, v21.4s, v30.4s\n"
- "add v22.4s, v22.4s, v30.4s\n"
- "add v23.4s, v23.4s, v30.4s\n"
- "add v24.4s, v24.4s, v30.4s\n"
- "add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "add v27.4s, v27.4s, v30.4s\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v19.4s, v19.4s, v29.4s\n"
- "smin v20.4s, v20.4s, v29.4s\n"
- "smin v21.4s, v21.4s, v29.4s\n"
- "smin v22.4s, v22.4s, v29.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v24.4s, v24.4s, v29.4s\n"
- "smin v25.4s, v25.4s, v29.4s\n"
- "smin v26.4s, v26.4s, v29.4s\n"
- "smin v27.4s, v27.4s, v29.4s\n"
"smax v16.4s, v16.4s, v28.4s\n"
"smax v17.4s, v17.4s, v28.4s\n"
"smax v18.4s, v18.4s, v28.4s\n"
@@ -1299,109 +1298,109 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
"uzp1 v17.8h, v26.8h, v27.8h\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v18.16b\n"
"uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 89f\n"
- "tbz x15, #3, 84f\n"
+ "tbz x14, #3, 84f\n"
"str d16, [x13], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x15, #2, 82f\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #2, 82f\n"
"st1 { v16.s }[2], [x13], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x15, #1, 81f\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x14, #1, 81f\n"
"st1 { v16.h }[6], [x13], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "tbz x15, #0, 88f\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[14], [x13]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x15, #0, 88f\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[12], [x13]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x15, #1, 83f\n"
+ "tbz x14, #1, 83f\n"
"st1 { v16.h }[4], [x13], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "tbz x15, #0, 88f\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[10], [x13]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x15, #0, 88f\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[8], [x13]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x15, #2, 86f\n"
+ "tbz x14, #2, 86f\n"
"str s16, [x13], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x15, #1, 85f\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x14, #1, 85f\n"
"st1 { v16.h }[2], [x13], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "tbz x15, #0, 88f\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[6], [x13]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x15, #0, 88f\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[4], [x13]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x15, #1, 87f\n"
+ "tbz x14, #1, 87f\n"
"str h16, [x13], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "tbz x15, #0, 88f\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[2], [x13]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
"str b16, [x13, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
"str q16, [x13, #0x0]\n"
"add x13, x13, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 62b\n"
"b 122f\n"
"91:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x4\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
"movi v12.4s, #0x0\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "madd x20, x21, x20, x13\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v14.4s, #0x0\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"movi v15.16b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -1420,118 +1419,118 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"93:" // Height 4: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
"tbz %x[flags], #3, 95f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x28, [x20, #0x10]\n"
- "ldr x27, [x20, #0x18]\n"
- "cbnz x12, 96f\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x26, [x20, #0x18]\n"
+ "cbnz x11, 96f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
"add x9, x9, x20\n"
"add x28, x28, x20\n"
"add x27, x27, x20\n"
+ "add x26, x26, x20\n"
"b 96f\n"
"95:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x9, x10, x21\n"
+ "mov x9, %x[input_ptr]\n"
"add x28, x9, x21\n"
"add x27, x28, x21\n"
+ "add x26, x27, x21\n"
"96:" // Height 4: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 101f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x28, #0x0]\n"
- "ldr q3, [x27, #0x0]\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 99f\n"
"97:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x21, [x14, #0x78]\n"
+ "ldr x22, [x12, #0x78]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x20, [x14, #0x88]\n"
+ "ldr x21, [x12, #0x88]\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr x26, [x14, #0x98]\n"
+ "ldr x20, [x12, #0x98]\n"
".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr d4, [x14, #0x70]\n"
+ "ldr d4, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr x25, [x14, #0xa8]\n"
+ "mov v4.d[1], x22\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr x24, [x14, #0xb8]\n"
+ "ldr x25, [x12, #0xa8]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "mov v4.d[1], x21\n"
+ "ldr x24, [x12, #0xb8]\n"
".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr d5, [x14, #0x80]\n"
+ "ldr d5, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr x23, [x14, #0xc8]\n"
+ "mov v5.d[1], x21\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x22, [x14, #0xd8]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "mov v5.d[1], x20\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x14, #0x90]\n"
+ "ldr d6, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr x21, [x14, #0xe8]\n"
+ "mov v6.d[1], x20\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr x20, [x14, #0xf8]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x26\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x14, #0xa0]\n"
+ "ldr d7, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "add x10, x10, #0x10\n"
+ "mov v7.d[1], x25\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
"add x9, x9, #0x10\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "mov v7.d[1], x25\n"
+ "add x28, x28, #0x10\n"
".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr d8, [x14, #0xb0]\n"
+ "ldr d8, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "add x28, x28, #0x10\n"
+ "mov v8.d[1], x24\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
"add x27, x27, #0x10\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "mov v8.d[1], x24\n"
+ "add x26, x26, #0x10\n"
".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr d9, [x14, #0xc0]\n"
+ "ldr d9, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ "mov v9.d[1], x23\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "mov v9.d[1], x23\n"
".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr d10, [x14, #0xd0]\n"
+ "ldr d10, [x12, #0xd0]\n"
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ "mov v10.d[1], x22\n"
".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "mov v10.d[1], x22\n"
".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr d4, [x14, #0xe0]\n"
+ "ldr d4, [x12, #0xe0]\n"
".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ "mov v4.d[1], x21\n"
".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "mov v4.d[1], x21\n"
".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr d5, [x14, #0xf0]\n"
+ "ldr d5, [x12, #0xf0]\n"
".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "mov v5.d[1], x20\n"
".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ "add x12, x12, #0x100\n"
".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
- "mov v5.d[1], x20\n"
".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
@@ -1563,77 +1562,77 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"98:" // Height 4: Multiply loop: unique 13: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q2, [x28, #0x0]\n"
- "ldr q3, [x27, #0x0]\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"prfm pldl1keep, [x9, #0x80]\n"
"prfm pldl1keep, [x28, #0x80]\n"
"prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 97b\n"
"99:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
"add x9, x9, #0x10\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q4, [x14, #0x70]\n"
+ "ldr q4, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
"add x27, x27, #0x10\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr q5, [x14, #0x80]\n"
+ "ldr q5, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x90]\n"
+ "ldr q6, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0xa0]\n"
+ "ldr q7, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr q8, [x14, #0xb0]\n"
+ "ldr q8, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr q9, [x14, #0xc0]\n"
+ "ldr q9, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x14, #0xd0]\n"
+ "ldr q10, [x12, #0xd0]\n"
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x14, #0xe0]\n"
+ "ldr q4, [x12, #0xe0]\n"
".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr q5, [x14, #0xf0]\n"
+ "ldr q5, [x12, #0xf0]\n"
".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
@@ -1667,35 +1666,35 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"100:" // Height 4: Multiply loop: unique 14: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
"prfm pldl1keep, [x9, #0x80]\n"
"prfm pldl1keep, [x28, #0x80]\n"
"prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"101:" // Height 4: Multiply loop: Main loop skip
- "cbz x11, 108f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 108f\n"
+ "cmp x10, #0x4\n"
"blt 104f\n"
"102:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x28], #0x4\n"
- "ldr s3, [x27], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
"tbnz %x[flags], #31, 103f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q7, [x14, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q6, [x14, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q5, [x14, #0x20]\n"
- "ldr q4, [x14, #0x30]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q6, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q5, [x12, #0x20]\n"
".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
@@ -1711,23 +1710,23 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
"bge 102b\n"
"104:" // Height 4: Multiply loop: Skip odd blocks
- "cbz x11, 108f\n"
- "tbz x11, #1, 105f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x28], #0x2\n"
- "ldr h3, [x27], #0x2\n"
- "tbz x11, #0, 106f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x28]\n"
- "ld1 { v3.b }[2], [x27]\n"
+ "cbz x10, 108f\n"
+ "tbz x10, #1, 105f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
+ "ld1 { v3.b }[2], [x26]\n"
"b 106f\n"
"105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x28, #0x0]\n"
- "ldr b3, [x27, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
+ "ldr b3, [x26, #0x0]\n"
"106:" // Height 4: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 107f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
@@ -1735,16 +1734,16 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q7, [x14, #0x0]\n"
- "ldr q6, [x14, #0x10]\n"
- "ldr q5, [x14, #0x20]\n"
- "ldr q4, [x14, #0x30]\n"
+ "ldr q7, [x12, #0x0]\n"
".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x12, #0x10]\n"
".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
+ "ldr q5, [x12, #0x20]\n"
".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
@@ -1758,17 +1757,17 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
"108:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 94b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "add x24, x13, x20\n"
- "add x23, x24, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x13, x20\n"
"add x22, x23, x20\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
@@ -1776,9 +1775,9 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"addp v14.4s, v14.4s, v14.4s\n"
"add x20, %x[qp], %[b_offset]\n"
"ld1r { v0.4s }, [x20]\n"
+ "neg v0.4s, v0.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "neg v0.4s, v0.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
"mul v11.4s, v11.4s, v0.4s\n"
@@ -1786,13 +1785,13 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"mul v13.4s, v13.4s, v0.4s\n"
"mul v14.4s, v14.4s, v0.4s\n"
"109:" // Height 4: skip row sum fixup
- "ldr q3, [x16, #0x0]\n"
+ "ldr q3, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q2, [x16, #0x10]\n"
+ "ldr q2, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q1, [x16, #0x20]\n"
+ "ldr q1, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q0, [x16, #0x30]\n"
+ "ldr q0, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1823,11 +1822,10 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"add v30.4s, v30.4s, v1.4s\n"
"add v31.4s, v31.4s, v0.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v1.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"ld1r { v0.4s }, [x20]\n"
- "add x16, x16, #0x40\n"
"sqrdmulh v16.4s, v16.4s, v1.4s\n"
"sqrdmulh v17.4s, v17.4s, v1.4s\n"
"sqrdmulh v18.4s, v18.4s, v1.4s\n"
@@ -1844,51 +1842,52 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"sqrdmulh v29.4s, v29.4s, v1.4s\n"
"sqrdmulh v30.4s, v30.4s, v1.4s\n"
"sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 110f\n"
"and v2.16b, v16.16b, v0.16b\n"
"and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
"and v7.16b, v18.16b, v0.16b\n"
"and v6.16b, v19.16b, v0.16b\n"
"and v5.16b, v20.16b, v0.16b\n"
"and v4.16b, v21.16b, v0.16b\n"
"and v3.16b, v22.16b, v0.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v2.4s\n"
- "sqadd v17.4s, v17.4s, v1.4s\n"
- "and v2.16b, v23.16b, v0.16b\n"
- "and v1.16b, v24.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
"sqadd v18.4s, v18.4s, v7.4s\n"
"sqadd v19.4s, v19.4s, v6.4s\n"
"sqadd v20.4s, v20.4s, v5.4s\n"
"sqadd v21.4s, v21.4s, v4.4s\n"
"sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
"and v7.16b, v25.16b, v0.16b\n"
"and v6.16b, v26.16b, v0.16b\n"
"and v5.16b, v27.16b, v0.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
"and v4.16b, v28.16b, v0.16b\n"
"and v3.16b, v29.16b, v0.16b\n"
+ "and v2.16b, v30.16b, v0.16b\n"
+ "and v1.16b, v31.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v2.4s\n"
- "sqadd v24.4s, v24.4s, v1.4s\n"
- "and v2.16b, v30.16b, v0.16b\n"
- "and v1.16b, v31.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
"sqadd v25.4s, v25.4s, v7.4s\n"
"sqadd v26.4s, v26.4s, v6.4s\n"
"sqadd v27.4s, v27.4s, v5.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v4.4s\n"
"sqadd v29.4s, v29.4s, v3.4s\n"
"sqadd v30.4s, v30.4s, v2.4s\n"
@@ -1911,44 +1910,43 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"srshl v30.4s, v30.4s, v0.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v2.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v0.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v0.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v0.4s\n"
+ "smin v17.4s, v17.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v0.4s\n"
+ "smin v19.4s, v19.4s, v0.4s\n"
+ "smin v20.4s, v20.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v0.4s\n"
+ "smin v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v0.4s\n"
+ "smin v27.4s, v27.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v0.4s\n"
+ "smin v29.4s, v29.4s, v0.4s\n"
+ "smin v30.4s, v30.4s, v0.4s\n"
+ "smin v31.4s, v31.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v1.4s }, [x21]\n"
- "cmp x15, #0x10\n"
"ld1r { v0.4s }, [x20]\n"
- "add v16.4s, v16.4s, v2.4s\n"
- "add v17.4s, v17.4s, v2.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v2.4s\n"
- "add v20.4s, v20.4s, v2.4s\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v2.4s\n"
- "add v24.4s, v24.4s, v2.4s\n"
- "add v25.4s, v25.4s, v2.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v2.4s\n"
- "add v28.4s, v28.4s, v2.4s\n"
- "add v29.4s, v29.4s, v2.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v1.4s\n"
- "smin v17.4s, v17.4s, v1.4s\n"
- "smin v18.4s, v18.4s, v1.4s\n"
- "smin v19.4s, v19.4s, v1.4s\n"
- "smin v20.4s, v20.4s, v1.4s\n"
- "smin v21.4s, v21.4s, v1.4s\n"
- "smin v22.4s, v22.4s, v1.4s\n"
- "smin v23.4s, v23.4s, v1.4s\n"
- "smin v24.4s, v24.4s, v1.4s\n"
- "smin v25.4s, v25.4s, v1.4s\n"
- "smin v26.4s, v26.4s, v1.4s\n"
- "smin v27.4s, v27.4s, v1.4s\n"
- "smin v28.4s, v28.4s, v1.4s\n"
- "smin v29.4s, v29.4s, v1.4s\n"
- "smin v30.4s, v30.4s, v1.4s\n"
- "smin v31.4s, v31.4s, v1.4s\n"
"smax v16.4s, v16.4s, v0.4s\n"
"smax v17.4s, v17.4s, v0.4s\n"
"smax v18.4s, v18.4s, v0.4s\n"
@@ -1973,109 +1971,110 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
"uzp1 v17.8h, v30.8h, v31.8h\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v0.16b\n"
"uzp1 v20.16b, v20.16b, v19.16b\n"
"uzp1 v24.16b, v24.16b, v18.16b\n"
"uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 119f\n"
- "tbz x15, #3, 114f\n"
+ "tbz x14, #3, 114f\n"
"str d16, [x13], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
- "tbz x15, #2, 112f\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x14, #2, 112f\n"
"st1 { v16.s }[2], [x13], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x22], #0x4\n"
- "tbz x15, #1, 111f\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x14, #1, 111f\n"
"st1 { v16.h }[6], [x13], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "st1 { v28.h }[6], [x22], #0x2\n"
- "tbz x15, #0, 118f\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[14], [x13]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
- "st1 { v28.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x15, #0, 118f\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[12], [x13]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
- "st1 { v28.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x15, #1, 113f\n"
+ "tbz x14, #1, 113f\n"
"st1 { v16.h }[4], [x13], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "st1 { v28.h }[4], [x22], #0x2\n"
- "tbz x15, #0, 118f\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[10], [x13]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
- "st1 { v28.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x15, #0, 118f\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[8], [x13]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
- "st1 { v28.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x15, #2, 116f\n"
+ "tbz x14, #2, 116f\n"
"str s16, [x13], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x22], #0x4\n"
- "tbz x15, #1, 115f\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x14, #1, 115f\n"
"st1 { v16.h }[2], [x13], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "st1 { v28.h }[2], [x22], #0x2\n"
- "tbz x15, #0, 118f\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[6], [x13]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
- "st1 { v28.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x15, #0, 118f\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[4], [x13]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
- "st1 { v28.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x15, #1, 117f\n"
+ "tbz x14, #1, 117f\n"
"str h16, [x13], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "str h28, [x22], #0x2\n"
- "tbz x15, #0, 118f\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[2], [x13]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
"str b16, [x13, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "str b28, [x22, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
"str q16, [x13, #0x0]\n"
"add x13, x13, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "str q28, [x22, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 92b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 122f\n"
@@ -2089,9 +2088,9 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
index 75e35a3e98..0c3470ec16 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
@@ -45,18 +45,18 @@ void a64_hybrid_s8qa_dot_4x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -90,7 +90,7 @@ void a64_hybrid_s8qa_dot_4x16 (
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -100,8 +100,8 @@ void a64_hybrid_s8qa_dot_4x16 (
"mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -134,7 +134,6 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q26, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
"ldr q25, [x28, #0xa0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
"ldr q24, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
@@ -145,10 +144,11 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q21, [x28, #0xe0]\n"
".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
"ldr q20, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
@@ -159,9 +159,9 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q0, [x24, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q5, [x28, #0x10]\n"
"ldr q6, [x28, #0x20]\n"
- "cmp x25, #0x20\n"
"ldr q7, [x28, #0x30]\n"
"ldr q8, [x28, #0x40]\n"
"ldr q9, [x28, #0x50]\n"
@@ -177,8 +177,6 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q26, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
"ldr q25, [x28, #0xa0]\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
"ldr q24, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
@@ -189,10 +187,12 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q21, [x28, #0xe0]\n"
".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
"ldr q20, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
+ "sub x25, x25, #0x10\n"
".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
+ "add x24, x24, #0x10\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
@@ -213,14 +213,14 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q23, [x28, #0x0]\n"
"ldr q22, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
"ldr q21, [x28, #0x20]\n"
"ldr q20, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n"
".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n"
".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n"
".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
+ "add x28, x28, #0x40\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
"cbz x25, 18f\n"
@@ -235,15 +235,15 @@ void a64_hybrid_s8qa_dot_4x16 (
"tbnz %x[flags], #31, 17f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q23, [x28, #0x0]\n"
- "ldr q22, [x28, #0x10]\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ ".inst 0x4f80e2b0 // sdot v16.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f80e291 // sdot v17.4s, v20.16b, v0.4b[0]\n"
"ldr q21, [x28, #0x20]\n"
"ldr q20, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n"
- ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n"
".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n"
".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
+ "add x28, x28, #0x40\n"
"18:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -252,8 +252,8 @@ void a64_hybrid_s8qa_dot_4x16 (
"prfm pstl1keep, [x27, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
"add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
"ld1r { v20.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
"neg v20.4s, v20.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"mul v11.4s, v11.4s, v20.4s\n"
@@ -267,16 +267,16 @@ void a64_hybrid_s8qa_dot_4x16 (
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v20.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add v16.4s, v16.4s, v24.4s\n"
"add v17.4s, v17.4s, v23.4s\n"
+ "add v18.4s, v18.4s, v22.4s\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- "add x10, x10, #0x40\n"
"ld1r { v0.4s }, [x20]\n"
- "add v18.4s, v18.4s, v22.4s\n"
"add v19.4s, v19.4s, v21.4s\n"
"sqrdmulh v16.4s, v16.4s, v20.4s\n"
+ "add x10, x10, #0x40\n"
"sqrdmulh v17.4s, v17.4s, v20.4s\n"
"sqrdmulh v18.4s, v18.4s, v20.4s\n"
"sqrdmulh v19.4s, v19.4s, v20.4s\n"
@@ -294,21 +294,21 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqadd v18.4s, v18.4s, v21.4s\n"
"sqadd v19.4s, v19.4s, v20.4s\n"
"20:" // Height 1: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v22.4s }, [x21]\n"
- "ld1r { v21.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v20.4s }, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
"add v16.4s, v16.4s, v22.4s\n"
"add v17.4s, v17.4s, v22.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
"add v18.4s, v18.4s, v22.4s\n"
"add v19.4s, v19.4s, v22.4s\n"
+ "cmp x9, #0x10\n"
"smin v16.4s, v16.4s, v21.4s\n"
"smin v17.4s, v17.4s, v21.4s\n"
"smin v18.4s, v18.4s, v21.4s\n"
@@ -381,7 +381,7 @@ void a64_hybrid_s8qa_dot_4x16 (
"movi v15.16b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -395,8 +395,8 @@ void a64_hybrid_s8qa_dot_4x16 (
"mov x26, #0x0\n"
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 35f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -476,9 +476,9 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q0, [x24, #0x0]\n"
"ldr q1, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q4, [x28, #0x0]\n"
"ldr q5, [x28, #0x10]\n"
- "cmp x25, #0x20\n"
"ldr q6, [x28, #0x20]\n"
"ldr q7, [x28, #0x30]\n"
"ldr q8, [x28, #0x40]\n"
@@ -553,14 +553,14 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q27, [x28, #0x0]\n"
"ldr q26, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
"ldr q25, [x28, #0x20]\n"
"ldr q24, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n"
".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n"
".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
@@ -583,17 +583,17 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q27, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x0]\n"
"ldr q26, [x28, #0x10]\n"
+ ".inst 0x4f80e310 // sdot v16.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e314 // sdot v20.4s, v24.16b, v1.4b[0]\n"
"ldr q25, [x28, #0x20]\n"
"ldr q24, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n"
- ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n"
".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
@@ -602,14 +602,14 @@ void a64_hybrid_s8qa_dot_4x16 (
"cmp x26, x20\n"
"bne 34b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "ld1r { v24.4s }, [x20]\n"
"neg v24.4s, v24.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
@@ -627,10 +627,10 @@ void a64_hybrid_s8qa_dot_4x16 (
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v24.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v16.4s, v16.4s, v28.4s\n"
"add v17.4s, v17.4s, v27.4s\n"
@@ -652,45 +652,45 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqrdmulh v23.4s, v23.4s, v24.4s\n"
"tbz %x[flags], #5, 50f\n"
"and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
"and v30.16b, v17.16b, v0.16b\n"
"and v29.16b, v18.16b, v0.16b\n"
"and v28.16b, v19.16b, v0.16b\n"
"and v27.16b, v20.16b, v0.16b\n"
"and v26.16b, v21.16b, v0.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
"and v25.16b, v22.16b, v0.16b\n"
+ "and v24.16b, v23.16b, v0.16b\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v24.4s\n"
- "and v24.16b, v23.16b, v0.16b\n"
"sshr v26.4s, v26.4s, #0x1f\n"
"sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v17.4s, v17.4s, v30.4s\n"
"sqadd v18.4s, v18.4s, v29.4s\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v28.4s\n"
"sqadd v20.4s, v20.4s, v27.4s\n"
"sqadd v21.4s, v21.4s, v26.4s\n"
"sqadd v22.4s, v22.4s, v25.4s\n"
"sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x21]\n"
- "ld1r { v25.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v24.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"add v16.4s, v16.4s, v26.4s\n"
"add v17.4s, v17.4s, v26.4s\n"
"add v18.4s, v18.4s, v26.4s\n"
@@ -724,68 +724,68 @@ void a64_hybrid_s8qa_dot_4x16 (
"bge 59f\n"
"tbz x9, #3, 54f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x9, #2, 52f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x9, #1, 51f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 58f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 53f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 58f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 56f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x9, #1, 55f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 58f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 57f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
"subs x9, x9, #0x10\n"
"bgt 32b\n"
@@ -799,7 +799,7 @@ void a64_hybrid_s8qa_dot_4x16 (
"movi v15.16b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -817,8 +817,8 @@ void a64_hybrid_s8qa_dot_4x16 (
"mov x26, #0x0\n"
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 65f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -920,9 +920,9 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q0, [x24, #0x0]\n"
"ldr q1, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q2, [x22, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
- "cmp x25, #0x20\n"
"ldr q5, [x28, #0x10]\n"
"ldr q6, [x28, #0x20]\n"
"ldr q7, [x28, #0x30]\n"
@@ -1020,14 +1020,14 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q31, [x28, #0x0]\n"
"ldr q30, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
"ldr q29, [x28, #0x20]\n"
"ldr q28, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n"
".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n"
".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n"
".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
@@ -1060,15 +1060,15 @@ void a64_hybrid_s8qa_dot_4x16 (
"77:" // Height 3: Multiply loop: unique 12: skip row sum
"ldr q31, [x28, #0x0]\n"
"ldr q30, [x28, #0x10]\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n"
".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q28, [x28, #0x30]\n"
".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n"
".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n"
@@ -1081,16 +1081,16 @@ void a64_hybrid_s8qa_dot_4x16 (
"cmp x26, x20\n"
"bne 64b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "ld1r { v28.4s }, [x20]\n"
"addp v13.4s, v13.4s, v13.4s\n"
"neg v28.4s, v28.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
@@ -1111,10 +1111,10 @@ void a64_hybrid_s8qa_dot_4x16 (
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v28.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
@@ -1152,18 +1152,18 @@ void a64_hybrid_s8qa_dot_4x16 (
"and v30.16b, v18.16b, v0.16b\n"
"and v29.16b, v19.16b, v0.16b\n"
"and v28.16b, v20.16b, v0.16b\n"
- "and v3.16b, v21.16b, v0.16b\n"
"sshr v1.4s, v1.4s, #0x1f\n"
"sshr v31.4s, v31.4s, #0x1f\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "and v2.16b, v22.16b, v0.16b\n"
"sqadd v16.4s, v16.4s, v1.4s\n"
"sqadd v17.4s, v17.4s, v31.4s\n"
"sqadd v18.4s, v18.4s, v30.4s\n"
"sqadd v19.4s, v19.4s, v29.4s\n"
"sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
"and v1.16b, v23.16b, v0.16b\n"
"and v31.16b, v24.16b, v0.16b\n"
"and v30.16b, v25.16b, v0.16b\n"
@@ -1184,21 +1184,21 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqadd v26.4s, v26.4s, v29.4s\n"
"sqadd v27.4s, v27.4s, v28.4s\n"
"80:" // Height 3: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v30.4s }, [x21]\n"
- "ld1r { v29.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v28.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"srshl v24.4s, v24.4s, v0.4s\n"
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
@@ -1251,103 +1251,102 @@ void a64_hybrid_s8qa_dot_4x16 (
"bge 89f\n"
"tbz x9, #3, 84f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #2, 82f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x9, #1, 81f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 88f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 83f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 88f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 86f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x9, #1, 85f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 88f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 87f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
"subs x9, x9, #0x10\n"
"bgt 62b\n"
"b 122f\n"
"91:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x4\n"
"mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "movi v13.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
"movi v15.16b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -1369,8 +1368,8 @@ void a64_hybrid_s8qa_dot_4x16 (
"mov x26, #0x0\n"
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 95f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1494,9 +1493,9 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q0, [x24, #0x0]\n"
"ldr q1, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q2, [x22, #0x0]\n"
"ldr q3, [x21, #0x0]\n"
- "cmp x25, #0x20\n"
"ldr q4, [x28, #0x0]\n"
"ldr q5, [x28, #0x10]\n"
"ldr q6, [x28, #0x20]\n"
@@ -1617,14 +1616,14 @@ void a64_hybrid_s8qa_dot_4x16 (
"ldr q7, [x28, #0x0]\n"
"ldr q6, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
"ldr q5, [x28, #0x20]\n"
"ldr q4, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
@@ -1665,15 +1664,15 @@ void a64_hybrid_s8qa_dot_4x16 (
"107:" // Height 4: Multiply loop: unique 16: skip row sum
"ldr q7, [x28, #0x0]\n"
"ldr q6, [x28, #0x10]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q4, [x28, #0x30]\n"
".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
@@ -1690,18 +1689,18 @@ void a64_hybrid_s8qa_dot_4x16 (
"cmp x26, x20\n"
"bne 94b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20\n"
+ "add x21, x22, x20\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x20\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "ld1r { v0.4s }, [x20]\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
"neg v0.4s, v0.4s\n"
@@ -1725,10 +1724,10 @@ void a64_hybrid_s8qa_dot_4x16 (
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v1.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
@@ -1775,32 +1774,32 @@ void a64_hybrid_s8qa_dot_4x16 (
"tbz %x[flags], #5, 110f\n"
"and v2.16b, v16.16b, v0.16b\n"
"and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
"and v7.16b, v18.16b, v0.16b\n"
"and v6.16b, v19.16b, v0.16b\n"
"and v5.16b, v20.16b, v0.16b\n"
"and v4.16b, v21.16b, v0.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
"and v3.16b, v22.16b, v0.16b\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v2.4s\n"
- "sqadd v17.4s, v17.4s, v1.4s\n"
- "and v2.16b, v23.16b, v0.16b\n"
- "and v1.16b, v24.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v7.4s\n"
- "sqadd v19.4s, v19.4s, v6.4s\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
"sqadd v20.4s, v20.4s, v5.4s\n"
"sqadd v21.4s, v21.4s, v4.4s\n"
"sqadd v22.4s, v22.4s, v3.4s\n"
- "and v7.16b, v25.16b, v0.16b\n"
"sqadd v23.4s, v23.4s, v2.4s\n"
"sqadd v24.4s, v24.4s, v1.4s\n"
+ "and v7.16b, v25.16b, v0.16b\n"
"and v6.16b, v26.16b, v0.16b\n"
"and v5.16b, v27.16b, v0.16b\n"
"and v4.16b, v28.16b, v0.16b\n"
@@ -1822,21 +1821,21 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqadd v30.4s, v30.4s, v2.4s\n"
"sqadd v31.4s, v31.4s, v1.4s\n"
"110:" // Height 4: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v3.4s }, [x21]\n"
- "ld1r { v2.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v1.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"srshl v24.4s, v24.4s, v0.4s\n"
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
@@ -1908,100 +1907,100 @@ void a64_hybrid_s8qa_dot_4x16 (
"bge 119f\n"
"tbz x9, #3, 114f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x9, #2, 112f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x22], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x9, #1, 111f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "st1 { v28.h }[6], [x22], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
- "st1 { v28.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 118f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
- "st1 { v28.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 113f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "st1 { v28.h }[4], [x22], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
- "st1 { v28.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 118f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
- "st1 { v28.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 116f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x22], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x9, #1, 115f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "st1 { v28.h }[2], [x22], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
- "st1 { v28.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 118f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
- "st1 { v28.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 117f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "str h28, [x22], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "str b28, [x22, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "str q28, [x22, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
"subs x9, x9, #0x10\n"
"bgt 92b\n"
@@ -2017,8 +2016,8 @@ void a64_hybrid_s8qa_dot_4x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
index 55290826d1..7052d1cc41 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
@@ -70,7 +70,7 @@ public:
return false;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 16, 8> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 8> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
index 8f70b3dc26..722f9af535 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
@@ -45,18 +45,18 @@ void a64_hybrid_s8qa_mmla_4x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -90,7 +90,7 @@ void a64_hybrid_s8qa_mmla_4x16 (
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -104,8 +104,8 @@ void a64_hybrid_s8qa_mmla_4x16 (
"mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -130,7 +130,6 @@ void a64_hybrid_s8qa_mmla_4x16 (
"ldr q4, [x28, #0x60]\n"
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
- "add x24, x24, #0x10\n"
"trn1 v0.2d, v1.2d, v27.2d\n"
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
"ldr q25, [x28, #0x70]\n"
@@ -152,8 +151,9 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
"ldr q24, [x28, #0xf0]\n"
".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
+ "add x28, x28, #0x100\n"
".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
@@ -166,9 +166,9 @@ void a64_hybrid_s8qa_mmla_4x16 (
"ldr q1, [x24, #0x0]\n"
"ldr q5, [x28, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q6, [x28, #0x10]\n"
"ldr q7, [x28, #0x20]\n"
- "cmp x25, #0x20\n"
"ldr q8, [x28, #0x30]\n"
"ldr q9, [x28, #0x40]\n"
"ldr q10, [x28, #0x50]\n"
@@ -176,12 +176,10 @@ void a64_hybrid_s8qa_mmla_4x16 (
"prfm pldl1keep, [x24, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
"trn1 v0.2d, v1.2d, v24.2d\n"
- "trn2 v1.2d, v1.2d, v24.2d\n"
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
"ldr q25, [x28, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v24.2d\n"
".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
"ldr q24, [x28, #0x80]\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
@@ -198,9 +196,11 @@ void a64_hybrid_s8qa_mmla_4x16 (
"ldr q25, [x28, #0xe0]\n"
".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
"ldr q24, [x28, #0xf0]\n"
+ "sub x25, x25, #0x10\n"
".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
+ "add x24, x24, #0x10\n"
+ "add x28, x28, #0x100\n"
".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
@@ -222,24 +222,24 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
"ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n"
"sub x25, x25, #0x8\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
"cmp x25, #0x8\n"
+ ".inst 0x4e9aa414 // smmla v20.4s, v0.16b, v26.16b\n"
"ldr q27, [x28, #0x40]\n"
"ldr q26, [x28, #0x50]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
"ldr q25, [x28, #0x60]\n"
- ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n"
"ldr q24, [x28, #0x70]\n"
- ".inst 0x4e9ea414 // smmla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x4e9da411 // smmla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x4e9ca415 // smmla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n"
".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n"
".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
"cbz x25, 20f\n"
@@ -267,23 +267,23 @@ void a64_hybrid_s8qa_mmla_4x16 (
"tbnz %x[flags], #31, 19f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"19:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "ldr q27, [x28, #0x40]\n"
- "ldr q26, [x28, #0x50]\n"
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x4e99a410 // smmla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a414 // smmla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x4e99a412 // smmla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a416 // smmla v22.4s, v0.16b, v24.16b\n"
"ldr q25, [x28, #0x60]\n"
- ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n"
"ldr q24, [x28, #0x70]\n"
- ".inst 0x4e9ea414 // smmla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x4e9da411 // smmla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x4e9ca415 // smmla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n"
- ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n"
".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
"20:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -297,8 +297,8 @@ void a64_hybrid_s8qa_mmla_4x16 (
"mov v23.16b, v16.16b\n"
"tbnz %x[flags], #31, 21f\n"
"add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
"ld1r { v16.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
"neg v16.4s, v16.4s\n"
"dup v11.4s, v11.s[0]\n"
"mul v11.4s, v11.4s, v16.4s\n"
@@ -312,16 +312,16 @@ void a64_hybrid_s8qa_mmla_4x16 (
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v16.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add v23.4s, v23.4s, v24.4s\n"
"add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- "add x10, x10, #0x40\n"
"ld1r { v0.4s }, [x20]\n"
- "add v18.4s, v18.4s, v21.4s\n"
"add v19.4s, v19.4s, v20.4s\n"
"sqrdmulh v23.4s, v23.4s, v16.4s\n"
+ "add x10, x10, #0x40\n"
"sqrdmulh v17.4s, v17.4s, v16.4s\n"
"sqrdmulh v18.4s, v18.4s, v16.4s\n"
"sqrdmulh v19.4s, v19.4s, v16.4s\n"
@@ -339,21 +339,21 @@ void a64_hybrid_s8qa_mmla_4x16 (
"sqadd v18.4s, v18.4s, v20.4s\n"
"sqadd v19.4s, v19.4s, v16.4s\n"
"22:" // Height 1: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v21.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v21.4s }, [x21]\n"
- "ld1r { v20.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v16.4s }, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
"add v23.4s, v23.4s, v21.4s\n"
"add v17.4s, v17.4s, v21.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
"add v18.4s, v18.4s, v21.4s\n"
"add v19.4s, v19.4s, v21.4s\n"
+ "cmp x9, #0x10\n"
"smin v23.4s, v23.4s, v20.4s\n"
"smin v17.4s, v17.4s, v20.4s\n"
"smin v18.4s, v18.4s, v20.4s\n"
@@ -426,7 +426,7 @@ void a64_hybrid_s8qa_mmla_4x16 (
"movi v15.16b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"34:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -440,8 +440,8 @@ void a64_hybrid_s8qa_mmla_4x16 (
"mov x26, #0x0\n"
"36:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 37f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -471,13 +471,11 @@ void a64_hybrid_s8qa_mmla_4x16 (
"blt 41f\n"
"39:" // Height 2: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
"ldr q25, [x28, #0x70]\n"
".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
"ldr q24, [x28, #0x80]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
"ldr q30, [x28, #0x90]\n"
".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
@@ -493,9 +491,11 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
"ldr q24, [x28, #0xf0]\n"
".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
+ "add x28, x28, #0x100\n"
".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n"
@@ -507,9 +507,9 @@ void a64_hybrid_s8qa_mmla_4x16 (
"ldr q1, [x24, #0x0]\n"
"ldr q2, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q5, [x28, #0x0]\n"
"ldr q6, [x28, #0x10]\n"
- "cmp x25, #0x20\n"
"ldr q7, [x28, #0x20]\n"
"ldr q8, [x28, #0x30]\n"
"ldr q9, [x28, #0x40]\n"
@@ -520,14 +520,11 @@ void a64_hybrid_s8qa_mmla_4x16 (
"bge 39b\n"
"41:" // Height 2: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
"ldr q25, [x28, #0x70]\n"
".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
"ldr q24, [x28, #0x80]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
"ldr q30, [x28, #0x90]\n"
".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
@@ -542,11 +539,14 @@ void a64_hybrid_s8qa_mmla_4x16 (
"ldr q25, [x28, #0xe0]\n"
".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
"ldr q24, [x28, #0xf0]\n"
+ "sub x25, x25, #0x10\n"
".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
+ "add x28, x28, #0x100\n"
".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n"
".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n"
@@ -568,24 +568,24 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"45:" // Height 2: Multiply loop: unique 7: skip row sum
"ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n"
"sub x25, x25, #0x8\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
"cmp x25, #0x8\n"
+ ".inst 0x4e9aa414 // smmla v20.4s, v0.16b, v26.16b\n"
"ldr q27, [x28, #0x40]\n"
"ldr q26, [x28, #0x50]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
"ldr q25, [x28, #0x60]\n"
- ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n"
"ldr q24, [x28, #0x70]\n"
- ".inst 0x4e9ea414 // smmla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x4e9da411 // smmla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x4e9ca415 // smmla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n"
".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n"
".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
"bge 44b\n"
"46:" // Height 2: Multiply loop: Skip odd blocks
"cbz x25, 52f\n"
@@ -620,23 +620,23 @@ void a64_hybrid_s8qa_mmla_4x16 (
"tbnz %x[flags], #31, 51f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"51:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "ldr q27, [x28, #0x40]\n"
- "ldr q26, [x28, #0x50]\n"
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x4e99a410 // smmla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a414 // smmla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x4e99a412 // smmla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a416 // smmla v22.4s, v0.16b, v24.16b\n"
"ldr q25, [x28, #0x60]\n"
- ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n"
"ldr q24, [x28, #0x70]\n"
- ".inst 0x4e9ea414 // smmla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x4e9da411 // smmla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x4e9ca415 // smmla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n"
- ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n"
".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
"52:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -644,21 +644,21 @@ void a64_hybrid_s8qa_mmla_4x16 (
"bne 36b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v24.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x24, x27, x20\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"mov v23.16b, v24.16b\n"
"tbnz %x[flags], #31, 53f\n"
"add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
"ld1r { v24.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
"neg v24.4s, v24.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
@@ -676,10 +676,10 @@ void a64_hybrid_s8qa_mmla_4x16 (
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v24.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v23.4s, v23.4s, v28.4s\n"
"add v20.4s, v20.4s, v27.4s\n"
@@ -701,45 +701,45 @@ void a64_hybrid_s8qa_mmla_4x16 (
"sqrdmulh v19.4s, v19.4s, v24.4s\n"
"tbz %x[flags], #5, 54f\n"
"and v24.16b, v23.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"and v30.16b, v20.16b, v0.16b\n"
"and v29.16b, v21.16b, v0.16b\n"
"and v28.16b, v22.16b, v0.16b\n"
"and v27.16b, v16.16b, v0.16b\n"
"and v26.16b, v17.16b, v0.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
"and v25.16b, v18.16b, v0.16b\n"
+ "and v24.16b, v19.16b, v0.16b\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v24.4s\n"
- "and v24.16b, v19.16b, v0.16b\n"
"sshr v26.4s, v26.4s, #0x1f\n"
"sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v20.4s, v20.4s, v30.4s\n"
"sqadd v21.4s, v21.4s, v29.4s\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v22.4s, v22.4s, v28.4s\n"
"sqadd v16.4s, v16.4s, v27.4s\n"
"sqadd v17.4s, v17.4s, v26.4s\n"
"sqadd v18.4s, v18.4s, v25.4s\n"
"sqadd v19.4s, v19.4s, v24.4s\n"
"54:" // Height 2: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x21]\n"
- "ld1r { v25.4s }, [x20]\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v24.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"add v23.4s, v23.4s, v26.4s\n"
"add v20.4s, v20.4s, v26.4s\n"
"add v21.4s, v21.4s, v26.4s\n"
@@ -773,68 +773,68 @@ void a64_hybrid_s8qa_mmla_4x16 (
"bge 63f\n"
"tbz x9, #3, 58f\n"
"str d23, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
"tbz x9, #2, 56f\n"
"st1 { v23.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
"tbz x9, #1, 55f\n"
"st1 { v23.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
"b 62f\n"
"55:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 62f\n"
"st1 { v23.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
"b 62f\n"
"56:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 57f\n"
"st1 { v23.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
"b 62f\n"
"57:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 62f\n"
"st1 { v23.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
"b 62f\n"
"58:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 60f\n"
"str s23, [x27], #0x4\n"
- "str s16, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
"tbz x9, #1, 59f\n"
"st1 { v23.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
"b 62f\n"
"59:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 62f\n"
"st1 { v23.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
"b 62f\n"
"60:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 61f\n"
"str h23, [x27], #0x2\n"
- "str h16, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
"b 62f\n"
"61:" // Height 2: Partial direct writeback: partial_1_0
"str b23, [x27, #0x0]\n"
- "str b16, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
"62:" // Height 2: Partial direct writeback: Done
"b 64f\n"
"63:" // Height 2: Full writeback
"str q23, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
"64:" // Height 2: Writeback done
"subs x9, x9, #0x10\n"
"bgt 34b\n"
@@ -848,7 +848,7 @@ void a64_hybrid_s8qa_mmla_4x16 (
"movi v15.16b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"66:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -870,8 +870,8 @@ void a64_hybrid_s8qa_mmla_4x16 (
"mov x26, #0x0\n"
"68:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 69f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -905,35 +905,35 @@ void a64_hybrid_s8qa_mmla_4x16 (
"71:" // Height 3: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q14, [x28, #0x60]\n"
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q14, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x60]\n"
".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
"ldr q4, [x28, #0x80]\n"
- ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
"ldr q8, [x28, #0xa0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
"ldr q9, [x28, #0xb0]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
- ".inst 0x4e8ea413 // smmla v19.4s, v0.16b, v14.16b\n"
- ".inst 0x4e8ea45b // smmla v27.4s, v2.16b, v14.16b\n"
+ ".inst 0x4e85a413 // smmla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45b // smmla v27.4s, v2.16b, v5.16b\n"
"ldr q6, [x28, #0xd0]\n"
- ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e8ea417 // smmla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x4e8ea45f // smmla v31.4s, v2.16b, v14.16b\n"
"ldr q5, [x28, #0xe0]\n"
".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
".inst 0x4e84a478 // smmla v24.4s, v3.16b, v4.16b\n"
@@ -962,9 +962,9 @@ void a64_hybrid_s8qa_mmla_4x16 (
"ldr q1, [x24, #0x0]\n"
"ldr q2, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q3, [x22, #0x0]\n"
"ldr q5, [x28, #0x0]\n"
- "cmp x25, #0x20\n"
"ldr q6, [x28, #0x10]\n"
"ldr q7, [x28, #0x20]\n"
"ldr q8, [x28, #0x30]\n"
@@ -977,36 +977,36 @@ void a64_hybrid_s8qa_mmla_4x16 (
"73:" // Height 3: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q14, [x28, #0x60]\n"
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q14, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x60]\n"
".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
"ldr q4, [x28, #0x80]\n"
- ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
+ "sub x25, x25, #0x10\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
"ldr q8, [x28, #0xa0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
"ldr q9, [x28, #0xb0]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
- ".inst 0x4e8ea413 // smmla v19.4s, v0.16b, v14.16b\n"
- ".inst 0x4e8ea45b // smmla v27.4s, v2.16b, v14.16b\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e85a413 // smmla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45b // smmla v27.4s, v2.16b, v5.16b\n"
"ldr q6, [x28, #0xd0]\n"
- ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e8ea417 // smmla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x4e8ea45f // smmla v31.4s, v2.16b, v14.16b\n"
"ldr q5, [x28, #0xe0]\n"
".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
".inst 0x4e84a478 // smmla v24.4s, v3.16b, v4.16b\n"
@@ -1040,34 +1040,34 @@ void a64_hybrid_s8qa_mmla_4x16 (
"cmp x25, #0x8\n"
"blt 78f\n"
"76:" // Height 3: Multiply loop: Odd block loop
- "ldr d3, [x24], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
"ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
"ldr d1, [x22], #0x8\n"
- "trn1 v0.2d, v3.2d, v0.2d\n"
"trn1 v2.2d, v1.2d, v2.2d\n"
"tbnz %x[flags], #31, 77f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "sub x25, x25, #0x8\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x4e83a410 // smmla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n"
"ldr q7, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
+ "sub x25, x25, #0x8\n"
"cmp x25, #0x8\n"
"ldr q5, [x28, #0x40]\n"
"ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a414 // smmla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n"
"ldr q3, [x28, #0x60]\n"
- ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
"ldr q1, [x28, #0x70]\n"
- ".inst 0x4e88a414 // smmla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88a45c // smmla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
@@ -1120,24 +1120,24 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"83:" // Height 3: Multiply loop: unique 12: skip row sum
"ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e83a414 // smmla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45c // smmla v28.4s, v2.16b, v3.16b\n"
"ldr q5, [x28, #0x40]\n"
"ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
"ldr q3, [x28, #0x60]\n"
- ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
"ldr q1, [x28, #0x70]\n"
- ".inst 0x4e88a414 // smmla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88a45c // smmla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
+ "add x28, x28, #0x80\n"
".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n"
".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n"
@@ -1151,18 +1151,18 @@ void a64_hybrid_s8qa_mmla_4x16 (
"bne 68b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v24.2d, v24.2d, v28.2d\n"
"uzp1 v25.2d, v25.2d, v29.2d\n"
"uzp1 v26.2d, v26.2d, v30.2d\n"
@@ -1170,9 +1170,9 @@ void a64_hybrid_s8qa_mmla_4x16 (
"mov v31.16b, v0.16b\n"
"tbnz %x[flags], #31, 85f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "ld1r { v23.4s }, [x20]\n"
"neg v23.4s, v23.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
@@ -1192,10 +1192,10 @@ void a64_hybrid_s8qa_mmla_4x16 (
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v23.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
@@ -1233,18 +1233,18 @@ void a64_hybrid_s8qa_mmla_4x16 (
"and v29.16b, v21.16b, v0.16b\n"
"and v28.16b, v22.16b, v0.16b\n"
"and v23.16b, v16.16b, v0.16b\n"
- "and v3.16b, v17.16b, v0.16b\n"
"sshr v1.4s, v1.4s, #0x1f\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v23.4s, v23.4s, #0x1f\n"
- "and v2.16b, v18.16b, v0.16b\n"
"sqadd v31.4s, v31.4s, v1.4s\n"
"sqadd v20.4s, v20.4s, v30.4s\n"
"sqadd v21.4s, v21.4s, v29.4s\n"
"sqadd v22.4s, v22.4s, v28.4s\n"
"sqadd v16.4s, v16.4s, v23.4s\n"
+ "and v3.16b, v17.16b, v0.16b\n"
+ "and v2.16b, v18.16b, v0.16b\n"
"and v1.16b, v19.16b, v0.16b\n"
"and v30.16b, v24.16b, v0.16b\n"
"and v29.16b, v25.16b, v0.16b\n"
@@ -1265,21 +1265,21 @@ void a64_hybrid_s8qa_mmla_4x16 (
"sqadd v26.4s, v26.4s, v28.4s\n"
"sqadd v27.4s, v27.4s, v23.4s\n"
"86:" // Height 3: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v29.4s }, [x21]\n"
- "ld1r { v28.4s }, [x20]\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v23.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"srshl v24.4s, v24.4s, v0.4s\n"
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
@@ -1332,103 +1332,102 @@ void a64_hybrid_s8qa_mmla_4x16 (
"bge 95f\n"
"tbz x9, #3, 90f\n"
"str d31, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #2, 88f\n"
"st1 { v31.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x9, #1, 87f\n"
"st1 { v31.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 94f\n"
"87:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 94f\n"
"st1 { v31.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 94f\n"
"88:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 89f\n"
"st1 { v31.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 94f\n"
"89:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 94f\n"
"st1 { v31.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 94f\n"
"90:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 92f\n"
"str s31, [x27], #0x4\n"
- "str s16, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x9, #1, 91f\n"
"st1 { v31.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 94f\n"
"91:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 94f\n"
"st1 { v31.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 94f\n"
"92:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 93f\n"
"str h31, [x27], #0x2\n"
- "str h16, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 94f\n"
"93:" // Height 3: Partial direct writeback: partial_1_0
"str b31, [x27, #0x0]\n"
- "str b16, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"94:" // Height 3: Partial direct writeback: Done
"b 96f\n"
"95:" // Height 3: Full writeback
"str q31, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"96:" // Height 3: Writeback done
"subs x9, x9, #0x10\n"
"bgt 66b\n"
"b 130f\n"
"97:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x4\n"
"mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "movi v13.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
"movi v15.16b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"98:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -1450,8 +1449,8 @@ void a64_hybrid_s8qa_mmla_4x16 (
"mov x26, #0x0\n"
"100:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 101f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1489,28 +1488,28 @@ void a64_hybrid_s8qa_mmla_4x16 (
"103:" // Height 4: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
"ldr q4, [x28, #0x60]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
"ldr q5, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
"ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
"ldr q8, [x28, #0xa0]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
"ldr q9, [x28, #0xb0]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
@@ -1547,9 +1546,9 @@ void a64_hybrid_s8qa_mmla_4x16 (
"ldr q1, [x24, #0x0]\n"
"ldr q2, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q3, [x22, #0x0]\n"
"ldr q4, [x21, #0x0]\n"
- "cmp x25, #0x20\n"
"ldr q5, [x28, #0x0]\n"
"ldr q6, [x28, #0x10]\n"
"ldr q7, [x28, #0x20]\n"
@@ -1564,32 +1563,32 @@ void a64_hybrid_s8qa_mmla_4x16 (
"105:" // Height 4: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
"sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
"ldr q4, [x28, #0x60]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
"ldr q5, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
"ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
"ldr q8, [x28, #0xa0]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
"ldr q9, [x28, #0xb0]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n"
"ldr q4, [x28, #0xd0]\n"
@@ -1629,35 +1628,35 @@ void a64_hybrid_s8qa_mmla_4x16 (
"cmp x25, #0x8\n"
"blt 110f\n"
"108:" // Height 4: Multiply loop: Odd block loop
- "ldr d3, [x24], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
"ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
"ldr d2, [x22], #0x8\n"
"ldr d1, [x21], #0x8\n"
- "trn1 v0.2d, v3.2d, v0.2d\n"
"trn1 v2.2d, v2.2d, v1.2d\n"
"tbnz %x[flags], #31, 109f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"109:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "sub x25, x25, #0x8\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x4e83a410 // smmla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n"
"ldr q7, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
+ "sub x25, x25, #0x8\n"
"cmp x25, #0x8\n"
"ldr q5, [x28, #0x40]\n"
"ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a414 // smmla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n"
"ldr q3, [x28, #0x60]\n"
- ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
"ldr q1, [x28, #0x70]\n"
- ".inst 0x4e88a414 // smmla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88a45c // smmla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
@@ -1717,24 +1716,24 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"115:" // Height 4: Multiply loop: unique 16: skip row sum
"ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e83a414 // smmla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45c // smmla v28.4s, v2.16b, v3.16b\n"
"ldr q5, [x28, #0x40]\n"
"ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
"ldr q3, [x28, #0x60]\n"
- ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
"ldr q1, [x28, #0x70]\n"
- ".inst 0x4e88a414 // smmla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88a45c // smmla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
+ "add x28, x28, #0x80\n"
".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n"
".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n"
@@ -1748,22 +1747,22 @@ void a64_hybrid_s8qa_mmla_4x16 (
"bne 100b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "add x21, x22, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v23.2d, v24.2d, v28.2d\n"
"uzp2 v24.2d, v24.2d, v28.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v28.2d, v25.2d, v29.2d\n"
"uzp2 v25.2d, v25.2d, v29.2d\n"
"uzp1 v29.2d, v26.2d, v30.2d\n"
@@ -1773,9 +1772,9 @@ void a64_hybrid_s8qa_mmla_4x16 (
"mov v31.16b, v0.16b\n"
"tbnz %x[flags], #31, 117f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "ld1r { v0.4s }, [x20]\n"
"neg v0.4s, v0.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
@@ -1783,8 +1782,8 @@ void a64_hybrid_s8qa_mmla_4x16 (
"dup v13.4s, v13.s[0]\n"
"mul v11.4s, v11.4s, v0.4s\n"
"mul v12.4s, v12.4s, v0.4s\n"
- "mul v14.4s, v14.4s, v0.4s\n"
"mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"117:" // Height 4: skip row sum fixup
"ldr q0, [x10, #0x0]\n"
"ldr q4, [x10, #0x10]\n"
@@ -1797,10 +1796,10 @@ void a64_hybrid_s8qa_mmla_4x16 (
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v1.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v23.4s, v23.4s, v13.4s\n"
"add v28.4s, v28.4s, v13.4s\n"
@@ -1847,32 +1846,32 @@ void a64_hybrid_s8qa_mmla_4x16 (
"tbz %x[flags], #5, 118f\n"
"and v2.16b, v31.16b, v0.16b\n"
"and v1.16b, v20.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v2.4s\n"
+ "sqadd v20.4s, v20.4s, v1.4s\n"
"and v7.16b, v21.16b, v0.16b\n"
"and v6.16b, v22.16b, v0.16b\n"
"and v5.16b, v16.16b, v0.16b\n"
"and v4.16b, v17.16b, v0.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
"and v3.16b, v18.16b, v0.16b\n"
+ "and v2.16b, v19.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v31.4s, v31.4s, v2.4s\n"
- "sqadd v20.4s, v20.4s, v1.4s\n"
- "and v2.16b, v19.16b, v0.16b\n"
- "and v1.16b, v23.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v7.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v7.4s\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
"sqadd v16.4s, v16.4s, v5.4s\n"
"sqadd v17.4s, v17.4s, v4.4s\n"
"sqadd v18.4s, v18.4s, v3.4s\n"
- "and v7.16b, v28.16b, v0.16b\n"
"sqadd v19.4s, v19.4s, v2.4s\n"
"sqadd v23.4s, v23.4s, v1.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
"and v6.16b, v29.16b, v0.16b\n"
"and v5.16b, v30.16b, v0.16b\n"
"and v4.16b, v24.16b, v0.16b\n"
@@ -1894,21 +1893,21 @@ void a64_hybrid_s8qa_mmla_4x16 (
"sqadd v26.4s, v26.4s, v2.4s\n"
"sqadd v27.4s, v27.4s, v1.4s\n"
"118:" // Height 4: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v3.4s }, [x21]\n"
- "ld1r { v2.4s }, [x20]\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v1.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v28.4s, v28.4s, v0.4s\n"
"srshl v29.4s, v29.4s, v0.4s\n"
@@ -1980,100 +1979,100 @@ void a64_hybrid_s8qa_mmla_4x16 (
"bge 127f\n"
"tbz x9, #3, 122f\n"
"str d31, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
- "str d24, [x22], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
"tbz x9, #2, 120f\n"
"st1 { v31.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
- "st1 { v23.s }[2], [x23], #0x4\n"
- "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
"tbz x9, #1, 119f\n"
"st1 { v31.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
- "st1 { v23.h }[6], [x23], #0x2\n"
- "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v23.h }[6], [x22], #0x2\n"
+ "st1 { v24.h }[6], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x24]\n"
- "st1 { v23.b }[14], [x23]\n"
- "st1 { v24.b }[14], [x22]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v23.b }[14], [x22]\n"
+ "st1 { v24.b }[14], [x21]\n"
"b 126f\n"
"119:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 126f\n"
"st1 { v31.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x24]\n"
- "st1 { v23.b }[12], [x23]\n"
- "st1 { v24.b }[12], [x22]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v23.b }[12], [x22]\n"
+ "st1 { v24.b }[12], [x21]\n"
"b 126f\n"
"120:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 121f\n"
"st1 { v31.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
- "st1 { v23.h }[4], [x23], #0x2\n"
- "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v23.h }[4], [x22], #0x2\n"
+ "st1 { v24.h }[4], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x24]\n"
- "st1 { v23.b }[10], [x23]\n"
- "st1 { v24.b }[10], [x22]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v23.b }[10], [x22]\n"
+ "st1 { v24.b }[10], [x21]\n"
"b 126f\n"
"121:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 126f\n"
"st1 { v31.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x24]\n"
- "st1 { v23.b }[8], [x23]\n"
- "st1 { v24.b }[8], [x22]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v23.b }[8], [x22]\n"
+ "st1 { v24.b }[8], [x21]\n"
"b 126f\n"
"122:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 124f\n"
"str s31, [x27], #0x4\n"
- "str s16, [x24], #0x4\n"
- "str s23, [x23], #0x4\n"
- "str s24, [x22], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
"tbz x9, #1, 123f\n"
"st1 { v31.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
- "st1 { v23.h }[2], [x23], #0x2\n"
- "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v23.h }[2], [x22], #0x2\n"
+ "st1 { v24.h }[2], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x24]\n"
- "st1 { v23.b }[6], [x23]\n"
- "st1 { v24.b }[6], [x22]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v23.b }[6], [x22]\n"
+ "st1 { v24.b }[6], [x21]\n"
"b 126f\n"
"123:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 126f\n"
"st1 { v31.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x24]\n"
- "st1 { v23.b }[4], [x23]\n"
- "st1 { v24.b }[4], [x22]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v23.b }[4], [x22]\n"
+ "st1 { v24.b }[4], [x21]\n"
"b 126f\n"
"124:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 125f\n"
"str h31, [x27], #0x2\n"
- "str h16, [x24], #0x2\n"
- "str h23, [x23], #0x2\n"
- "str h24, [x22], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h23, [x22], #0x2\n"
+ "str h24, [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x24]\n"
- "st1 { v23.b }[2], [x23]\n"
- "st1 { v24.b }[2], [x22]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "st1 { v24.b }[2], [x21]\n"
"b 126f\n"
"125:" // Height 4: Partial direct writeback: partial_1_0
"str b31, [x27, #0x0]\n"
- "str b16, [x24, #0x0]\n"
- "str b23, [x23, #0x0]\n"
- "str b24, [x22, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b23, [x22, #0x0]\n"
+ "str b24, [x21, #0x0]\n"
"126:" // Height 4: Partial direct writeback: Done
"b 128f\n"
"127:" // Height 4: Full writeback
"str q31, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x24, #0x0]\n"
- "str q23, [x23, #0x0]\n"
- "str q24, [x22, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q24, [x21, #0x0]\n"
"128:" // Height 4: Writeback done
"subs x9, x9, #0x10\n"
"bgt 98b\n"
@@ -2089,8 +2088,8 @@ void a64_hybrid_s8qa_mmla_4x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"130:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
index 51057a6ffc..0ec35f7f76 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
@@ -71,7 +71,7 @@ public:
return false;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
index 559b492871..f8f6579d8c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
@@ -47,18 +47,18 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -97,9 +97,9 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x6, %x[col_bias]\n"
"ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -109,8 +109,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -126,118 +126,118 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"blt 9f\n"
"ldr q0, [x12, #0x0]\n"
"cmp x13, #0x20\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d17, [x16, #0x20]\n"
- "ldr x21, [x16, #0x28]\n"
+ "ldr d17, [x15, #0x20]\n"
+ "ldr x20, [x15, #0x28]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr d16, [x16, #0x30]\n"
- "add x12, x12, #0x10\n"
- "ldr x20, [x16, #0x38]\n"
- "sub x13, x13, #0x10\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x16, #0x48]\n"
- "ldr x22, [x12, #0x8]\n"
- "cmp x13, #0x20\n"
+ "ldr d16, [x15, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x38]\n"
"mov v16.d[1], x20\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
- "ldr d17, [x16, #0x40]\n"
+ "ldr d17, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr d16, [x16, #0x50]\n"
- "ldr x20, [x16, #0x58]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x16, #0x68]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr d16, [x15, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x58]\n"
"mov v16.d[1], x20\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
- "ldr d17, [x16, #0x60]\n"
+ "ldr d17, [x15, #0x60]\n"
+ "ldr x20, [x15, #0x68]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr d16, [x16, #0x70]\n"
- "ldr x20, [x16, #0x78]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x16, #0x88]\n"
+ "ldr d16, [x15, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x78]\n"
"mov v16.d[1], x20\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
- "ldr d17, [x16, #0x80]\n"
+ "ldr d17, [x15, #0x80]\n"
+ "ldr x20, [x15, #0x88]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr d16, [x16, #0x90]\n"
- "ldr x20, [x16, #0x98]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x16, #0xa8]\n"
+ "ldr d16, [x15, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x98]\n"
"mov v16.d[1], x20\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
- "ldr d17, [x16, #0xa0]\n"
+ "ldr d17, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xa8]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr d16, [x16, #0xb0]\n"
- "ldr x20, [x16, #0xb8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x16, #0xc8]\n"
+ "ldr d16, [x15, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xb8]\n"
"mov v16.d[1], x20\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
- "ldr d17, [x16, #0xc0]\n"
+ "ldr d17, [x15, #0xc0]\n"
+ "ldr x20, [x15, #0xc8]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr d16, [x16, #0xd0]\n"
- "ldr x20, [x16, #0xd8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x16, #0xe8]\n"
+ "ldr d16, [x15, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xd8]\n"
"mov v16.d[1], x20\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
- "ldr d17, [x16, #0xe0]\n"
+ "ldr d17, [x15, #0xe0]\n"
+ "ldr x20, [x15, #0xe8]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr d16, [x16, #0xf0]\n"
- "ldr x20, [x16, #0xf8]\n"
- "add x16, x16, #0x100\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x16, #0x8]\n"
+ "ldr d16, [x15, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xf8]\n"
"mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
- "ldr d6, [x16, #0x0]\n"
+ "ldr d6, [x15, #0x0]\n"
+ "ldr x20, [x15, #0x8]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
- "ldr d7, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "mov v6.d[1], x21\n"
- "mov v0.d[1], x22\n"
+ "sub x13, x13, #0x10\n"
+ "ldr d7, [x15, #0x10]\n"
+ "cmp x13, #0x20\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v0.d[1], x21\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q17, [x16, #0x20]\n"
+ "ldr q17, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q16, [x16, #0x30]\n"
- "add x12, x12, #0x10\n"
- "sub x13, x13, #0x10\n"
+ "ldr q16, [x15, #0x30]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x16, #0x40]\n"
+ "ldr q17, [x15, #0x40]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x16, #0x50]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr q16, [x15, #0x50]\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x16, #0x60]\n"
+ "ldr q17, [x15, #0x60]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x16, #0x70]\n"
+ "ldr q16, [x15, #0x70]\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x16, #0x80]\n"
+ "ldr q17, [x15, #0x80]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x16, #0x90]\n"
+ "ldr q16, [x15, #0x90]\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x16, #0xa0]\n"
+ "ldr q17, [x15, #0xa0]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x16, #0xb0]\n"
+ "ldr q16, [x15, #0xb0]\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x16, #0xc0]\n"
+ "ldr q17, [x15, #0xc0]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x16, #0xd0]\n"
+ "ldr q16, [x15, #0xd0]\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
- "ldr q17, [x16, #0xe0]\n"
+ "ldr q17, [x15, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr q16, [x16, #0xf0]\n"
- "add x16, x16, #0x100\n"
+ "ldr q16, [x15, #0xf0]\n"
+ "add x12, x12, #0x10\n"
+ "sub x13, x13, #0x10\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "add x15, x15, #0x100\n"
"9:" // Height 1: Multiply loop: Main loop skip
"cbz x13, 14f\n"
"cmp x13, #0x4\n"
@@ -245,16 +245,16 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"10:" // Height 1: Multiply loop: Odd block loop
"ldr s18, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr q17, [x16, #0x0]\n"
- "cmp x13, #0x4\n"
- "ldr q16, [x16, #0x10]\n"
- ".inst 0x4f92e228 // sdot v8.4s, v17.16b, v18.4b[0]\n"
- "ldr q17, [x16, #0x20]\n"
+ "ldr q16, [x15, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
- "ldr q16, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "ldr q17, [x15, #0x20]\n"
+ "cmp x13, #0x4\n"
+ "ldr q16, [x15, #0x30]\n"
".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
+ "add x15, x15, #0x40\n"
"bge 10b\n"
"11:" // Height 1: Multiply loop: Skip odd blocks
"cbz x13, 14f\n"
@@ -266,30 +266,30 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
"13:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q17, [x16, #0x0]\n"
- "ldr q16, [x16, #0x10]\n"
- ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x16, #0x20]\n"
+ "ldr q16, [x15, #0x0]\n"
+ ".inst 0x4f80e208 // sdot v8.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x20]\n"
+ ".inst 0x4f80e20a // sdot v10.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x30]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "add x15, x15, #0x40\n"
"14:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
"bne 4b\n"
- "ldr q19, [x6, #0x0]\n"
- "ldr q18, [x6, #0x10]\n"
- "ldr q17, [x6, #0x20]\n"
+ "ldr q16, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "ldr q16, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v16.4s\n"
"ldr q16, [x6, #0x30]\n"
- "add v8.4s, v8.4s, v19.4s\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add v9.4s, v9.4s, v18.4s\n"
- "add v10.4s, v10.4s, v17.4s\n"
- "add x6, x6, #0x40\n"
"add v11.4s, v11.4s, v16.4s\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "add x6, x6, #0x40\n"
"tbz %x[flags], #4, 15f\n"
"ldr q0, [x8, #0x0]\n"
"ldr q4, [x7, #0x0]\n"
@@ -303,9 +303,9 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 16f\n"
"15:" // Height 1: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -337,87 +337,87 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v16.4s\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "add v10.4s, v10.4s, v16.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v17.4s }, [x21]\n"
- "cmp x17, #0x10\n"
"ld1r { v16.4s }, [x20]\n"
- "add v8.4s, v8.4s, v18.4s\n"
- "add v9.4s, v9.4s, v18.4s\n"
- "add v10.4s, v10.4s, v18.4s\n"
- "add v11.4s, v11.4s, v18.4s\n"
- "smin v8.4s, v8.4s, v17.4s\n"
- "smin v9.4s, v9.4s, v17.4s\n"
- "smin v10.4s, v10.4s, v17.4s\n"
- "smin v11.4s, v11.4s, v17.4s\n"
"smax v8.4s, v8.4s, v16.4s\n"
"smax v9.4s, v9.4s, v16.4s\n"
"smax v10.4s, v10.4s, v16.4s\n"
"smax v11.4s, v11.4s, v16.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
"uzp1 v16.8h, v10.8h, v11.8h\n"
+ "cmp x16, #0x10\n"
"uzp1 v8.16b, v8.16b, v16.16b\n"
"bge 26f\n"
- "tbz x17, #3, 21f\n"
- "str d8, [x15], #0x8\n"
- "tbz x17, #2, 19f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
- "tbz x17, #1, 18f\n"
- "st1 { v8.h }[6], [x15], #0x2\n"
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[14], [x15]\n"
+ "tbz x16, #3, 21f\n"
+ "str d8, [x17], #0x8\n"
+ "tbz x16, #2, 19f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "tbz x16, #1, 18f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[14], [x17]\n"
"b 25f\n"
"18:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[12], [x15]\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[12], [x17]\n"
"b 25f\n"
"19:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x17, #1, 20f\n"
- "st1 { v8.h }[4], [x15], #0x2\n"
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[10], [x15]\n"
+ "tbz x16, #1, 20f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[10], [x17]\n"
"b 25f\n"
"20:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[8], [x15]\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[8], [x17]\n"
"b 25f\n"
"21:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x17, #2, 23f\n"
- "str s8, [x15], #0x4\n"
- "tbz x17, #1, 22f\n"
- "st1 { v8.h }[2], [x15], #0x2\n"
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[6], [x15]\n"
+ "tbz x16, #2, 23f\n"
+ "str s8, [x17], #0x4\n"
+ "tbz x16, #1, 22f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[6], [x17]\n"
"b 25f\n"
"22:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[4], [x15]\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[4], [x17]\n"
"b 25f\n"
"23:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x17, #1, 24f\n"
- "str h8, [x15], #0x2\n"
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[2], [x15]\n"
+ "tbz x16, #1, 24f\n"
+ "str h8, [x17], #0x2\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[2], [x17]\n"
"b 25f\n"
"24:" // Height 1: Partial direct writeback: partial_1_0
- "str b8, [x15, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
"25:" // Height 1: Partial direct writeback: Done
"b 27f\n"
"26:" // Height 1: Full writeback
- "str q8, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
"27:" // Height 1: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 2b\n"
"b 164f\n"
"28:" // Height 2
"mov x6, %x[col_bias]\n"
"ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"29:" // Height 2: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -431,8 +431,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"31:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 32f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -452,154 +452,154 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr q0, [x12, #0x0]\n"
"cmp x13, #0x20\n"
"ldr q1, [x11, #0x0]\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 35f\n"
"34:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x16, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr d17, [x16, #0x20]\n"
+ "ldr d17, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr x21, [x16, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d16, [x16, #0x30]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x16, #0x48]\n"
- "add x12, x12, #0x10\n"
- "add x11, x11, #0x10\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x15, #0x30]\n"
+ "mov v17.d[1], x21\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
- "ldr d17, [x16, #0x40]\n"
+ "ldr d17, [x15, #0x40]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr x21, [x16, #0x58]\n"
+ "ldr x20, [x15, #0x48]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
- "ldr d16, [x16, #0x50]\n"
+ "ldr d16, [x15, #0x50]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x16, #0x68]\n"
- "ldr x23, [x12, #0x8]\n"
- "sub x13, x13, #0x10\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr x21, [x15, #0x68]\n"
".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
- "ldr d17, [x16, #0x60]\n"
+ "ldr d17, [x15, #0x60]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr x21, [x16, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
- "ldr d16, [x16, #0x70]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x16, #0x88]\n"
- "ldr x22, [x11, #0x8]\n"
- "cmp x13, #0x20\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x15, #0x70]\n"
+ "mov v17.d[1], x21\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
- "ldr d17, [x16, #0x80]\n"
+ "ldr d17, [x15, #0x80]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr x21, [x16, #0x98]\n"
+ "ldr x20, [x15, #0x88]\n"
".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
- "ldr d16, [x16, #0x90]\n"
+ "ldr d16, [x15, #0x90]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x16, #0xa8]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x15, #0x98]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr x21, [x15, #0xa8]\n"
".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
- "ldr d17, [x16, #0xa0]\n"
+ "ldr d17, [x15, #0xa0]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr x21, [x16, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
- "ldr d16, [x16, #0xb0]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x16, #0xc8]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x15, #0xb0]\n"
+ "mov v17.d[1], x21\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
- "ldr d17, [x16, #0xc0]\n"
+ "ldr d17, [x15, #0xc0]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr x21, [x16, #0xd8]\n"
+ "ldr x20, [x15, #0xc8]\n"
".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
- "ldr d16, [x16, #0xd0]\n"
+ "ldr d16, [x15, #0xd0]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x16, #0xe8]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0xe8]\n"
".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
- "ldr d17, [x16, #0xe0]\n"
+ "ldr d17, [x15, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr x21, [x16, #0xf8]\n"
+ "ldr x20, [x15, #0xf8]\n"
".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
- "ldr d16, [x16, #0xf0]\n"
- "mov v17.d[1], x20\n"
- "add x16, x16, #0x100\n"
- "ldr x20, [x16, #0x8]\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x15, #0xf0]\n"
+ "mov v17.d[1], x21\n"
+ "add x12, x12, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x11, x11, #0x10\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
- "ldr d6, [x16, #0x0]\n"
+ "ldr d6, [x15, #0x0]\n"
+ "ldr x21, [x15, #0x8]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"ldr d1, [x11, #0x0]\n"
- "ldr d7, [x16, #0x10]\n"
- "mov v6.d[1], x20\n"
- "ldr x20, [x16, #0x18]\n"
- "mov v0.d[1], x23\n"
- "mov v1.d[1], x22\n"
+ "sub x13, x13, #0x10\n"
+ "ldr d7, [x15, #0x10]\n"
+ "cmp x13, #0x20\n"
+ "ldr x20, [x12, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v1.d[1], x21\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"bge 34b\n"
"35:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q17, [x16, #0x20]\n"
+ "ldr q17, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x11, x11, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q16, [x16, #0x30]\n"
+ "ldr q16, [x15, #0x30]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
"sub x13, x13, #0x10\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x16, #0x40]\n"
+ "ldr q17, [x15, #0x40]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x16, #0x50]\n"
+ "ldr q16, [x15, #0x50]\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
"prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x16, #0x60]\n"
+ "ldr q17, [x15, #0x60]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x16, #0x70]\n"
+ "ldr q16, [x15, #0x70]\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x16, #0x80]\n"
+ "ldr q17, [x15, #0x80]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x16, #0x90]\n"
+ "ldr q16, [x15, #0x90]\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x16, #0xa0]\n"
+ "ldr q17, [x15, #0xa0]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x16, #0xb0]\n"
+ "ldr q16, [x15, #0xb0]\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x16, #0xc0]\n"
+ "ldr q17, [x15, #0xc0]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x16, #0xd0]\n"
+ "ldr q16, [x15, #0xd0]\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
- "ldr q17, [x16, #0xe0]\n"
+ "ldr q17, [x15, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
- "ldr q16, [x16, #0xf0]\n"
+ "ldr q16, [x15, #0xf0]\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
- "add x16, x16, #0x100\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
@@ -612,16 +612,16 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sub x13, x13, #0x4\n"
"ldr s18, [x11], #0x4\n"
"cmp x13, #0x4\n"
- "ldr q17, [x16, #0x0]\n"
- "ldr q16, [x16, #0x10]\n"
+ "ldr q17, [x15, #0x0]\n"
".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
- "ldr q17, [x16, #0x20]\n"
+ "ldr q17, [x15, #0x20]\n"
".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
- "ldr q16, [x16, #0x30]\n"
+ "ldr q16, [x15, #0x30]\n"
".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
@@ -639,16 +639,16 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr b0, [x12, #0x0]\n"
"ldr b1, [x11, #0x0]\n"
"40:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q17, [x16, #0x0]\n"
- "ldr q16, [x16, #0x10]\n"
+ "ldr q17, [x15, #0x0]\n"
".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x16, #0x20]\n"
+ "ldr q17, [x15, #0x20]\n"
".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x16, #0x30]\n"
+ "ldr q16, [x15, #0x30]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
@@ -658,19 +658,19 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp x14, x20\n"
"bne 31b\n"
"ldr q19, [x6, #0x0]\n"
- "ldr q18, [x6, #0x10]\n"
- "ldr q17, [x6, #0x20]\n"
- "ldr q16, [x6, #0x30]\n"
"add v8.4s, v8.4s, v19.4s\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr q18, [x6, #0x10]\n"
"add v9.4s, v9.4s, v18.4s\n"
- "prfm pstl1keep, [x15, #0x0]\n"
+ "ldr q17, [x6, #0x20]\n"
"add v10.4s, v10.4s, v17.4s\n"
+ "ldr q16, [x6, #0x30]\n"
"add v11.4s, v11.4s, v16.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x17, x20\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
"add v12.4s, v12.4s, v19.4s\n"
- "add x26, x15, x20\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"add v13.4s, v13.4s, v18.4s\n"
- "prfm pstl1keep, [x26, #0x0]\n"
"add v14.4s, v14.4s, v17.4s\n"
"add v15.4s, v15.4s, v16.4s\n"
"add x6, x6, #0x40\n"
@@ -687,9 +687,9 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 43f\n"
"42:" // Height 2: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -741,28 +741,27 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v16.4s\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "add v10.4s, v10.4s, v16.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "add v12.4s, v12.4s, v16.4s\n"
+ "add v13.4s, v13.4s, v16.4s\n"
+ "add v14.4s, v14.4s, v16.4s\n"
+ "add v15.4s, v15.4s, v16.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v17.4s }, [x21]\n"
- "cmp x17, #0x10\n"
"ld1r { v16.4s }, [x20]\n"
- "add v8.4s, v8.4s, v18.4s\n"
- "add v9.4s, v9.4s, v18.4s\n"
- "add v10.4s, v10.4s, v18.4s\n"
- "add v11.4s, v11.4s, v18.4s\n"
- "add v12.4s, v12.4s, v18.4s\n"
- "add v13.4s, v13.4s, v18.4s\n"
- "add v14.4s, v14.4s, v18.4s\n"
- "add v15.4s, v15.4s, v18.4s\n"
- "smin v8.4s, v8.4s, v17.4s\n"
- "smin v9.4s, v9.4s, v17.4s\n"
- "smin v10.4s, v10.4s, v17.4s\n"
- "smin v11.4s, v11.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
"smax v8.4s, v8.4s, v16.4s\n"
"smax v9.4s, v9.4s, v16.4s\n"
"smax v10.4s, v10.4s, v16.4s\n"
@@ -775,84 +774,85 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"uzp1 v17.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
"uzp1 v16.8h, v14.8h, v15.8h\n"
+ "cmp x16, #0x10\n"
"uzp1 v8.16b, v8.16b, v17.16b\n"
"uzp1 v12.16b, v12.16b, v16.16b\n"
"bge 53f\n"
- "tbz x17, #3, 48f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x26], #0x8\n"
- "tbz x17, #2, 46f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "tbz x17, #1, 45f\n"
- "st1 { v8.h }[6], [x15], #0x2\n"
- "st1 { v12.h }[6], [x26], #0x2\n"
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[14], [x15]\n"
- "st1 { v12.b }[14], [x26]\n"
+ "tbz x16, #3, 48f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "tbz x16, #2, 46f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "tbz x16, #1, 45f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[14], [x17]\n"
+ "st1 { v12.b }[14], [x25]\n"
"b 52f\n"
"45:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[12], [x15]\n"
- "st1 { v12.b }[12], [x26]\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[12], [x17]\n"
+ "st1 { v12.b }[12], [x25]\n"
"b 52f\n"
"46:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x17, #1, 47f\n"
- "st1 { v8.h }[4], [x15], #0x2\n"
- "st1 { v12.h }[4], [x26], #0x2\n"
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[10], [x15]\n"
- "st1 { v12.b }[10], [x26]\n"
+ "tbz x16, #1, 47f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[10], [x17]\n"
+ "st1 { v12.b }[10], [x25]\n"
"b 52f\n"
"47:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[8], [x15]\n"
- "st1 { v12.b }[8], [x26]\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[8], [x17]\n"
+ "st1 { v12.b }[8], [x25]\n"
"b 52f\n"
"48:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x17, #2, 50f\n"
- "str s8, [x15], #0x4\n"
- "str s12, [x26], #0x4\n"
- "tbz x17, #1, 49f\n"
- "st1 { v8.h }[2], [x15], #0x2\n"
- "st1 { v12.h }[2], [x26], #0x2\n"
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[6], [x15]\n"
- "st1 { v12.b }[6], [x26]\n"
+ "tbz x16, #2, 50f\n"
+ "str s8, [x17], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "tbz x16, #1, 49f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[6], [x17]\n"
+ "st1 { v12.b }[6], [x25]\n"
"b 52f\n"
"49:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[4], [x15]\n"
- "st1 { v12.b }[4], [x26]\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[4], [x17]\n"
+ "st1 { v12.b }[4], [x25]\n"
"b 52f\n"
"50:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x17, #1, 51f\n"
- "str h8, [x15], #0x2\n"
- "str h12, [x26], #0x2\n"
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[2], [x15]\n"
- "st1 { v12.b }[2], [x26]\n"
+ "tbz x16, #1, 51f\n"
+ "str h8, [x17], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[2], [x17]\n"
+ "st1 { v12.b }[2], [x25]\n"
"b 52f\n"
"51:" // Height 2: Partial direct writeback: partial_1_0
- "str b8, [x15, #0x0]\n"
- "str b12, [x26, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
"52:" // Height 2: Partial direct writeback: Done
"b 54f\n"
"53:" // Height 2: Full writeback
- "str q8, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
- "str q12, [x26, #0x0]\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "str q12, [x25, #0x0]\n"
"54:" // Height 2: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 29b\n"
"b 164f\n"
"55:" // Height 3
"mov x6, %x[col_bias]\n"
"ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"56:" // Height 3: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -870,8 +870,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"58:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -895,123 +895,123 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp x13, #0x20\n"
"ldr q1, [x11, #0x0]\n"
"ldr q2, [x10, #0x0]\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 62f\n"
"61:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x16, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr d21, [x16, #0x20]\n"
+ "ldr d21, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x11, x11, #0x10\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr d20, [x16, #0x30]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x16, #0x48]\n"
- "add x10, x10, #0x10\n"
- "ldr x24, [x12, #0x8]\n"
+ "ldr d20, [x15, #0x30]\n"
"mov v20.d[1], x20\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
- "ldr x20, [x16, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
- "ldr d21, [x16, #0x40]\n"
+ "ldr d21, [x15, #0x40]\n"
".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
- "ldr x23, [x11, #0x8]\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
- "ldr x22, [x10, #0x8]\n"
+ "ldr x21, [x15, #0x68]\n"
".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
- "ldr d20, [x16, #0x50]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x16, #0x68]\n"
- "sub x13, x13, #0x10\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr d20, [x15, #0x50]\n"
"mov v20.d[1], x20\n"
".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
- "ldr x20, [x16, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
- "ldr d21, [x16, #0x60]\n"
+ "ldr d21, [x15, #0x60]\n"
".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
- "cmp x13, #0x20\n"
+ "mov v21.d[1], x21\n"
".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x21, [x15, #0x88]\n"
".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
- "ldr d20, [x16, #0x70]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x16, #0x88]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr d20, [x15, #0x70]\n"
"mov v20.d[1], x20\n"
".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
- "ldr x20, [x16, #0x98]\n"
+ "ldr x20, [x15, #0x98]\n"
".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
- "ldr d21, [x16, #0x80]\n"
+ "ldr d21, [x15, #0x80]\n"
".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
- "ldr d20, [x16, #0x90]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x16, #0xa8]\n"
+ "ldr d20, [x15, #0x90]\n"
"mov v20.d[1], x20\n"
".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
- "ldr x20, [x16, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
- "ldr d21, [x16, #0xa0]\n"
+ "ldr d21, [x15, #0xa0]\n"
".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xc8]\n"
".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
- "ldr d20, [x16, #0xb0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x16, #0xc8]\n"
+ "ldr d20, [x15, #0xb0]\n"
"mov v20.d[1], x20\n"
".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
- "ldr x20, [x16, #0xd8]\n"
+ "ldr x20, [x15, #0xd8]\n"
".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
- "ldr d21, [x16, #0xc0]\n"
+ "ldr d21, [x15, #0xc0]\n"
".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xe8]\n"
".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
- "ldr d20, [x16, #0xd0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x16, #0xe8]\n"
+ "ldr d20, [x15, #0xd0]\n"
"mov v20.d[1], x20\n"
".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
- "ldr x20, [x16, #0xf8]\n"
+ "ldr x20, [x15, #0xf8]\n"
".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
- "ldr d21, [x16, #0xe0]\n"
+ "ldr d21, [x15, #0xe0]\n"
".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ "mov v21.d[1], x21\n"
".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ "add x12, x12, #0x10\n"
".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
- "ldr d20, [x16, #0xf0]\n"
- "mov v21.d[1], x21\n"
- "add x16, x16, #0x100\n"
- "ldr x21, [x16, #0x8]\n"
+ "ldr d20, [x15, #0xf0]\n"
"mov v20.d[1], x20\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ "ldr x20, [x15, #0x8]\n"
".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
- "ldr x20, [x16, #0x18]\n"
+ "ldr x23, [x12, #0x8]\n"
".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
- "ldr d6, [x16, #0x0]\n"
+ "ldr d6, [x15, #0x0]\n"
".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
"ldr d1, [x11, #0x0]\n"
+ "ldr x22, [x11, #0x8]\n"
".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"ldr d2, [x10, #0x0]\n"
- "ldr d7, [x16, #0x10]\n"
- "mov v6.d[1], x21\n"
- "mov v0.d[1], x24\n"
- "mov v1.d[1], x23\n"
- "mov v2.d[1], x22\n"
+ "sub x13, x13, #0x10\n"
+ "ldr d7, [x15, #0x10]\n"
+ "cmp x13, #0x20\n"
+ "ldr x21, [x10, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v0.d[1], x23\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"mov v7.d[1], x20\n"
"bge 61b\n"
"62:" // Height 3: Multiply loop: Single iteration only
@@ -1020,66 +1020,66 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
"add x11, x11, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q21, [x16, #0x20]\n"
+ "ldr q21, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x10, x10, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"sub x13, x13, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q20, [x16, #0x30]\n"
+ "ldr q20, [x15, #0x30]\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x16, #0x40]\n"
+ "ldr q21, [x15, #0x40]\n"
".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
"prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x16, #0x50]\n"
+ "ldr q20, [x15, #0x50]\n"
".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x16, #0x60]\n"
+ "ldr q21, [x15, #0x60]\n"
".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x16, #0x70]\n"
+ "ldr q20, [x15, #0x70]\n"
".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x16, #0x80]\n"
+ "ldr q21, [x15, #0x80]\n"
".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x16, #0x90]\n"
+ "ldr q20, [x15, #0x90]\n"
".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x16, #0xa0]\n"
+ "ldr q21, [x15, #0xa0]\n"
".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x16, #0xb0]\n"
+ "ldr q20, [x15, #0xb0]\n"
".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x16, #0xc0]\n"
+ "ldr q21, [x15, #0xc0]\n"
".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x16, #0xd0]\n"
+ "ldr q20, [x15, #0xd0]\n"
".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
- "ldr q21, [x16, #0xe0]\n"
+ "ldr q21, [x15, #0xe0]\n"
".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
- "ldr q20, [x16, #0xf0]\n"
+ "ldr q20, [x15, #0xf0]\n"
".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
- "add x16, x16, #0x100\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
@@ -1095,18 +1095,18 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr s23, [x11], #0x4\n"
"cmp x13, #0x4\n"
"ldr s22, [x10], #0x4\n"
- "ldr q21, [x16, #0x0]\n"
- "ldr q20, [x16, #0x10]\n"
+ "ldr q21, [x15, #0x0]\n"
".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
+ "ldr q20, [x15, #0x10]\n"
".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
- "ldr q21, [x16, #0x20]\n"
+ "ldr q21, [x15, #0x20]\n"
".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
- "ldr q20, [x16, #0x30]\n"
+ "ldr q20, [x15, #0x30]\n"
".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n"
@@ -1129,18 +1129,18 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr b1, [x11, #0x0]\n"
"ldr b2, [x10, #0x0]\n"
"67:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q21, [x16, #0x0]\n"
- "ldr q20, [x16, #0x10]\n"
+ "ldr q21, [x15, #0x0]\n"
".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x15, #0x10]\n"
".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x16, #0x20]\n"
+ "ldr q21, [x15, #0x20]\n"
".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x16, #0x30]\n"
+ "ldr q20, [x15, #0x30]\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
@@ -1152,21 +1152,21 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp x14, x20\n"
"bne 58b\n"
"ldr q23, [x6, #0x0]\n"
- "ldr q22, [x6, #0x10]\n"
- "ldr q21, [x6, #0x20]\n"
- "ldr q20, [x6, #0x30]\n"
"add v8.4s, v8.4s, v23.4s\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr q22, [x6, #0x10]\n"
"add v9.4s, v9.4s, v22.4s\n"
- "prfm pstl1keep, [x15, #0x0]\n"
+ "ldr q21, [x6, #0x20]\n"
"add v10.4s, v10.4s, v21.4s\n"
+ "ldr q20, [x6, #0x30]\n"
"add v11.4s, v11.4s, v20.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x17, x20\n"
+ "add x24, x25, x20\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"add v12.4s, v12.4s, v23.4s\n"
- "add x26, x15, x20\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"add v13.4s, v13.4s, v22.4s\n"
- "add x25, x26, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"add v14.4s, v14.4s, v21.4s\n"
"add v15.4s, v15.4s, v20.4s\n"
"add v16.4s, v16.4s, v23.4s\n"
@@ -1187,9 +1187,9 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 70f\n"
"69:" // Height 3: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -1261,36 +1261,35 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v18.4s, v18.4s, v2.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v22.4s }, [x20]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v20.4s\n"
+ "add v9.4s, v9.4s, v20.4s\n"
+ "add v10.4s, v10.4s, v20.4s\n"
+ "add v11.4s, v11.4s, v20.4s\n"
+ "add v12.4s, v12.4s, v20.4s\n"
+ "add v13.4s, v13.4s, v20.4s\n"
+ "add v14.4s, v14.4s, v20.4s\n"
+ "add v15.4s, v15.4s, v20.4s\n"
+ "add v16.4s, v16.4s, v20.4s\n"
+ "add v17.4s, v17.4s, v20.4s\n"
+ "add v18.4s, v18.4s, v20.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v20.4s\n"
+ "smin v9.4s, v9.4s, v20.4s\n"
+ "smin v10.4s, v10.4s, v20.4s\n"
+ "smin v11.4s, v11.4s, v20.4s\n"
+ "smin v12.4s, v12.4s, v20.4s\n"
+ "smin v13.4s, v13.4s, v20.4s\n"
+ "smin v14.4s, v14.4s, v20.4s\n"
+ "smin v15.4s, v15.4s, v20.4s\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v21.4s }, [x21]\n"
- "cmp x17, #0x10\n"
"ld1r { v20.4s }, [x20]\n"
- "add v8.4s, v8.4s, v22.4s\n"
- "add v9.4s, v9.4s, v22.4s\n"
- "add v10.4s, v10.4s, v22.4s\n"
- "add v11.4s, v11.4s, v22.4s\n"
- "add v12.4s, v12.4s, v22.4s\n"
- "add v13.4s, v13.4s, v22.4s\n"
- "add v14.4s, v14.4s, v22.4s\n"
- "add v15.4s, v15.4s, v22.4s\n"
- "add v16.4s, v16.4s, v22.4s\n"
- "add v17.4s, v17.4s, v22.4s\n"
- "add v18.4s, v18.4s, v22.4s\n"
- "add v19.4s, v19.4s, v22.4s\n"
- "smin v8.4s, v8.4s, v21.4s\n"
- "smin v9.4s, v9.4s, v21.4s\n"
- "smin v10.4s, v10.4s, v21.4s\n"
- "smin v11.4s, v11.4s, v21.4s\n"
- "smin v12.4s, v12.4s, v21.4s\n"
- "smin v13.4s, v13.4s, v21.4s\n"
- "smin v14.4s, v14.4s, v21.4s\n"
- "smin v15.4s, v15.4s, v21.4s\n"
- "smin v16.4s, v16.4s, v21.4s\n"
- "smin v17.4s, v17.4s, v21.4s\n"
- "smin v18.4s, v18.4s, v21.4s\n"
- "smin v19.4s, v19.4s, v21.4s\n"
"smax v8.4s, v8.4s, v20.4s\n"
"smax v9.4s, v9.4s, v20.4s\n"
"smax v10.4s, v10.4s, v20.4s\n"
@@ -1309,101 +1308,102 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"uzp1 v20.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
+ "cmp x16, #0x10\n"
"uzp1 v8.16b, v8.16b, v21.16b\n"
"uzp1 v12.16b, v12.16b, v20.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 80f\n"
- "tbz x17, #3, 75f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "tbz x17, #2, 73f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "tbz x17, #1, 72f\n"
- "st1 { v8.h }[6], [x15], #0x2\n"
- "st1 { v12.h }[6], [x26], #0x2\n"
- "st1 { v16.h }[6], [x25], #0x2\n"
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[14], [x15]\n"
- "st1 { v12.b }[14], [x26]\n"
- "st1 { v16.b }[14], [x25]\n"
+ "tbz x16, #3, 75f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x16, #2, 73f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "tbz x16, #1, 72f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[14], [x17]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
"b 79f\n"
"72:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[12], [x15]\n"
- "st1 { v12.b }[12], [x26]\n"
- "st1 { v16.b }[12], [x25]\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[12], [x17]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
"b 79f\n"
"73:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x17, #1, 74f\n"
- "st1 { v8.h }[4], [x15], #0x2\n"
- "st1 { v12.h }[4], [x26], #0x2\n"
- "st1 { v16.h }[4], [x25], #0x2\n"
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[10], [x15]\n"
- "st1 { v12.b }[10], [x26]\n"
- "st1 { v16.b }[10], [x25]\n"
+ "tbz x16, #1, 74f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[10], [x17]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
"b 79f\n"
"74:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[8], [x15]\n"
- "st1 { v12.b }[8], [x26]\n"
- "st1 { v16.b }[8], [x25]\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[8], [x17]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
"b 79f\n"
"75:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x17, #2, 77f\n"
- "str s8, [x15], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "tbz x17, #1, 76f\n"
- "st1 { v8.h }[2], [x15], #0x2\n"
- "st1 { v12.h }[2], [x26], #0x2\n"
- "st1 { v16.h }[2], [x25], #0x2\n"
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[6], [x15]\n"
- "st1 { v12.b }[6], [x26]\n"
- "st1 { v16.b }[6], [x25]\n"
+ "tbz x16, #2, 77f\n"
+ "str s8, [x17], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "tbz x16, #1, 76f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[6], [x17]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
"b 79f\n"
"76:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[4], [x15]\n"
- "st1 { v12.b }[4], [x26]\n"
- "st1 { v16.b }[4], [x25]\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[4], [x17]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
"b 79f\n"
"77:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x17, #1, 78f\n"
- "str h8, [x15], #0x2\n"
- "str h12, [x26], #0x2\n"
- "str h16, [x25], #0x2\n"
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[2], [x15]\n"
- "st1 { v12.b }[2], [x26]\n"
- "st1 { v16.b }[2], [x25]\n"
+ "tbz x16, #1, 78f\n"
+ "str h8, [x17], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[2], [x17]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
"b 79f\n"
"78:" // Height 3: Partial direct writeback: partial_1_0
- "str b8, [x15, #0x0]\n"
- "str b12, [x26, #0x0]\n"
- "str b16, [x25, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
"79:" // Height 3: Partial direct writeback: Done
"b 81f\n"
"80:" // Height 3: Full writeback
- "str q8, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
- "str q12, [x26, #0x0]\n"
- "str q16, [x25, #0x0]\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
"81:" // Height 3: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 56b\n"
"b 164f\n"
"82:" // Height 4
"mov x6, %x[col_bias]\n"
"ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"83:" // Height 4: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -1425,8 +1425,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"85:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 86f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1454,129 +1454,130 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr q1, [x11, #0x0]\n"
"ldr q2, [x10, #0x0]\n"
"ldr q3, [x9, #0x0]\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 89f\n"
"88:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x16, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x21, [x16, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr d25, [x16, #0x20]\n"
+ "ldr d25, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x11, x11, #0x10\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "mov v25.d[1], x20\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr d24, [x16, #0x30]\n"
+ "ldr d24, [x15, #0x30]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
- "ldr x20, [x16, #0x48]\n"
".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x15, #0x58]\n"
".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
- "ldr d25, [x16, #0x40]\n"
+ "ldr d25, [x15, #0x40]\n"
".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
- "ldr x21, [x16, #0x58]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
- "ldr x25, [x12, #0x8]\n"
+ "ldr x21, [x15, #0x68]\n"
".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
- "mov v25.d[1], x20\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
- "ldr d24, [x16, #0x50]\n"
+ "ldr d24, [x15, #0x50]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
- "ldr x20, [x16, #0x68]\n"
".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
- "ldr x24, [x11, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x15, #0x78]\n"
".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ "ldr x25, [x12, #0x8]\n"
".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
- "ldr d25, [x16, #0x60]\n"
+ "ldr d25, [x15, #0x60]\n"
".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
- "ldr x21, [x16, #0x78]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
- "ldr x23, [x10, #0x8]\n"
+ "ldr x21, [x15, #0x88]\n"
".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
- "mov v25.d[1], x20\n"
+ "ldr x24, [x11, #0x8]\n"
".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
- "ldr d24, [x16, #0x70]\n"
+ "ldr d24, [x15, #0x70]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
- "ldr x20, [x16, #0x88]\n"
".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
- "ldr x22, [x9, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x15, #0x98]\n"
".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ "ldr x23, [x10, #0x8]\n"
".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
- "ldr d25, [x16, #0x80]\n"
+ "ldr d25, [x15, #0x80]\n"
".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
- "ldr x21, [x16, #0x98]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
- "sub x13, x13, #0x10\n"
+ "ldr x21, [x15, #0xa8]\n"
".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
- "mov v25.d[1], x20\n"
+ "ldr x22, [x9, #0x8]\n"
".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
- "ldr d24, [x16, #0x90]\n"
+ "ldr d24, [x15, #0x90]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
- "ldr x20, [x16, #0xa8]\n"
".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
- "cmp x13, #0x20\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x15, #0xb8]\n"
".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ "sub x13, x13, #0x10\n"
".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
- "ldr d25, [x16, #0xa0]\n"
+ "ldr d25, [x15, #0xa0]\n"
".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
- "ldr x21, [x16, #0xb8]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x21, [x15, #0xc8]\n"
".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
- "mov v25.d[1], x20\n"
+ "cmp x13, #0x20\n"
".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
- "ldr d24, [x16, #0xb0]\n"
+ "ldr d24, [x15, #0xb0]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
- "ldr x20, [x16, #0xc8]\n"
".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x15, #0xd8]\n"
".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
- "ldr d25, [x16, #0xc0]\n"
+ "ldr d25, [x15, #0xc0]\n"
".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
- "ldr x21, [x16, #0xd8]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr x21, [x15, #0xe8]\n"
".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
- "mov v25.d[1], x20\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
- "ldr d24, [x16, #0xd0]\n"
+ "ldr d24, [x15, #0xd0]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
- "ldr x20, [x16, #0xe8]\n"
".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x15, #0xf8]\n"
".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
- "ldr d25, [x16, #0xe0]\n"
+ "ldr d25, [x15, #0xe0]\n"
".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
- "ldr x21, [x16, #0xf8]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
- "mov v25.d[1], x20\n"
".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
- "ldr d24, [x16, #0xf0]\n"
- "add x16, x16, #0x100\n"
+ "ldr d24, [x15, #0xf0]\n"
+ "mov v24.d[1], x20\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0x8]\n"
".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
- "ldr x20, [x16, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x15, #0x18]\n"
".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
- "ldr d6, [x16, #0x0]\n"
+ "ldr d6, [x15, #0x0]\n"
".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
@@ -1585,9 +1586,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr d2, [x10, #0x0]\n"
".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"ldr d3, [x9, #0x0]\n"
- "ldr d7, [x16, #0x10]\n"
- "mov v6.d[1], x20\n"
- "ldr x20, [x16, #0x18]\n"
+ "ldr d7, [x15, #0x10]\n"
+ "mov v6.d[1], x21\n"
"mov v0.d[1], x25\n"
"mov v1.d[1], x24\n"
"mov v2.d[1], x23\n"
@@ -1602,7 +1602,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x10, x10, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q25, [x16, #0x20]\n"
+ "ldr q25, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x9, x9, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1610,7 +1610,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q24, [x16, #0x30]\n"
+ "ldr q24, [x15, #0x30]\n"
".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
@@ -1618,64 +1618,64 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
"prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x16, #0x40]\n"
+ "ldr q25, [x15, #0x40]\n"
".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x16, #0x50]\n"
+ "ldr q24, [x15, #0x50]\n"
".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x16, #0x60]\n"
+ "ldr q25, [x15, #0x60]\n"
".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x16, #0x70]\n"
+ "ldr q24, [x15, #0x70]\n"
".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x16, #0x80]\n"
+ "ldr q25, [x15, #0x80]\n"
".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x16, #0x90]\n"
+ "ldr q24, [x15, #0x90]\n"
".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x16, #0xa0]\n"
+ "ldr q25, [x15, #0xa0]\n"
".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x16, #0xb0]\n"
+ "ldr q24, [x15, #0xb0]\n"
".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x16, #0xc0]\n"
+ "ldr q25, [x15, #0xc0]\n"
".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x16, #0xd0]\n"
+ "ldr q24, [x15, #0xd0]\n"
".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
- "ldr q25, [x16, #0xe0]\n"
+ "ldr q25, [x15, #0xe0]\n"
".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
- "ldr q24, [x16, #0xf0]\n"
+ "ldr q24, [x15, #0xf0]\n"
".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
- "add x16, x16, #0x100\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
@@ -1694,20 +1694,20 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp x13, #0x4\n"
"ldr s27, [x10], #0x4\n"
"ldr s26, [x9], #0x4\n"
- "ldr q25, [x16, #0x0]\n"
- "ldr q24, [x16, #0x10]\n"
+ "ldr q25, [x15, #0x0]\n"
".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
+ "ldr q24, [x15, #0x10]\n"
".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
- "ldr q25, [x16, #0x20]\n"
+ "ldr q25, [x15, #0x20]\n"
".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
- "ldr q24, [x16, #0x30]\n"
+ "ldr q24, [x15, #0x30]\n"
".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n"
@@ -1735,20 +1735,20 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr b2, [x10, #0x0]\n"
"ldr b3, [x9, #0x0]\n"
"94:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q25, [x16, #0x0]\n"
- "ldr q24, [x16, #0x10]\n"
+ "ldr q25, [x15, #0x0]\n"
".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
+ "ldr q24, [x15, #0x10]\n"
".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x16, #0x20]\n"
+ "ldr q25, [x15, #0x20]\n"
".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x16, #0x30]\n"
+ "ldr q24, [x15, #0x30]\n"
".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
@@ -1762,24 +1762,24 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp x14, x20\n"
"bne 85b\n"
"ldr q27, [x6, #0x0]\n"
- "ldr q26, [x6, #0x10]\n"
- "ldr q25, [x6, #0x20]\n"
- "ldr q24, [x6, #0x30]\n"
"add v8.4s, v8.4s, v27.4s\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr q26, [x6, #0x10]\n"
"add v9.4s, v9.4s, v26.4s\n"
- "prfm pstl1keep, [x15, #0x0]\n"
+ "ldr q25, [x6, #0x20]\n"
"add v10.4s, v10.4s, v25.4s\n"
+ "ldr q24, [x6, #0x30]\n"
"add v11.4s, v11.4s, v24.4s\n"
- "add v12.4s, v12.4s, v27.4s\n"
- "add x26, x15, x20\n"
- "add v13.4s, v13.4s, v26.4s\n"
- "add x25, x26, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x17, x20\n"
"add x24, x25, x20\n"
+ "add x23, x24, x20\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "add v12.4s, v12.4s, v27.4s\n"
"prfm pstl1keep, [x25, #0x0]\n"
+ "add v13.4s, v13.4s, v26.4s\n"
"prfm pstl1keep, [x24, #0x0]\n"
"add v14.4s, v14.4s, v25.4s\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"add v15.4s, v15.4s, v24.4s\n"
"add v16.4s, v16.4s, v27.4s\n"
"add v17.4s, v17.4s, v26.4s\n"
@@ -1803,9 +1803,9 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 97f\n"
"96:" // Height 4: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -1897,44 +1897,43 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v22.4s, v22.4s, v2.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x20]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v24.4s\n"
+ "add v9.4s, v9.4s, v24.4s\n"
+ "add v10.4s, v10.4s, v24.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "add v12.4s, v12.4s, v24.4s\n"
+ "add v13.4s, v13.4s, v24.4s\n"
+ "add v14.4s, v14.4s, v24.4s\n"
+ "add v15.4s, v15.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v24.4s\n"
+ "add v18.4s, v18.4s, v24.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v24.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v24.4s\n"
+ "smin v9.4s, v9.4s, v24.4s\n"
+ "smin v10.4s, v10.4s, v24.4s\n"
+ "smin v11.4s, v11.4s, v24.4s\n"
+ "smin v12.4s, v12.4s, v24.4s\n"
+ "smin v13.4s, v13.4s, v24.4s\n"
+ "smin v14.4s, v14.4s, v24.4s\n"
+ "smin v15.4s, v15.4s, v24.4s\n"
+ "smin v16.4s, v16.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v24.4s\n"
+ "smin v18.4s, v18.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v24.4s\n"
+ "smin v22.4s, v22.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v24.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v25.4s }, [x21]\n"
- "cmp x17, #0x10\n"
"ld1r { v24.4s }, [x20]\n"
- "add v8.4s, v8.4s, v26.4s\n"
- "add v9.4s, v9.4s, v26.4s\n"
- "add v10.4s, v10.4s, v26.4s\n"
- "add v11.4s, v11.4s, v26.4s\n"
- "add v12.4s, v12.4s, v26.4s\n"
- "add v13.4s, v13.4s, v26.4s\n"
- "add v14.4s, v14.4s, v26.4s\n"
- "add v15.4s, v15.4s, v26.4s\n"
- "add v16.4s, v16.4s, v26.4s\n"
- "add v17.4s, v17.4s, v26.4s\n"
- "add v18.4s, v18.4s, v26.4s\n"
- "add v19.4s, v19.4s, v26.4s\n"
- "add v20.4s, v20.4s, v26.4s\n"
- "add v21.4s, v21.4s, v26.4s\n"
- "add v22.4s, v22.4s, v26.4s\n"
- "add v23.4s, v23.4s, v26.4s\n"
- "smin v8.4s, v8.4s, v25.4s\n"
- "smin v9.4s, v9.4s, v25.4s\n"
- "smin v10.4s, v10.4s, v25.4s\n"
- "smin v11.4s, v11.4s, v25.4s\n"
- "smin v12.4s, v12.4s, v25.4s\n"
- "smin v13.4s, v13.4s, v25.4s\n"
- "smin v14.4s, v14.4s, v25.4s\n"
- "smin v15.4s, v15.4s, v25.4s\n"
- "smin v16.4s, v16.4s, v25.4s\n"
- "smin v17.4s, v17.4s, v25.4s\n"
- "smin v18.4s, v18.4s, v25.4s\n"
- "smin v19.4s, v19.4s, v25.4s\n"
- "smin v20.4s, v20.4s, v25.4s\n"
- "smin v21.4s, v21.4s, v25.4s\n"
- "smin v22.4s, v22.4s, v25.4s\n"
- "smin v23.4s, v23.4s, v25.4s\n"
"smax v8.4s, v8.4s, v24.4s\n"
"smax v9.4s, v9.4s, v24.4s\n"
"smax v10.4s, v10.4s, v24.4s\n"
@@ -1959,118 +1958,119 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
"uzp1 v17.8h, v22.8h, v23.8h\n"
+ "cmp x16, #0x10\n"
"uzp1 v8.16b, v8.16b, v25.16b\n"
"uzp1 v12.16b, v12.16b, v24.16b\n"
"uzp1 v16.16b, v16.16b, v18.16b\n"
"uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 107f\n"
- "tbz x17, #3, 102f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "tbz x17, #2, 100f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "tbz x17, #1, 99f\n"
- "st1 { v8.h }[6], [x15], #0x2\n"
- "st1 { v12.h }[6], [x26], #0x2\n"
- "st1 { v16.h }[6], [x25], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[14], [x15]\n"
- "st1 { v12.b }[14], [x26]\n"
- "st1 { v16.b }[14], [x25]\n"
- "st1 { v20.b }[14], [x24]\n"
+ "tbz x16, #3, 102f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x16, #2, 100f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x16, #1, 99f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[14], [x17]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 106f\n"
"99:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[12], [x15]\n"
- "st1 { v12.b }[12], [x26]\n"
- "st1 { v16.b }[12], [x25]\n"
- "st1 { v20.b }[12], [x24]\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[12], [x17]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 106f\n"
"100:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x17, #1, 101f\n"
- "st1 { v8.h }[4], [x15], #0x2\n"
- "st1 { v12.h }[4], [x26], #0x2\n"
- "st1 { v16.h }[4], [x25], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[10], [x15]\n"
- "st1 { v12.b }[10], [x26]\n"
- "st1 { v16.b }[10], [x25]\n"
- "st1 { v20.b }[10], [x24]\n"
+ "tbz x16, #1, 101f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[10], [x17]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 106f\n"
"101:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[8], [x15]\n"
- "st1 { v12.b }[8], [x26]\n"
- "st1 { v16.b }[8], [x25]\n"
- "st1 { v20.b }[8], [x24]\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[8], [x17]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 106f\n"
"102:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x17, #2, 104f\n"
- "str s8, [x15], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "tbz x17, #1, 103f\n"
- "st1 { v8.h }[2], [x15], #0x2\n"
- "st1 { v12.h }[2], [x26], #0x2\n"
- "st1 { v16.h }[2], [x25], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[6], [x15]\n"
- "st1 { v12.b }[6], [x26]\n"
- "st1 { v16.b }[6], [x25]\n"
- "st1 { v20.b }[6], [x24]\n"
+ "tbz x16, #2, 104f\n"
+ "str s8, [x17], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x16, #1, 103f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[6], [x17]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 106f\n"
"103:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[4], [x15]\n"
- "st1 { v12.b }[4], [x26]\n"
- "st1 { v16.b }[4], [x25]\n"
- "st1 { v20.b }[4], [x24]\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[4], [x17]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 106f\n"
"104:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x17, #1, 105f\n"
- "str h8, [x15], #0x2\n"
- "str h12, [x26], #0x2\n"
- "str h16, [x25], #0x2\n"
- "str h20, [x24], #0x2\n"
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[2], [x15]\n"
- "st1 { v12.b }[2], [x26]\n"
- "st1 { v16.b }[2], [x25]\n"
- "st1 { v20.b }[2], [x24]\n"
+ "tbz x16, #1, 105f\n"
+ "str h8, [x17], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[2], [x17]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 106f\n"
"105:" // Height 4: Partial direct writeback: partial_1_0
- "str b8, [x15, #0x0]\n"
- "str b12, [x26, #0x0]\n"
- "str b16, [x25, #0x0]\n"
- "str b20, [x24, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"106:" // Height 4: Partial direct writeback: Done
"b 108f\n"
"107:" // Height 4: Full writeback
- "str q8, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
- "str q12, [x26, #0x0]\n"
- "str q16, [x25, #0x0]\n"
- "str q20, [x24, #0x0]\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"108:" // Height 4: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 83b\n"
"b 164f\n"
"109:" // Height 5
"mov x6, %x[col_bias]\n"
"ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"110:" // Height 5: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -2096,8 +2096,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"112:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 113f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2129,148 +2129,148 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr q2, [x10, #0x0]\n"
"ldr q3, [x9, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 116f\n"
"115:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x16, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
"add x11, x11, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr d29, [x16, #0x20]\n"
+ "ldr d29, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "mov v29.d[1], x21\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr x21, [x16, #0x48]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr d28, [x16, #0x30]\n"
+ "ldr d28, [x15, #0x30]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
- "ldr x26, [x12, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x15, #0x58]\n"
".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
- "ldr x20, [x16, #0x58]\n"
+ "ldr x26, [x12, #0x8]\n"
".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
- "ldr d29, [x16, #0x40]\n"
+ "ldr d29, [x15, #0x40]\n"
".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
- "ldr x25, [x11, #0x8]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
- "ldr x24, [x10, #0x8]\n"
+ "ldr x21, [x15, #0x68]\n"
".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
- "mov v29.d[1], x21\n"
+ "ldr x25, [x11, #0x8]\n"
".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
- "ldr x21, [x16, #0x68]\n"
+ "ldr x24, [x10, #0x8]\n"
".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
- "ldr d28, [x16, #0x50]\n"
+ "ldr d28, [x15, #0x50]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
- "ldr x23, [x9, #0x8]\n"
".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
- "ldr x22, [x28, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x15, #0x78]\n"
".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ "ldr x23, [x9, #0x8]\n"
".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
- "ldr x20, [x16, #0x78]\n"
+ "ldr x22, [x28, #0x8]\n"
".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
- "ldr d29, [x16, #0x60]\n"
+ "ldr d29, [x15, #0x60]\n"
".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
- "sub x13, x13, #0x10\n"
+ "mov v29.d[1], x21\n"
".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
- "cmp x13, #0x20\n"
+ "ldr x21, [x15, #0x88]\n"
".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
- "mov v29.d[1], x21\n"
+ "sub x13, x13, #0x10\n"
".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
- "ldr x21, [x16, #0x88]\n"
+ "cmp x13, #0x20\n"
".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
- "ldr d28, [x16, #0x70]\n"
+ "ldr d28, [x15, #0x70]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x15, #0x98]\n"
".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
- "ldr x20, [x16, #0x98]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
- "ldr d29, [x16, #0x80]\n"
+ "ldr d29, [x15, #0x80]\n"
".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x21, [x15, #0xa8]\n"
".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
- "mov v29.d[1], x21\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
- "ldr x21, [x16, #0xa8]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
- "ldr d28, [x16, #0x90]\n"
+ "ldr d28, [x15, #0x90]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x15, #0xb8]\n"
".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
- "ldr x20, [x16, #0xb8]\n"
".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
- "ldr d29, [x16, #0xa0]\n"
+ "ldr d29, [x15, #0xa0]\n"
".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xc8]\n"
".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
- "mov v29.d[1], x21\n"
".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
- "ldr x21, [x16, #0xc8]\n"
".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
- "ldr d28, [x16, #0xb0]\n"
+ "ldr d28, [x15, #0xb0]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x15, #0xd8]\n"
".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
- "ldr x20, [x16, #0xd8]\n"
".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
- "ldr d29, [x16, #0xc0]\n"
+ "ldr d29, [x15, #0xc0]\n"
".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xe8]\n"
".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
- "mov v29.d[1], x21\n"
".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
- "ldr x21, [x16, #0xe8]\n"
".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
- "ldr d28, [x16, #0xd0]\n"
+ "ldr d28, [x15, #0xd0]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x15, #0xf8]\n"
".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
- "ldr x20, [x16, #0xf8]\n"
".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
- "ldr d29, [x16, #0xe0]\n"
+ "ldr d29, [x15, #0xe0]\n"
".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
- "mov v29.d[1], x21\n"
".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
- "ldr d28, [x16, #0xf0]\n"
- "add x16, x16, #0x100\n"
+ "ldr d28, [x15, #0xf0]\n"
+ "mov v28.d[1], x20\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0x8]\n"
".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
- "ldr x21, [x16, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x15, #0x18]\n"
".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
- "ldr x20, [x16, #0x18]\n"
".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
- "ldr d6, [x16, #0x0]\n"
+ "ldr d6, [x15, #0x0]\n"
".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
@@ -2281,7 +2281,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr d3, [x9, #0x0]\n"
".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"ldr d4, [x28, #0x0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr d7, [x15, #0x10]\n"
"mov v6.d[1], x21\n"
"mov v0.d[1], x26\n"
"mov v1.d[1], x25\n"
@@ -2300,7 +2300,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
"add x9, x9, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q29, [x16, #0x20]\n"
+ "ldr q29, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x28, x28, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2310,7 +2310,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q28, [x16, #0x30]\n"
+ "ldr q28, [x15, #0x30]\n"
".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
"prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
@@ -2319,75 +2319,75 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x16, #0x40]\n"
+ "ldr q29, [x15, #0x40]\n"
".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x16, #0x50]\n"
+ "ldr q28, [x15, #0x50]\n"
".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x16, #0x60]\n"
+ "ldr q29, [x15, #0x60]\n"
".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x16, #0x70]\n"
+ "ldr q28, [x15, #0x70]\n"
".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x16, #0x80]\n"
+ "ldr q29, [x15, #0x80]\n"
".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x16, #0x90]\n"
+ "ldr q28, [x15, #0x90]\n"
".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x16, #0xa0]\n"
+ "ldr q29, [x15, #0xa0]\n"
".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x16, #0xb0]\n"
+ "ldr q28, [x15, #0xb0]\n"
".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x16, #0xc0]\n"
+ "ldr q29, [x15, #0xc0]\n"
".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x16, #0xd0]\n"
+ "ldr q28, [x15, #0xd0]\n"
".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
- "ldr q29, [x16, #0xe0]\n"
+ "ldr q29, [x15, #0xe0]\n"
".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
- "ldr q28, [x16, #0xf0]\n"
+ "ldr q28, [x15, #0xf0]\n"
".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
- "add x16, x16, #0x100\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
@@ -2409,22 +2409,22 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr s0, [x10], #0x4\n"
"ldr s31, [x9], #0x4\n"
"ldr s30, [x28], #0x4\n"
- "ldr q29, [x16, #0x0]\n"
- "ldr q28, [x16, #0x10]\n"
+ "ldr q29, [x15, #0x0]\n"
".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
+ "ldr q28, [x15, #0x10]\n"
".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
- "ldr q29, [x16, #0x20]\n"
+ "ldr q29, [x15, #0x20]\n"
".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
- "ldr q28, [x16, #0x30]\n"
+ "ldr q28, [x15, #0x30]\n"
".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n"
@@ -2457,22 +2457,22 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr b3, [x9, #0x0]\n"
"ldr b4, [x28, #0x0]\n"
"121:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q29, [x16, #0x0]\n"
- "ldr q28, [x16, #0x10]\n"
+ "ldr q29, [x15, #0x0]\n"
".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
+ "ldr q28, [x15, #0x10]\n"
".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x16, #0x20]\n"
+ "ldr q29, [x15, #0x20]\n"
".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x16, #0x30]\n"
+ "ldr q28, [x15, #0x30]\n"
".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
@@ -2488,26 +2488,26 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp x14, x20\n"
"bne 112b\n"
"ldr q31, [x6, #0x0]\n"
- "ldr q30, [x6, #0x10]\n"
- "ldr q29, [x6, #0x20]\n"
- "ldr q28, [x6, #0x30]\n"
"add v8.4s, v8.4s, v31.4s\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr q30, [x6, #0x10]\n"
"add v9.4s, v9.4s, v30.4s\n"
- "prfm pstl1keep, [x15, #0x0]\n"
+ "ldr q29, [x6, #0x20]\n"
"add v10.4s, v10.4s, v29.4s\n"
+ "ldr q28, [x6, #0x30]\n"
"add v11.4s, v11.4s, v28.4s\n"
- "add v12.4s, v12.4s, v31.4s\n"
- "add x26, x15, x20\n"
- "add v13.4s, v13.4s, v30.4s\n"
- "add x25, x26, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x17, x20\n"
"add x24, x25, x20\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"add x23, x24, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v12.4s, v12.4s, v31.4s\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "add v13.4s, v13.4s, v30.4s\n"
"prfm pstl1keep, [x23, #0x0]\n"
"add v14.4s, v14.4s, v29.4s\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"add v15.4s, v15.4s, v28.4s\n"
"add v16.4s, v16.4s, v31.4s\n"
"add v17.4s, v17.4s, v30.4s\n"
@@ -2535,9 +2535,9 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 124f\n"
"123:" // Height 5: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -2649,52 +2649,51 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v26.4s, v26.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v30.4s }, [x20]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v28.4s\n"
+ "add v9.4s, v9.4s, v28.4s\n"
+ "add v10.4s, v10.4s, v28.4s\n"
+ "add v11.4s, v11.4s, v28.4s\n"
+ "add v12.4s, v12.4s, v28.4s\n"
+ "add v13.4s, v13.4s, v28.4s\n"
+ "add v14.4s, v14.4s, v28.4s\n"
+ "add v15.4s, v15.4s, v28.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v28.4s\n"
+ "add v18.4s, v18.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v28.4s\n"
+ "smin v9.4s, v9.4s, v28.4s\n"
+ "smin v10.4s, v10.4s, v28.4s\n"
+ "smin v11.4s, v11.4s, v28.4s\n"
+ "smin v12.4s, v12.4s, v28.4s\n"
+ "smin v13.4s, v13.4s, v28.4s\n"
+ "smin v14.4s, v14.4s, v28.4s\n"
+ "smin v15.4s, v15.4s, v28.4s\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v23.4s, v23.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v29.4s }, [x21]\n"
- "cmp x17, #0x10\n"
"ld1r { v28.4s }, [x20]\n"
- "add v8.4s, v8.4s, v30.4s\n"
- "add v9.4s, v9.4s, v30.4s\n"
- "add v10.4s, v10.4s, v30.4s\n"
- "add v11.4s, v11.4s, v30.4s\n"
- "add v12.4s, v12.4s, v30.4s\n"
- "add v13.4s, v13.4s, v30.4s\n"
- "add v14.4s, v14.4s, v30.4s\n"
- "add v15.4s, v15.4s, v30.4s\n"
- "add v16.4s, v16.4s, v30.4s\n"
- "add v17.4s, v17.4s, v30.4s\n"
- "add v18.4s, v18.4s, v30.4s\n"
- "add v19.4s, v19.4s, v30.4s\n"
- "add v20.4s, v20.4s, v30.4s\n"
- "add v21.4s, v21.4s, v30.4s\n"
- "add v22.4s, v22.4s, v30.4s\n"
- "add v23.4s, v23.4s, v30.4s\n"
- "add v24.4s, v24.4s, v30.4s\n"
- "add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "add v27.4s, v27.4s, v30.4s\n"
- "smin v8.4s, v8.4s, v29.4s\n"
- "smin v9.4s, v9.4s, v29.4s\n"
- "smin v10.4s, v10.4s, v29.4s\n"
- "smin v11.4s, v11.4s, v29.4s\n"
- "smin v12.4s, v12.4s, v29.4s\n"
- "smin v13.4s, v13.4s, v29.4s\n"
- "smin v14.4s, v14.4s, v29.4s\n"
- "smin v15.4s, v15.4s, v29.4s\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v19.4s, v19.4s, v29.4s\n"
- "smin v20.4s, v20.4s, v29.4s\n"
- "smin v21.4s, v21.4s, v29.4s\n"
- "smin v22.4s, v22.4s, v29.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v24.4s, v24.4s, v29.4s\n"
- "smin v25.4s, v25.4s, v29.4s\n"
- "smin v26.4s, v26.4s, v29.4s\n"
- "smin v27.4s, v27.4s, v29.4s\n"
"smax v8.4s, v8.4s, v28.4s\n"
"smax v9.4s, v9.4s, v28.4s\n"
"smax v10.4s, v10.4s, v28.4s\n"
@@ -2725,139 +2724,139 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
"uzp1 v17.8h, v26.8h, v27.8h\n"
+ "cmp x16, #0x10\n"
"uzp1 v8.16b, v8.16b, v29.16b\n"
"uzp1 v12.16b, v12.16b, v28.16b\n"
"uzp1 v16.16b, v16.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v18.16b\n"
"uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 134f\n"
- "tbz x17, #3, 129f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x17, #2, 127f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x17, #1, 126f\n"
- "st1 { v8.h }[6], [x15], #0x2\n"
- "st1 { v12.h }[6], [x26], #0x2\n"
- "st1 { v16.h }[6], [x25], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[14], [x15]\n"
- "st1 { v12.b }[14], [x26]\n"
- "st1 { v16.b }[14], [x25]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
+ "tbz x16, #3, 129f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x16, #2, 127f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x16, #1, 126f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[14], [x17]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 133f\n"
"126:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[12], [x15]\n"
- "st1 { v12.b }[12], [x26]\n"
- "st1 { v16.b }[12], [x25]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[12], [x17]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 133f\n"
"127:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x17, #1, 128f\n"
- "st1 { v8.h }[4], [x15], #0x2\n"
- "st1 { v12.h }[4], [x26], #0x2\n"
- "st1 { v16.h }[4], [x25], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[10], [x15]\n"
- "st1 { v12.b }[10], [x26]\n"
- "st1 { v16.b }[10], [x25]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
+ "tbz x16, #1, 128f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[10], [x17]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 133f\n"
"128:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[8], [x15]\n"
- "st1 { v12.b }[8], [x26]\n"
- "st1 { v16.b }[8], [x25]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[8], [x17]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 133f\n"
"129:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x17, #2, 131f\n"
- "str s8, [x15], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x17, #1, 130f\n"
- "st1 { v8.h }[2], [x15], #0x2\n"
- "st1 { v12.h }[2], [x26], #0x2\n"
- "st1 { v16.h }[2], [x25], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[6], [x15]\n"
- "st1 { v12.b }[6], [x26]\n"
- "st1 { v16.b }[6], [x25]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
+ "tbz x16, #2, 131f\n"
+ "str s8, [x17], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x16, #1, 130f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[6], [x17]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 133f\n"
"130:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[4], [x15]\n"
- "st1 { v12.b }[4], [x26]\n"
- "st1 { v16.b }[4], [x25]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[4], [x17]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 133f\n"
"131:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x17, #1, 132f\n"
- "str h8, [x15], #0x2\n"
- "str h12, [x26], #0x2\n"
- "str h16, [x25], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[2], [x15]\n"
- "st1 { v12.b }[2], [x26]\n"
- "st1 { v16.b }[2], [x25]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
+ "tbz x16, #1, 132f\n"
+ "str h8, [x17], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[2], [x17]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 133f\n"
"132:" // Height 5: Partial direct writeback: partial_1_0
- "str b8, [x15, #0x0]\n"
- "str b12, [x26, #0x0]\n"
- "str b16, [x25, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"133:" // Height 5: Partial direct writeback: Done
"b 135f\n"
"134:" // Height 5: Full writeback
- "str q8, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
- "str q12, [x26, #0x0]\n"
- "str q16, [x25, #0x0]\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"135:" // Height 5: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 110b\n"
"b 164f\n"
"136:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x6\n"
- "ldr x15, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x6, %x[col_bias]\n"
"ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "madd x20, x21, x20, x15\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"137:" // Height 6: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -2887,8 +2886,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"139:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 140f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2924,14 +2923,14 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr q3, [x9, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
"ldr q5, [x27, #0x0]\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 143f\n"
"142:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x16, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
@@ -2939,150 +2938,149 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
"add x10, x10, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr d6, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x21\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr x21, [x16, #0x48]\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
"add x27, x27, #0x10\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
+ "ldr d7, [x15, #0x30]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x26, [x12, #0x8]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x25, [x11, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x15, #0x58]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr x26, [x12, #0x8]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x20, [x16, #0x58]\n"
+ "ldr x25, [x11, #0x8]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
"ldr x24, [x10, #0x8]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
+ "ldr d6, [x15, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x23, [x9, #0x8]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x22, [x28, #0x8]\n"
+ "ldr x21, [x15, #0x68]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x21\n"
+ "ldr x23, [x9, #0x8]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x21, [x16, #0x68]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
"sub x13, x13, #0x10\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "cmp x13, #0x20\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
+ "ldr d7, [x15, #0x50]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "cmp x13, #0x20\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x15, #0x78]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x20, [x16, #0x78]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
"prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
+ "ldr d6, [x15, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x21, [x15, #0x88]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x21\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr x21, [x16, #0x88]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
"prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
+ "ldr d7, [x15, #0x70]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x15, #0x98]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr x20, [x16, #0x98]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
+ "ldr d6, [x15, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x21\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr x21, [x16, #0xa8]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
+ "ldr d7, [x15, #0x90]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x15, #0xb8]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr x20, [x16, #0xb8]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
+ "ldr d6, [x15, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xc8]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x21\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr x21, [x16, #0xc8]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
+ "ldr d7, [x15, #0xb0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x15, #0xd8]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr x20, [x16, #0xd8]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
+ "ldr d6, [x15, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xe8]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x21\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr x21, [x16, #0xe8]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
+ "ldr d7, [x15, #0xd0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x15, #0xf8]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr x20, [x16, #0xf8]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
+ "ldr d6, [x15, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr x22, [x28, #0x8]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x21\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr x21, [x27, #0x8]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "add x16, x16, #0x100\n"
+ "ldr d7, [x15, #0xf0]\n"
+ "mov v7.d[1], x20\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr x20, [x15, #0x8]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "mov v7.d[1], x20\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- "ldr x20, [x16, #0x8]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
- "ldr d6, [x16, #0x0]\n"
+ "ldr d6, [x15, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
@@ -3095,10 +3093,11 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr d4, [x28, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
"ldr d5, [x27, #0x0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr d7, [x15, #0x10]\n"
"mov v6.d[1], x20\n"
- "ldr x20, [x16, #0x18]\n"
+ "ldr x21, [x27, #0x8]\n"
"mov v0.d[1], x26\n"
+ "ldr x20, [x15, #0x18]\n"
"mov v1.d[1], x25\n"
"mov v2.d[1], x24\n"
"mov v3.d[1], x23\n"
@@ -3118,7 +3117,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
"add x28, x28, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q6, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x27, x27, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -3130,7 +3129,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
"prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
+ "ldr q7, [x15, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
"prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
@@ -3140,86 +3139,86 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
+ "ldr q6, [x15, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
+ "ldr q7, [x15, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
+ "ldr q6, [x15, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
+ "ldr q7, [x15, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
+ "ldr q6, [x15, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
+ "ldr q7, [x15, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
+ "ldr q6, [x15, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
+ "ldr q7, [x15, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
+ "ldr q6, [x15, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
+ "ldr q7, [x15, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
+ "ldr q6, [x15, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
+ "ldr q7, [x15, #0xf0]\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "add x16, x16, #0x100\n"
+ "add x15, x15, #0x100\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
@@ -3244,24 +3243,24 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr s4, [x9], #0x4\n"
"ldr s3, [x28], #0x4\n"
"ldr s2, [x27], #0x4\n"
- "ldr q1, [x16, #0x0]\n"
- "ldr q0, [x16, #0x10]\n"
+ "ldr q1, [x15, #0x0]\n"
".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
+ "ldr q0, [x15, #0x10]\n"
".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
- "ldr q1, [x16, #0x20]\n"
+ "ldr q1, [x15, #0x20]\n"
".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
- "ldr q0, [x16, #0x30]\n"
+ "ldr q0, [x15, #0x30]\n"
".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n"
@@ -3299,24 +3298,24 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ldr b4, [x28, #0x0]\n"
"ldr b5, [x27, #0x0]\n"
"148:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q7, [x16, #0x0]\n"
- "ldr q6, [x16, #0x10]\n"
+ "ldr q7, [x15, #0x0]\n"
".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x15, #0x10]\n"
".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x20]\n"
+ "ldr q7, [x15, #0x20]\n"
".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x30]\n"
+ "ldr q6, [x15, #0x30]\n"
".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n"
@@ -3334,30 +3333,30 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp x14, x20\n"
"bne 139b\n"
"ldr q3, [x6, #0x0]\n"
- "ldr q2, [x6, #0x10]\n"
- "ldr q1, [x6, #0x20]\n"
- "ldr q0, [x6, #0x30]\n"
"add v8.4s, v8.4s, v3.4s\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr q2, [x6, #0x10]\n"
"add v9.4s, v9.4s, v2.4s\n"
- "prfm pstl1keep, [x15, #0x0]\n"
+ "ldr q1, [x6, #0x20]\n"
"add v10.4s, v10.4s, v1.4s\n"
+ "ldr q0, [x6, #0x30]\n"
"add v11.4s, v11.4s, v0.4s\n"
- "add v12.4s, v12.4s, v3.4s\n"
- "add x26, x15, x20\n"
- "add v13.4s, v13.4s, v2.4s\n"
- "add x25, x26, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x17, x20\n"
"add x24, x25, x20\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"add x23, x24, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x22, x23, x20\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "add v12.4s, v12.4s, v3.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v13.4s, v13.4s, v2.4s\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"add v14.4s, v14.4s, v1.4s\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"add v15.4s, v15.4s, v0.4s\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"add v16.4s, v16.4s, v3.4s\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"add v17.4s, v17.4s, v2.4s\n"
"add v18.4s, v18.4s, v1.4s\n"
"add v19.4s, v19.4s, v0.4s\n"
@@ -3387,9 +3386,9 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 151f\n"
"150:" // Height 6: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -3521,60 +3520,59 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v30.4s, v30.4s, v2.4s\n"
"srshl v31.4s, v31.4s, v3.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v2.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v0.4s\n"
+ "add v10.4s, v10.4s, v0.4s\n"
+ "add v11.4s, v11.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "add v13.4s, v13.4s, v0.4s\n"
+ "add v14.4s, v14.4s, v0.4s\n"
+ "add v15.4s, v15.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v0.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v0.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v0.4s\n"
+ "smin v9.4s, v9.4s, v0.4s\n"
+ "smin v10.4s, v10.4s, v0.4s\n"
+ "smin v11.4s, v11.4s, v0.4s\n"
+ "smin v12.4s, v12.4s, v0.4s\n"
+ "smin v13.4s, v13.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v0.4s\n"
+ "smin v15.4s, v15.4s, v0.4s\n"
+ "smin v16.4s, v16.4s, v0.4s\n"
+ "smin v17.4s, v17.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v0.4s\n"
+ "smin v19.4s, v19.4s, v0.4s\n"
+ "smin v20.4s, v20.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v0.4s\n"
+ "smin v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v0.4s\n"
+ "smin v27.4s, v27.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v0.4s\n"
+ "smin v29.4s, v29.4s, v0.4s\n"
+ "smin v30.4s, v30.4s, v0.4s\n"
+ "smin v31.4s, v31.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v1.4s }, [x21]\n"
- "cmp x17, #0x10\n"
"ld1r { v0.4s }, [x20]\n"
- "add v8.4s, v8.4s, v2.4s\n"
- "add v9.4s, v9.4s, v2.4s\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v2.4s\n"
- "add v12.4s, v12.4s, v2.4s\n"
- "add v13.4s, v13.4s, v2.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v15.4s, v15.4s, v2.4s\n"
- "add v16.4s, v16.4s, v2.4s\n"
- "add v17.4s, v17.4s, v2.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v2.4s\n"
- "add v20.4s, v20.4s, v2.4s\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v2.4s\n"
- "add v24.4s, v24.4s, v2.4s\n"
- "add v25.4s, v25.4s, v2.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v2.4s\n"
- "add v28.4s, v28.4s, v2.4s\n"
- "add v29.4s, v29.4s, v2.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v2.4s\n"
- "smin v8.4s, v8.4s, v1.4s\n"
- "smin v9.4s, v9.4s, v1.4s\n"
- "smin v10.4s, v10.4s, v1.4s\n"
- "smin v11.4s, v11.4s, v1.4s\n"
- "smin v12.4s, v12.4s, v1.4s\n"
- "smin v13.4s, v13.4s, v1.4s\n"
- "smin v14.4s, v14.4s, v1.4s\n"
- "smin v15.4s, v15.4s, v1.4s\n"
- "smin v16.4s, v16.4s, v1.4s\n"
- "smin v17.4s, v17.4s, v1.4s\n"
- "smin v18.4s, v18.4s, v1.4s\n"
- "smin v19.4s, v19.4s, v1.4s\n"
- "smin v20.4s, v20.4s, v1.4s\n"
- "smin v21.4s, v21.4s, v1.4s\n"
- "smin v22.4s, v22.4s, v1.4s\n"
- "smin v23.4s, v23.4s, v1.4s\n"
- "smin v24.4s, v24.4s, v1.4s\n"
- "smin v25.4s, v25.4s, v1.4s\n"
- "smin v26.4s, v26.4s, v1.4s\n"
- "smin v27.4s, v27.4s, v1.4s\n"
- "smin v28.4s, v28.4s, v1.4s\n"
- "smin v29.4s, v29.4s, v1.4s\n"
- "smin v30.4s, v30.4s, v1.4s\n"
- "smin v31.4s, v31.4s, v1.4s\n"
"smax v8.4s, v8.4s, v0.4s\n"
"smax v9.4s, v9.4s, v0.4s\n"
"smax v10.4s, v10.4s, v0.4s\n"
@@ -3611,6 +3609,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
"uzp1 v17.8h, v30.8h, v31.8h\n"
+ "cmp x16, #0x10\n"
"uzp1 v8.16b, v8.16b, v2.16b\n"
"uzp1 v12.16b, v12.16b, v1.16b\n"
"uzp1 v16.16b, v16.16b, v0.16b\n"
@@ -3618,136 +3617,136 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"uzp1 v24.16b, v24.16b, v18.16b\n"
"uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 161f\n"
- "tbz x17, #3, 156f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
- "tbz x17, #2, 154f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x22], #0x4\n"
- "tbz x17, #1, 153f\n"
- "st1 { v8.h }[6], [x15], #0x2\n"
- "st1 { v12.h }[6], [x26], #0x2\n"
- "st1 { v16.h }[6], [x25], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "st1 { v28.h }[6], [x22], #0x2\n"
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[14], [x15]\n"
- "st1 { v12.b }[14], [x26]\n"
- "st1 { v16.b }[14], [x25]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
- "st1 { v28.b }[14], [x22]\n"
+ "tbz x16, #3, 156f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x16, #2, 154f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x16, #1, 153f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[14], [x17]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 160f\n"
"153:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[12], [x15]\n"
- "st1 { v12.b }[12], [x26]\n"
- "st1 { v16.b }[12], [x25]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
- "st1 { v28.b }[12], [x22]\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[12], [x17]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 160f\n"
"154:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x17, #1, 155f\n"
- "st1 { v8.h }[4], [x15], #0x2\n"
- "st1 { v12.h }[4], [x26], #0x2\n"
- "st1 { v16.h }[4], [x25], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "st1 { v28.h }[4], [x22], #0x2\n"
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[10], [x15]\n"
- "st1 { v12.b }[10], [x26]\n"
- "st1 { v16.b }[10], [x25]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
- "st1 { v28.b }[10], [x22]\n"
+ "tbz x16, #1, 155f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[10], [x17]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 160f\n"
"155:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[8], [x15]\n"
- "st1 { v12.b }[8], [x26]\n"
- "st1 { v16.b }[8], [x25]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
- "st1 { v28.b }[8], [x22]\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[8], [x17]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 160f\n"
"156:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x17, #2, 158f\n"
- "str s8, [x15], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x22], #0x4\n"
- "tbz x17, #1, 157f\n"
- "st1 { v8.h }[2], [x15], #0x2\n"
- "st1 { v12.h }[2], [x26], #0x2\n"
- "st1 { v16.h }[2], [x25], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "st1 { v28.h }[2], [x22], #0x2\n"
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[6], [x15]\n"
- "st1 { v12.b }[6], [x26]\n"
- "st1 { v16.b }[6], [x25]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
- "st1 { v28.b }[6], [x22]\n"
+ "tbz x16, #2, 158f\n"
+ "str s8, [x17], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x16, #1, 157f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[6], [x17]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 160f\n"
"157:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[4], [x15]\n"
- "st1 { v12.b }[4], [x26]\n"
- "st1 { v16.b }[4], [x25]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
- "st1 { v28.b }[4], [x22]\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[4], [x17]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 160f\n"
"158:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x17, #1, 159f\n"
- "str h8, [x15], #0x2\n"
- "str h12, [x26], #0x2\n"
- "str h16, [x25], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "str h28, [x22], #0x2\n"
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[2], [x15]\n"
- "st1 { v12.b }[2], [x26]\n"
- "st1 { v16.b }[2], [x25]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x22]\n"
+ "tbz x16, #1, 159f\n"
+ "str h8, [x17], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[2], [x17]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 160f\n"
"159:" // Height 6: Partial direct writeback: partial_1_0
- "str b8, [x15, #0x0]\n"
- "str b12, [x26, #0x0]\n"
- "str b16, [x25, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "str b28, [x22, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"160:" // Height 6: Partial direct writeback: Done
"b 162f\n"
"161:" // Height 6: Full writeback
- "str q8, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
- "str q12, [x26, #0x0]\n"
- "str q16, [x25, #0x0]\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "str q28, [x22, #0x0]\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"162:" // Height 6: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 137b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 164f\n"
@@ -3761,8 +3760,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"164:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
index dbff7baee7..6cdca85bd2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
@@ -47,18 +47,18 @@ void a64_hybrid_s8qs_dot_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -97,9 +97,9 @@ void a64_hybrid_s8qs_dot_6x16 (
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -109,8 +109,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"mov x28, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -125,102 +125,102 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x27, #0x10\n"
"blt 9f\n"
"ldr q0, [x26, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
"cmp x27, #0x20\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x40]\n"
+ "ldr q17, [x9, #0x40]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q16, [x9, #0x50]\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x10, #0x60]\n"
+ "ldr q17, [x9, #0x60]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x10, #0x70]\n"
+ "ldr q16, [x9, #0x70]\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x10, #0x80]\n"
+ "ldr q17, [x9, #0x80]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x10, #0x90]\n"
+ "ldr q16, [x9, #0x90]\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x10, #0xa0]\n"
+ "ldr q17, [x9, #0xa0]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x10, #0xb0]\n"
+ "ldr q16, [x9, #0xb0]\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x10, #0xc0]\n"
+ "ldr q17, [x9, #0xc0]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x10, #0xd0]\n"
+ "ldr q16, [x9, #0xd0]\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
- "ldr q17, [x10, #0xe0]\n"
+ "ldr q17, [x9, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "cmp x27, #0x20\n"
+ "add x9, x9, #0x100\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x40]\n"
+ "ldr q17, [x9, #0x40]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q16, [x9, #0x50]\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x10, #0x60]\n"
+ "ldr q17, [x9, #0x60]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x10, #0x70]\n"
+ "ldr q16, [x9, #0x70]\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x10, #0x80]\n"
+ "ldr q17, [x9, #0x80]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x10, #0x90]\n"
+ "ldr q16, [x9, #0x90]\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x10, #0xa0]\n"
+ "ldr q17, [x9, #0xa0]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x10, #0xb0]\n"
+ "ldr q16, [x9, #0xb0]\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x10, #0xc0]\n"
+ "ldr q17, [x9, #0xc0]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x10, #0xd0]\n"
+ "ldr q16, [x9, #0xd0]\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
- "ldr q17, [x10, #0xe0]\n"
+ "ldr q17, [x9, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x9, x9, #0x100\n"
"9:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 14f\n"
"cmp x27, #0x4\n"
"blt 11f\n"
"10:" // Height 1: Multiply loop: Odd block loop
"ldr s18, [x26], #0x4\n"
- "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
"sub x27, x27, #0x4\n"
- "ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x4\n"
- ".inst 0x4f92e228 // sdot v8.4s, v17.16b, v18.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "cmp x27, #0x4\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
+ "add x9, x9, #0x40\n"
"bge 10b\n"
"11:" // Height 1: Multiply loop: Skip odd blocks
"cbz x27, 14f\n"
@@ -232,30 +232,30 @@ void a64_hybrid_s8qs_dot_6x16 (
"12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x26, #0x0]\n"
"13:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q17, [x10, #0x0]\n"
- "ldr q16, [x10, #0x10]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q17, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "add x9, x9, #0x40\n"
"14:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 4b\n"
- "ldr q19, [x14, #0x0]\n"
- "ldr q18, [x14, #0x10]\n"
+ "ldr q17, [x14, #0x0]\n"
+ "ldr q16, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v17.4s\n"
+ "add v9.4s, v9.4s, v16.4s\n"
"ldr q17, [x14, #0x20]\n"
"ldr q16, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add v8.4s, v8.4s, v19.4s\n"
- "add v9.4s, v9.4s, v18.4s\n"
"add v10.4s, v10.4s, v17.4s\n"
"add v11.4s, v11.4s, v16.4s\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x14, x14, #0x40\n"
"tbz %x[flags], #4, 15f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -269,9 +269,9 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 16f\n"
"15:" // Height 1: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -298,21 +298,21 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v10.4s, v10.4s, v17.4s\n"
"sqadd v11.4s, v11.4s, v16.4s\n"
"17:" // Height 1: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v18.4s }, [x21]\n"
- "ld1r { v17.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
- "ld1r { v16.4s }, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add v8.4s, v8.4s, v18.4s\n"
"add v9.4s, v9.4s, v18.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
"add v10.4s, v10.4s, v18.4s\n"
"add v11.4s, v11.4s, v18.4s\n"
+ "cmp x10, #0x10\n"
"smin v8.4s, v8.4s, v17.4s\n"
"smin v9.4s, v9.4s, v17.4s\n"
"smin v10.4s, v10.4s, v17.4s\n"
@@ -325,65 +325,65 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v16.8h, v10.8h, v11.8h\n"
"uzp1 v8.16b, v8.16b, v16.16b\n"
"bge 26f\n"
- "tbz x11, #3, 21f\n"
- "str d8, [x9], #0x8\n"
- "tbz x11, #2, 19f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "tbz x11, #1, 18f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "tbz x11, #0, 25f\n"
- "st1 { v8.b }[14], [x9]\n"
+ "tbz x10, #3, 21f\n"
+ "str d8, [x11], #0x8\n"
+ "tbz x10, #2, 19f\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "tbz x10, #1, 18f\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[14], [x11]\n"
"b 25f\n"
"18:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x11, #0, 25f\n"
- "st1 { v8.b }[12], [x9]\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[12], [x11]\n"
"b 25f\n"
"19:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x11, #1, 20f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "tbz x11, #0, 25f\n"
- "st1 { v8.b }[10], [x9]\n"
+ "tbz x10, #1, 20f\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[10], [x11]\n"
"b 25f\n"
"20:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x11, #0, 25f\n"
- "st1 { v8.b }[8], [x9]\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[8], [x11]\n"
"b 25f\n"
"21:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x11, #2, 23f\n"
- "str s8, [x9], #0x4\n"
- "tbz x11, #1, 22f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "tbz x11, #0, 25f\n"
- "st1 { v8.b }[6], [x9]\n"
+ "tbz x10, #2, 23f\n"
+ "str s8, [x11], #0x4\n"
+ "tbz x10, #1, 22f\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[6], [x11]\n"
"b 25f\n"
"22:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x11, #0, 25f\n"
- "st1 { v8.b }[4], [x9]\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[4], [x11]\n"
"b 25f\n"
"23:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x11, #1, 24f\n"
- "str h8, [x9], #0x2\n"
- "tbz x11, #0, 25f\n"
- "st1 { v8.b }[2], [x9]\n"
+ "tbz x10, #1, 24f\n"
+ "str h8, [x11], #0x2\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[2], [x11]\n"
"b 25f\n"
"24:" // Height 1: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
"25:" // Height 1: Partial direct writeback: Done
"b 27f\n"
"26:" // Height 1: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
"27:" // Height 1: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 2b\n"
"b 164f\n"
"28:" // Height 2
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"29:" // Height 2: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -397,8 +397,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"mov x28, #0x0\n"
"31:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 32f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -418,117 +418,117 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q0, [x26, #0x0]\n"
"ldr q1, [x25, #0x0]\n"
"cmp x27, #0x20\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 35f\n"
"34:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
"sub x27, x27, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
+ "ldr q16, [x9, #0x30]\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x10, #0x40]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "ldr q17, [x9, #0x40]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "cmp x27, #0x20\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x10, #0x60]\n"
+ "ldr q17, [x9, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x10, #0x70]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x10, #0x80]\n"
+ "ldr q17, [x9, #0x80]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x10, #0x90]\n"
+ "ldr q16, [x9, #0x90]\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x10, #0xa0]\n"
+ "ldr q17, [x9, #0xa0]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x10, #0xb0]\n"
+ "ldr q16, [x9, #0xb0]\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x10, #0xc0]\n"
+ "ldr q17, [x9, #0xc0]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x10, #0xd0]\n"
+ "ldr q16, [x9, #0xd0]\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
- "ldr q17, [x10, #0xe0]\n"
+ "ldr q17, [x9, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
"bge 34b\n"
"35:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
"add x26, x26, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
+ "ldr q16, [x9, #0x30]\n"
"add x25, x25, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x10, #0x40]\n"
+ "ldr q17, [x9, #0x40]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x10, #0x60]\n"
+ "ldr q17, [x9, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x10, #0x70]\n"
+ "ldr q16, [x9, #0x70]\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x10, #0x80]\n"
+ "ldr q17, [x9, #0x80]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x10, #0x90]\n"
+ "ldr q16, [x9, #0x90]\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x10, #0xa0]\n"
+ "ldr q17, [x9, #0xa0]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x10, #0xb0]\n"
+ "ldr q16, [x9, #0xb0]\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x10, #0xc0]\n"
+ "ldr q17, [x9, #0xc0]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x10, #0xd0]\n"
+ "ldr q16, [x9, #0xd0]\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
- "ldr q17, [x10, #0xe0]\n"
+ "ldr q17, [x9, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
@@ -541,18 +541,18 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr s19, [x26], #0x4\n"
"ldr s18, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
- "ldr q17, [x10, #0x0]\n"
- "ldr q16, [x10, #0x10]\n"
"cmp x27, #0x4\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
"bge 37b\n"
@@ -569,17 +569,17 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr b0, [x26, #0x0]\n"
"ldr b1, [x25, #0x0]\n"
"40:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q17, [x10, #0x0]\n"
- "ldr q16, [x10, #0x10]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"41:" // Height 2: Multiply loop: No odd multiplies
@@ -589,20 +589,20 @@ void a64_hybrid_s8qs_dot_6x16 (
"bne 31b\n"
"ldr q19, [x14, #0x0]\n"
"ldr q18, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
"ldr q17, [x14, #0x20]\n"
"ldr q16, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
+ "add v10.4s, v10.4s, v17.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add v8.4s, v8.4s, v19.4s\n"
- "add v9.4s, v9.4s, v18.4s\n"
+ "add x25, x11, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"add v12.4s, v12.4s, v19.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"add v13.4s, v13.4s, v18.4s\n"
- "add v10.4s, v10.4s, v17.4s\n"
- "add v11.4s, v11.4s, v16.4s\n"
- "add x26, x9, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
"add v14.4s, v14.4s, v17.4s\n"
+ "add x14, x14, #0x40\n"
"add v15.4s, v15.4s, v16.4s\n"
"tbz %x[flags], #4, 42f\n"
"ldr q0, [x12, #0x0]\n"
@@ -617,9 +617,9 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 43f\n"
"42:" // Height 2: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -646,11 +646,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v19.4s\n"
- "and v19.16b, v12.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v18.4s\n"
- "and v18.16b, v13.16b, v1.16b\n"
"sqadd v10.4s, v10.4s, v17.4s\n"
"sqadd v11.4s, v11.4s, v16.4s\n"
+ "and v19.16b, v12.16b, v0.16b\n"
+ "and v18.16b, v13.16b, v1.16b\n"
"and v17.16b, v14.16b, v2.16b\n"
"and v16.16b, v15.16b, v3.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
@@ -662,21 +662,21 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v14.4s, v14.4s, v17.4s\n"
"sqadd v15.4s, v15.4s, v16.4s\n"
"44:" // Height 2: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v18.4s }, [x21]\n"
- "ld1r { v17.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
"ld1r { v16.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
"add v8.4s, v8.4s, v18.4s\n"
"add v9.4s, v9.4s, v18.4s\n"
"add v10.4s, v10.4s, v18.4s\n"
@@ -708,81 +708,81 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v8.16b, v8.16b, v17.16b\n"
"uzp1 v12.16b, v12.16b, v16.16b\n"
"bge 53f\n"
- "tbz x11, #3, 48f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "tbz x11, #2, 46f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "tbz x11, #1, 45f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "st1 { v12.h }[6], [x26], #0x2\n"
- "tbz x11, #0, 52f\n"
- "st1 { v8.b }[14], [x9]\n"
- "st1 { v12.b }[14], [x26]\n"
+ "tbz x10, #3, 48f\n"
+ "str d8, [x11], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "tbz x10, #2, 46f\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "tbz x10, #1, 45f\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[14], [x11]\n"
+ "st1 { v12.b }[14], [x25]\n"
"b 52f\n"
"45:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x11, #0, 52f\n"
- "st1 { v8.b }[12], [x9]\n"
- "st1 { v12.b }[12], [x26]\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[12], [x11]\n"
+ "st1 { v12.b }[12], [x25]\n"
"b 52f\n"
"46:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x11, #1, 47f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "st1 { v12.h }[4], [x26], #0x2\n"
- "tbz x11, #0, 52f\n"
- "st1 { v8.b }[10], [x9]\n"
- "st1 { v12.b }[10], [x26]\n"
+ "tbz x10, #1, 47f\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[10], [x11]\n"
+ "st1 { v12.b }[10], [x25]\n"
"b 52f\n"
"47:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x11, #0, 52f\n"
- "st1 { v8.b }[8], [x9]\n"
- "st1 { v12.b }[8], [x26]\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[8], [x11]\n"
+ "st1 { v12.b }[8], [x25]\n"
"b 52f\n"
"48:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x11, #2, 50f\n"
- "str s8, [x9], #0x4\n"
- "str s12, [x26], #0x4\n"
- "tbz x11, #1, 49f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x26], #0x2\n"
- "tbz x11, #0, 52f\n"
- "st1 { v8.b }[6], [x9]\n"
- "st1 { v12.b }[6], [x26]\n"
+ "tbz x10, #2, 50f\n"
+ "str s8, [x11], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "tbz x10, #1, 49f\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[6], [x11]\n"
+ "st1 { v12.b }[6], [x25]\n"
"b 52f\n"
"49:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x11, #0, 52f\n"
- "st1 { v8.b }[4], [x9]\n"
- "st1 { v12.b }[4], [x26]\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[4], [x11]\n"
+ "st1 { v12.b }[4], [x25]\n"
"b 52f\n"
"50:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x11, #1, 51f\n"
- "str h8, [x9], #0x2\n"
- "str h12, [x26], #0x2\n"
- "tbz x11, #0, 52f\n"
- "st1 { v8.b }[2], [x9]\n"
- "st1 { v12.b }[2], [x26]\n"
+ "tbz x10, #1, 51f\n"
+ "str h8, [x11], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[2], [x11]\n"
+ "st1 { v12.b }[2], [x25]\n"
"b 52f\n"
"51:" // Height 2: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
- "str b12, [x26, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
"52:" // Height 2: Partial direct writeback: Done
"b 54f\n"
"53:" // Height 2: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q12, [x26, #0x0]\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q12, [x25, #0x0]\n"
"54:" // Height 2: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 29b\n"
"b 164f\n"
"55:" // Height 3
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"56:" // Height 3: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -800,8 +800,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"mov x28, #0x0\n"
"58:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -825,8 +825,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q1, [x25, #0x0]\n"
"cmp x27, #0x20\n"
"ldr q2, [x24, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 62f\n"
"61:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
@@ -834,77 +834,77 @@ void a64_hybrid_s8qs_dot_6x16 (
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q21, [x10, #0x20]\n"
+ "ldr q21, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x25, x25, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q20, [x10, #0x30]\n"
+ "ldr q20, [x9, #0x30]\n"
"add x24, x24, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q21, [x9, #0x40]\n"
".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x10, #0x50]\n"
+ "ldr q20, [x9, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x10, #0x60]\n"
+ "ldr q21, [x9, #0x60]\n"
".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x10, #0x70]\n"
+ "ldr q20, [x9, #0x70]\n"
".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x10, #0x80]\n"
+ "ldr q21, [x9, #0x80]\n"
".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x10, #0x90]\n"
+ "ldr q20, [x9, #0x90]\n"
".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x10, #0xa0]\n"
+ "ldr q21, [x9, #0xa0]\n"
".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x10, #0xb0]\n"
+ "ldr q20, [x9, #0xb0]\n"
".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x10, #0xc0]\n"
+ "ldr q21, [x9, #0xc0]\n"
".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x10, #0xd0]\n"
+ "ldr q20, [x9, #0xd0]\n"
".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
- "ldr q21, [x10, #0xe0]\n"
+ "ldr q21, [x9, #0xe0]\n"
".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
- "ldr q20, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q20, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"ldr q2, [x24, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
"bge 61b\n"
"62:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
@@ -912,65 +912,65 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q21, [x10, #0x20]\n"
+ "ldr q21, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x24, x24, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q20, [x10, #0x30]\n"
+ "ldr q20, [x9, #0x30]\n"
"sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q21, [x9, #0x40]\n"
".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x10, #0x50]\n"
+ "ldr q20, [x9, #0x50]\n"
".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x10, #0x60]\n"
+ "ldr q21, [x9, #0x60]\n"
".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x10, #0x70]\n"
+ "ldr q20, [x9, #0x70]\n"
".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x10, #0x80]\n"
+ "ldr q21, [x9, #0x80]\n"
".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x10, #0x90]\n"
+ "ldr q20, [x9, #0x90]\n"
".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x10, #0xa0]\n"
+ "ldr q21, [x9, #0xa0]\n"
".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x10, #0xb0]\n"
+ "ldr q20, [x9, #0xb0]\n"
".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x10, #0xc0]\n"
+ "ldr q21, [x9, #0xc0]\n"
".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x10, #0xd0]\n"
+ "ldr q20, [x9, #0xd0]\n"
".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
- "ldr q21, [x10, #0xe0]\n"
+ "ldr q21, [x9, #0xe0]\n"
".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
- "ldr q20, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q20, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
@@ -985,19 +985,19 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr s24, [x26], #0x4\n"
"ldr s23, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr q21, [x10, #0x0]\n"
"cmp x27, #0x4\n"
- "ldr q20, [x10, #0x10]\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x9, #0x0]\n"
".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
+ "ldr q20, [x9, #0x10]\n"
".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
- "ldr q21, [x10, #0x20]\n"
+ "ldr q21, [x9, #0x20]\n"
".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
- "ldr q20, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q20, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
@@ -1021,17 +1021,17 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr b1, [x25, #0x0]\n"
"ldr b2, [x24, #0x0]\n"
"67:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q21, [x10, #0x0]\n"
- "ldr q20, [x10, #0x10]\n"
+ "ldr q21, [x9, #0x0]\n"
+ "ldr q20, [x9, #0x10]\n"
".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x10, #0x20]\n"
+ "ldr q21, [x9, #0x20]\n"
".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q20, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
@@ -1045,23 +1045,23 @@ void a64_hybrid_s8qs_dot_6x16 (
"bne 58b\n"
"ldr q23, [x14, #0x0]\n"
"ldr q22, [x14, #0x10]\n"
- "ldr q21, [x14, #0x20]\n"
- "ldr q20, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add v8.4s, v8.4s, v23.4s\n"
"add v9.4s, v9.4s, v22.4s\n"
- "add v12.4s, v12.4s, v23.4s\n"
- "add v13.4s, v13.4s, v22.4s\n"
+ "ldr q21, [x14, #0x20]\n"
+ "ldr q20, [x14, #0x30]\n"
"add v10.4s, v10.4s, v21.4s\n"
"add v11.4s, v11.4s, v20.4s\n"
- "add x26, x9, x20\n"
- "add x25, x26, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add v12.4s, v12.4s, v23.4s\n"
+ "add v13.4s, v13.4s, v22.4s\n"
"add v14.4s, v14.4s, v21.4s\n"
"add v15.4s, v15.4s, v20.4s\n"
+ "add x14, x14, #0x40\n"
"add v16.4s, v16.4s, v23.4s\n"
"add v17.4s, v17.4s, v22.4s\n"
"add v18.4s, v18.4s, v21.4s\n"
@@ -1079,9 +1079,9 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 70f\n"
"69:" // Height 3: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -1112,11 +1112,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v23.4s\n"
- "and v23.16b, v12.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v22.4s\n"
- "and v22.16b, v13.16b, v1.16b\n"
"sqadd v10.4s, v10.4s, v21.4s\n"
"sqadd v11.4s, v11.4s, v20.4s\n"
+ "and v23.16b, v12.16b, v0.16b\n"
+ "and v22.16b, v13.16b, v1.16b\n"
"and v21.16b, v14.16b, v2.16b\n"
"and v20.16b, v15.16b, v3.16b\n"
"sshr v23.4s, v23.4s, #0x1f\n"
@@ -1124,11 +1124,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sqadd v12.4s, v12.4s, v23.4s\n"
- "and v23.16b, v16.16b, v0.16b\n"
"sqadd v13.4s, v13.4s, v22.4s\n"
- "and v22.16b, v17.16b, v1.16b\n"
"sqadd v14.4s, v14.4s, v21.4s\n"
"sqadd v15.4s, v15.4s, v20.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v1.16b\n"
"and v21.16b, v18.16b, v2.16b\n"
"and v20.16b, v19.16b, v3.16b\n"
"sshr v23.4s, v23.4s, #0x1f\n"
@@ -1140,21 +1140,21 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v18.4s, v18.4s, v21.4s\n"
"sqadd v19.4s, v19.4s, v20.4s\n"
"71:" // Height 3: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v22.4s }, [x21]\n"
- "ld1r { v21.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
"ld1r { v20.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v1.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
@@ -1205,97 +1205,97 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v12.16b, v12.16b, v20.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 80f\n"
- "tbz x11, #3, 75f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "tbz x11, #2, 73f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "tbz x11, #1, 72f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "st1 { v12.h }[6], [x26], #0x2\n"
- "st1 { v16.h }[6], [x25], #0x2\n"
- "tbz x11, #0, 79f\n"
- "st1 { v8.b }[14], [x9]\n"
- "st1 { v12.b }[14], [x26]\n"
- "st1 { v16.b }[14], [x25]\n"
+ "tbz x10, #3, 75f\n"
+ "str d8, [x11], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x10, #2, 73f\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "tbz x10, #1, 72f\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[14], [x11]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
"b 79f\n"
"72:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x11, #0, 79f\n"
- "st1 { v8.b }[12], [x9]\n"
- "st1 { v12.b }[12], [x26]\n"
- "st1 { v16.b }[12], [x25]\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[12], [x11]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
"b 79f\n"
"73:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x11, #1, 74f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "st1 { v12.h }[4], [x26], #0x2\n"
- "st1 { v16.h }[4], [x25], #0x2\n"
- "tbz x11, #0, 79f\n"
- "st1 { v8.b }[10], [x9]\n"
- "st1 { v12.b }[10], [x26]\n"
- "st1 { v16.b }[10], [x25]\n"
+ "tbz x10, #1, 74f\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[10], [x11]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
"b 79f\n"
"74:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x11, #0, 79f\n"
- "st1 { v8.b }[8], [x9]\n"
- "st1 { v12.b }[8], [x26]\n"
- "st1 { v16.b }[8], [x25]\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[8], [x11]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
"b 79f\n"
"75:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x11, #2, 77f\n"
- "str s8, [x9], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "tbz x11, #1, 76f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x26], #0x2\n"
- "st1 { v16.h }[2], [x25], #0x2\n"
- "tbz x11, #0, 79f\n"
- "st1 { v8.b }[6], [x9]\n"
- "st1 { v12.b }[6], [x26]\n"
- "st1 { v16.b }[6], [x25]\n"
+ "tbz x10, #2, 77f\n"
+ "str s8, [x11], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "tbz x10, #1, 76f\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[6], [x11]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
"b 79f\n"
"76:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x11, #0, 79f\n"
- "st1 { v8.b }[4], [x9]\n"
- "st1 { v12.b }[4], [x26]\n"
- "st1 { v16.b }[4], [x25]\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[4], [x11]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
"b 79f\n"
"77:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x11, #1, 78f\n"
- "str h8, [x9], #0x2\n"
- "str h12, [x26], #0x2\n"
- "str h16, [x25], #0x2\n"
- "tbz x11, #0, 79f\n"
- "st1 { v8.b }[2], [x9]\n"
- "st1 { v12.b }[2], [x26]\n"
- "st1 { v16.b }[2], [x25]\n"
+ "tbz x10, #1, 78f\n"
+ "str h8, [x11], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[2], [x11]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
"b 79f\n"
"78:" // Height 3: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
- "str b12, [x26, #0x0]\n"
- "str b16, [x25, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
"79:" // Height 3: Partial direct writeback: Done
"b 81f\n"
"80:" // Height 3: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q12, [x26, #0x0]\n"
- "str q16, [x25, #0x0]\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
"81:" // Height 3: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 56b\n"
"b 164f\n"
"82:" // Height 4
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"83:" // Height 4: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -1317,8 +1317,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"mov x28, #0x0\n"
"85:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 86f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1346,8 +1346,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x27, #0x20\n"
"ldr q2, [x24, #0x0]\n"
"ldr q3, [x23, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 89f\n"
"88:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
@@ -1356,7 +1356,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
"add x25, x25, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1364,7 +1364,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x23, x23, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q24, [x10, #0x30]\n"
+ "ldr q24, [x9, #0x30]\n"
"cmp x27, #0x20\n"
".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
@@ -1372,70 +1372,70 @@ void a64_hybrid_s8qs_dot_6x16 (
"prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x10, #0x40]\n"
+ "ldr q25, [x9, #0x40]\n"
"prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x10, #0x50]\n"
+ "ldr q24, [x9, #0x50]\n"
".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x10, #0x60]\n"
+ "ldr q25, [x9, #0x60]\n"
".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x10, #0x70]\n"
+ "ldr q24, [x9, #0x70]\n"
".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x10, #0x80]\n"
+ "ldr q25, [x9, #0x80]\n"
".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x10, #0x90]\n"
+ "ldr q24, [x9, #0x90]\n"
".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x10, #0xa0]\n"
+ "ldr q25, [x9, #0xa0]\n"
".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x10, #0xb0]\n"
+ "ldr q24, [x9, #0xb0]\n"
".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x10, #0xc0]\n"
+ "ldr q25, [x9, #0xc0]\n"
".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x10, #0xd0]\n"
+ "ldr q24, [x9, #0xd0]\n"
".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
- "ldr q25, [x10, #0xe0]\n"
+ "ldr q25, [x9, #0xe0]\n"
".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
- "ldr q24, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
@@ -1444,7 +1444,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q2, [x24, #0x0]\n"
".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"ldr q3, [x23, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
"bge 88b\n"
"89:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
@@ -1453,80 +1453,80 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x25, x25, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
"add x24, x24, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q24, [x10, #0x30]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x10, #0x40]\n"
+ "ldr q25, [x9, #0x40]\n"
"prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x10, #0x50]\n"
+ "ldr q24, [x9, #0x50]\n"
".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x10, #0x60]\n"
+ "ldr q25, [x9, #0x60]\n"
".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x10, #0x70]\n"
+ "ldr q24, [x9, #0x70]\n"
".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x10, #0x80]\n"
+ "ldr q25, [x9, #0x80]\n"
".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x10, #0x90]\n"
+ "ldr q24, [x9, #0x90]\n"
".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x10, #0xa0]\n"
+ "ldr q25, [x9, #0xa0]\n"
".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x10, #0xb0]\n"
+ "ldr q24, [x9, #0xb0]\n"
".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x10, #0xc0]\n"
+ "ldr q25, [x9, #0xc0]\n"
".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x10, #0xd0]\n"
+ "ldr q24, [x9, #0xd0]\n"
".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
- "ldr q25, [x10, #0xe0]\n"
+ "ldr q25, [x9, #0xe0]\n"
".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
- "ldr q24, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
@@ -1543,22 +1543,22 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr s29, [x26], #0x4\n"
"ldr s28, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s27, [x24], #0x4\n"
"ldr s26, [x23], #0x4\n"
- "cmp x27, #0x4\n"
- "ldr q25, [x10, #0x0]\n"
- "ldr q24, [x10, #0x10]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
- "ldr q24, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q24, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
@@ -1587,19 +1587,19 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr b2, [x24, #0x0]\n"
"ldr b3, [x23, #0x0]\n"
"94:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q25, [x10, #0x0]\n"
- "ldr q24, [x10, #0x10]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q24, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
@@ -1615,24 +1615,24 @@ void a64_hybrid_s8qs_dot_6x16 (
"bne 85b\n"
"ldr q27, [x14, #0x0]\n"
"ldr q26, [x14, #0x10]\n"
- "ldr q25, [x14, #0x20]\n"
- "ldr q24, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add v8.4s, v8.4s, v27.4s\n"
"add v9.4s, v9.4s, v26.4s\n"
- "add v12.4s, v12.4s, v27.4s\n"
- "add v13.4s, v13.4s, v26.4s\n"
+ "ldr q25, [x14, #0x20]\n"
+ "ldr q24, [x14, #0x30]\n"
"add v10.4s, v10.4s, v25.4s\n"
"add v11.4s, v11.4s, v24.4s\n"
- "add x26, x9, x20\n"
- "add x25, x26, x20\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
"add x24, x25, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x23, x24, x20\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add v14.4s, v14.4s, v25.4s\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "add v12.4s, v12.4s, v27.4s\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add v13.4s, v13.4s, v26.4s\n"
+ "add v14.4s, v14.4s, v25.4s\n"
+ "add x14, x14, #0x40\n"
"add v15.4s, v15.4s, v24.4s\n"
"add v16.4s, v16.4s, v27.4s\n"
"add v17.4s, v17.4s, v26.4s\n"
@@ -1655,9 +1655,9 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 97f\n"
"96:" // Height 4: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -1692,11 +1692,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v25.4s, v25.4s, #0x1f\n"
"sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v27.4s\n"
- "and v27.16b, v12.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v26.4s\n"
- "and v26.16b, v13.16b, v1.16b\n"
"sqadd v10.4s, v10.4s, v25.4s\n"
"sqadd v11.4s, v11.4s, v24.4s\n"
+ "and v27.16b, v12.16b, v0.16b\n"
+ "and v26.16b, v13.16b, v1.16b\n"
"and v25.16b, v14.16b, v2.16b\n"
"and v24.16b, v15.16b, v3.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
@@ -1704,11 +1704,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v25.4s, v25.4s, #0x1f\n"
"sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v12.4s, v12.4s, v27.4s\n"
- "and v27.16b, v16.16b, v0.16b\n"
"sqadd v13.4s, v13.4s, v26.4s\n"
- "and v26.16b, v17.16b, v1.16b\n"
"sqadd v14.4s, v14.4s, v25.4s\n"
"sqadd v15.4s, v15.4s, v24.4s\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v1.16b\n"
"and v25.16b, v18.16b, v2.16b\n"
"and v24.16b, v19.16b, v3.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
@@ -1716,11 +1716,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v25.4s, v25.4s, #0x1f\n"
"sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v27.4s\n"
- "and v27.16b, v20.16b, v0.16b\n"
"sqadd v17.4s, v17.4s, v26.4s\n"
- "and v26.16b, v21.16b, v1.16b\n"
"sqadd v18.4s, v18.4s, v25.4s\n"
"sqadd v19.4s, v19.4s, v24.4s\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v1.16b\n"
"and v25.16b, v22.16b, v2.16b\n"
"and v24.16b, v23.16b, v3.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
@@ -1732,21 +1732,21 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v22.4s, v22.4s, v25.4s\n"
"sqadd v23.4s, v23.4s, v24.4s\n"
"98:" // Height 4: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x21]\n"
- "ld1r { v25.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
"ld1r { v24.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v1.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
@@ -1816,113 +1816,113 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v16.16b, v16.16b, v18.16b\n"
"uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 107f\n"
- "tbz x11, #3, 102f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "tbz x11, #2, 100f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "tbz x11, #1, 99f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "st1 { v12.h }[6], [x26], #0x2\n"
- "st1 { v16.h }[6], [x25], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "tbz x11, #0, 106f\n"
- "st1 { v8.b }[14], [x9]\n"
- "st1 { v12.b }[14], [x26]\n"
- "st1 { v16.b }[14], [x25]\n"
- "st1 { v20.b }[14], [x24]\n"
+ "tbz x10, #3, 102f\n"
+ "str d8, [x11], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x10, #2, 100f\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x10, #1, 99f\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[14], [x11]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 106f\n"
"99:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x11, #0, 106f\n"
- "st1 { v8.b }[12], [x9]\n"
- "st1 { v12.b }[12], [x26]\n"
- "st1 { v16.b }[12], [x25]\n"
- "st1 { v20.b }[12], [x24]\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[12], [x11]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 106f\n"
"100:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x11, #1, 101f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "st1 { v12.h }[4], [x26], #0x2\n"
- "st1 { v16.h }[4], [x25], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "tbz x11, #0, 106f\n"
- "st1 { v8.b }[10], [x9]\n"
- "st1 { v12.b }[10], [x26]\n"
- "st1 { v16.b }[10], [x25]\n"
- "st1 { v20.b }[10], [x24]\n"
+ "tbz x10, #1, 101f\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[10], [x11]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 106f\n"
"101:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x11, #0, 106f\n"
- "st1 { v8.b }[8], [x9]\n"
- "st1 { v12.b }[8], [x26]\n"
- "st1 { v16.b }[8], [x25]\n"
- "st1 { v20.b }[8], [x24]\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[8], [x11]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 106f\n"
"102:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x11, #2, 104f\n"
- "str s8, [x9], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "tbz x11, #1, 103f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x26], #0x2\n"
- "st1 { v16.h }[2], [x25], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "tbz x11, #0, 106f\n"
- "st1 { v8.b }[6], [x9]\n"
- "st1 { v12.b }[6], [x26]\n"
- "st1 { v16.b }[6], [x25]\n"
- "st1 { v20.b }[6], [x24]\n"
+ "tbz x10, #2, 104f\n"
+ "str s8, [x11], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x10, #1, 103f\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[6], [x11]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 106f\n"
"103:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x11, #0, 106f\n"
- "st1 { v8.b }[4], [x9]\n"
- "st1 { v12.b }[4], [x26]\n"
- "st1 { v16.b }[4], [x25]\n"
- "st1 { v20.b }[4], [x24]\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[4], [x11]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 106f\n"
"104:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x11, #1, 105f\n"
- "str h8, [x9], #0x2\n"
- "str h12, [x26], #0x2\n"
- "str h16, [x25], #0x2\n"
- "str h20, [x24], #0x2\n"
- "tbz x11, #0, 106f\n"
- "st1 { v8.b }[2], [x9]\n"
- "st1 { v12.b }[2], [x26]\n"
- "st1 { v16.b }[2], [x25]\n"
- "st1 { v20.b }[2], [x24]\n"
+ "tbz x10, #1, 105f\n"
+ "str h8, [x11], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[2], [x11]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 106f\n"
"105:" // Height 4: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
- "str b12, [x26, #0x0]\n"
- "str b16, [x25, #0x0]\n"
- "str b20, [x24, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"106:" // Height 4: Partial direct writeback: Done
"b 108f\n"
"107:" // Height 4: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q12, [x26, #0x0]\n"
- "str q16, [x25, #0x0]\n"
- "str q20, [x24, #0x0]\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"108:" // Height 4: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 83b\n"
"b 164f\n"
"109:" // Height 5
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"110:" // Height 5: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -1948,8 +1948,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"mov x28, #0x0\n"
"112:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 113f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1981,8 +1981,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q2, [x24, #0x0]\n"
"ldr q3, [x23, #0x0]\n"
"ldr q4, [x22, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 116f\n"
"115:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
@@ -1994,7 +1994,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q29, [x10, #0x20]\n"
+ "ldr q29, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x23, x23, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2003,7 +2003,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x27, #0x20\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q28, [x10, #0x30]\n"
+ "ldr q28, [x9, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
@@ -2014,80 +2014,80 @@ void a64_hybrid_s8qs_dot_6x16 (
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x10, #0x40]\n"
+ "ldr q29, [x9, #0x40]\n"
".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x10, #0x50]\n"
+ "ldr q28, [x9, #0x50]\n"
".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x10, #0x60]\n"
+ "ldr q29, [x9, #0x60]\n"
".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x10, #0x70]\n"
+ "ldr q28, [x9, #0x70]\n"
".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x10, #0x80]\n"
+ "ldr q29, [x9, #0x80]\n"
".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x10, #0x90]\n"
+ "ldr q28, [x9, #0x90]\n"
".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x10, #0xa0]\n"
+ "ldr q29, [x9, #0xa0]\n"
".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x10, #0xb0]\n"
+ "ldr q28, [x9, #0xb0]\n"
".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x10, #0xc0]\n"
+ "ldr q29, [x9, #0xc0]\n"
".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x10, #0xd0]\n"
+ "ldr q28, [x9, #0xd0]\n"
".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
- "ldr q29, [x10, #0xe0]\n"
+ "ldr q29, [x9, #0xe0]\n"
".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
- "ldr q28, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q28, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
@@ -2098,7 +2098,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q3, [x23, #0x0]\n"
".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"ldr q4, [x22, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
"bge 115b\n"
"116:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
@@ -2110,17 +2110,17 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q29, [x10, #0x20]\n"
+ "ldr q29, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x22, x22, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "sub x27, x27, #0x10\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q28, [x10, #0x30]\n"
- "sub x27, x27, #0x10\n"
+ "ldr q28, [x9, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
@@ -2129,74 +2129,74 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x10, #0x40]\n"
+ "ldr q29, [x9, #0x40]\n"
".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x10, #0x50]\n"
+ "ldr q28, [x9, #0x50]\n"
".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x10, #0x60]\n"
+ "ldr q29, [x9, #0x60]\n"
".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x10, #0x70]\n"
+ "ldr q28, [x9, #0x70]\n"
".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x10, #0x80]\n"
+ "ldr q29, [x9, #0x80]\n"
".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x10, #0x90]\n"
+ "ldr q28, [x9, #0x90]\n"
".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x10, #0xa0]\n"
+ "ldr q29, [x9, #0xa0]\n"
".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x10, #0xb0]\n"
+ "ldr q28, [x9, #0xb0]\n"
".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x10, #0xc0]\n"
+ "ldr q29, [x9, #0xc0]\n"
".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x10, #0xd0]\n"
+ "ldr q28, [x9, #0xd0]\n"
".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
- "ldr q29, [x10, #0xe0]\n"
+ "ldr q29, [x9, #0xe0]\n"
".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
- "ldr q28, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q28, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
@@ -2215,25 +2215,25 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr s2, [x26], #0x4\n"
"ldr s1, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s0, [x24], #0x4\n"
"ldr s31, [x23], #0x4\n"
- "cmp x27, #0x4\n"
"ldr s30, [x22], #0x4\n"
- "ldr q29, [x10, #0x0]\n"
- "ldr q28, [x10, #0x10]\n"
+ "ldr q29, [x9, #0x0]\n"
".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ "ldr q28, [x9, #0x10]\n"
".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
- "ldr q29, [x10, #0x20]\n"
+ "ldr q29, [x9, #0x20]\n"
".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
- "ldr q28, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q28, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
@@ -2267,21 +2267,21 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr b3, [x23, #0x0]\n"
"ldr b4, [x22, #0x0]\n"
"121:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q29, [x10, #0x0]\n"
- "ldr q28, [x10, #0x10]\n"
+ "ldr q29, [x9, #0x0]\n"
+ "ldr q28, [x9, #0x10]\n"
".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x10, #0x20]\n"
+ "ldr q29, [x9, #0x20]\n"
".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q28, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
@@ -2299,28 +2299,28 @@ void a64_hybrid_s8qs_dot_6x16 (
"bne 112b\n"
"ldr q31, [x14, #0x0]\n"
"ldr q30, [x14, #0x10]\n"
- "ldr q29, [x14, #0x20]\n"
- "ldr q28, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add v8.4s, v8.4s, v31.4s\n"
"add v9.4s, v9.4s, v30.4s\n"
- "add v12.4s, v12.4s, v31.4s\n"
- "add v13.4s, v13.4s, v30.4s\n"
+ "ldr q29, [x14, #0x20]\n"
+ "ldr q28, [x14, #0x30]\n"
"add v10.4s, v10.4s, v29.4s\n"
"add v11.4s, v11.4s, v28.4s\n"
- "add x26, x9, x20\n"
- "add x25, x26, x20\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
"add x24, x25, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add v14.4s, v14.4s, v29.4s\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"add x23, x24, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add v12.4s, v12.4s, v31.4s\n"
+ "add v13.4s, v13.4s, v30.4s\n"
+ "add v14.4s, v14.4s, v29.4s\n"
"add v15.4s, v15.4s, v28.4s\n"
+ "add x14, x14, #0x40\n"
"add v16.4s, v16.4s, v31.4s\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"add v17.4s, v17.4s, v30.4s\n"
"add v18.4s, v18.4s, v29.4s\n"
"add v19.4s, v19.4s, v28.4s\n"
@@ -2345,9 +2345,9 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 124f\n"
"123:" // Height 5: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -2386,11 +2386,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v31.4s\n"
- "and v31.16b, v12.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v30.4s\n"
- "and v30.16b, v13.16b, v1.16b\n"
"sqadd v10.4s, v10.4s, v29.4s\n"
"sqadd v11.4s, v11.4s, v28.4s\n"
+ "and v31.16b, v12.16b, v0.16b\n"
+ "and v30.16b, v13.16b, v1.16b\n"
"and v29.16b, v14.16b, v2.16b\n"
"and v28.16b, v15.16b, v3.16b\n"
"sshr v31.4s, v31.4s, #0x1f\n"
@@ -2398,11 +2398,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sqadd v12.4s, v12.4s, v31.4s\n"
- "and v31.16b, v16.16b, v0.16b\n"
"sqadd v13.4s, v13.4s, v30.4s\n"
- "and v30.16b, v17.16b, v1.16b\n"
"sqadd v14.4s, v14.4s, v29.4s\n"
"sqadd v15.4s, v15.4s, v28.4s\n"
+ "and v31.16b, v16.16b, v0.16b\n"
+ "and v30.16b, v17.16b, v1.16b\n"
"and v29.16b, v18.16b, v2.16b\n"
"and v28.16b, v19.16b, v3.16b\n"
"sshr v31.4s, v31.4s, #0x1f\n"
@@ -2410,11 +2410,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v31.4s\n"
- "and v31.16b, v20.16b, v0.16b\n"
"sqadd v17.4s, v17.4s, v30.4s\n"
- "and v30.16b, v21.16b, v1.16b\n"
"sqadd v18.4s, v18.4s, v29.4s\n"
"sqadd v19.4s, v19.4s, v28.4s\n"
+ "and v31.16b, v20.16b, v0.16b\n"
+ "and v30.16b, v21.16b, v1.16b\n"
"and v29.16b, v22.16b, v2.16b\n"
"and v28.16b, v23.16b, v3.16b\n"
"sshr v31.4s, v31.4s, #0x1f\n"
@@ -2422,11 +2422,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sqadd v20.4s, v20.4s, v31.4s\n"
- "and v31.16b, v24.16b, v0.16b\n"
"sqadd v21.4s, v21.4s, v30.4s\n"
- "and v30.16b, v25.16b, v1.16b\n"
"sqadd v22.4s, v22.4s, v29.4s\n"
"sqadd v23.4s, v23.4s, v28.4s\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v1.16b\n"
"and v29.16b, v26.16b, v2.16b\n"
"and v28.16b, v27.16b, v3.16b\n"
"sshr v31.4s, v31.4s, #0x1f\n"
@@ -2438,21 +2438,21 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v26.4s, v26.4s, v29.4s\n"
"sqadd v27.4s, v27.4s, v28.4s\n"
"125:" // Height 5: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v30.4s }, [x21]\n"
- "ld1r { v29.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
"ld1r { v28.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v1.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
@@ -2541,133 +2541,132 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v20.16b, v20.16b, v18.16b\n"
"uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 134f\n"
- "tbz x11, #3, 129f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x11, #2, 127f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x11, #1, 126f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "st1 { v12.h }[6], [x26], #0x2\n"
- "st1 { v16.h }[6], [x25], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "tbz x11, #0, 133f\n"
- "st1 { v8.b }[14], [x9]\n"
- "st1 { v12.b }[14], [x26]\n"
- "st1 { v16.b }[14], [x25]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
+ "tbz x10, #3, 129f\n"
+ "str d8, [x11], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x10, #2, 127f\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x10, #1, 126f\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[14], [x11]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 133f\n"
"126:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x11, #0, 133f\n"
- "st1 { v8.b }[12], [x9]\n"
- "st1 { v12.b }[12], [x26]\n"
- "st1 { v16.b }[12], [x25]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[12], [x11]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 133f\n"
"127:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x11, #1, 128f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "st1 { v12.h }[4], [x26], #0x2\n"
- "st1 { v16.h }[4], [x25], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "tbz x11, #0, 133f\n"
- "st1 { v8.b }[10], [x9]\n"
- "st1 { v12.b }[10], [x26]\n"
- "st1 { v16.b }[10], [x25]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
+ "tbz x10, #1, 128f\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[10], [x11]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 133f\n"
"128:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x11, #0, 133f\n"
- "st1 { v8.b }[8], [x9]\n"
- "st1 { v12.b }[8], [x26]\n"
- "st1 { v16.b }[8], [x25]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[8], [x11]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 133f\n"
"129:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x11, #2, 131f\n"
- "str s8, [x9], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x11, #1, 130f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x26], #0x2\n"
- "st1 { v16.h }[2], [x25], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "tbz x11, #0, 133f\n"
- "st1 { v8.b }[6], [x9]\n"
- "st1 { v12.b }[6], [x26]\n"
- "st1 { v16.b }[6], [x25]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
+ "tbz x10, #2, 131f\n"
+ "str s8, [x11], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x10, #1, 130f\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[6], [x11]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 133f\n"
"130:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x11, #0, 133f\n"
- "st1 { v8.b }[4], [x9]\n"
- "st1 { v12.b }[4], [x26]\n"
- "st1 { v16.b }[4], [x25]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[4], [x11]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 133f\n"
"131:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x11, #1, 132f\n"
- "str h8, [x9], #0x2\n"
- "str h12, [x26], #0x2\n"
- "str h16, [x25], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "tbz x11, #0, 133f\n"
- "st1 { v8.b }[2], [x9]\n"
- "st1 { v12.b }[2], [x26]\n"
- "st1 { v16.b }[2], [x25]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
+ "tbz x10, #1, 132f\n"
+ "str h8, [x11], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[2], [x11]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 133f\n"
"132:" // Height 5: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
- "str b12, [x26, #0x0]\n"
- "str b16, [x25, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"133:" // Height 5: Partial direct writeback: Done
"b 135f\n"
"134:" // Height 5: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q12, [x26, #0x0]\n"
- "str q16, [x25, #0x0]\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"135:" // Height 5: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 110b\n"
"b 164f\n"
"136:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x6\n"
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"137:" // Height 6: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -2697,8 +2696,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"mov x28, #0x0\n"
"139:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 140f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2734,8 +2733,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q3, [x23, #0x0]\n"
"ldr q4, [x22, #0x0]\n"
"ldr q5, [x21, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 143f\n"
"142:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
@@ -2748,7 +2747,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x24, x24, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
"add x23, x23, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2760,7 +2759,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
"prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
@@ -2772,92 +2771,92 @@ void a64_hybrid_s8qs_dot_6x16 (
"prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
@@ -2870,7 +2869,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q4, [x22, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
"ldr q5, [x21, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
"bge 142b\n"
"143:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
@@ -2883,108 +2882,108 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x23, x23, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
"add x22, x22, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
"prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
@@ -3005,28 +3004,28 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr s7, [x26], #0x4\n"
"ldr s6, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s5, [x24], #0x4\n"
"ldr s4, [x23], #0x4\n"
- "cmp x27, #0x4\n"
"ldr s3, [x22], #0x4\n"
"ldr s2, [x21], #0x4\n"
- "ldr q1, [x10, #0x0]\n"
- "ldr q0, [x10, #0x10]\n"
+ "ldr q1, [x9, #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
- "ldr q1, [x10, #0x20]\n"
+ "ldr q1, [x9, #0x20]\n"
".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
- "ldr q0, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q0, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
@@ -3065,23 +3064,23 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr b4, [x22, #0x0]\n"
"ldr b5, [x21, #0x0]\n"
"148:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x20]\n"
+ "ldr q7, [x9, #0x20]\n"
".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
+ "ldr q6, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
@@ -3101,32 +3100,32 @@ void a64_hybrid_s8qs_dot_6x16 (
"bne 139b\n"
"ldr q3, [x14, #0x0]\n"
"ldr q2, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q0, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add v8.4s, v8.4s, v3.4s\n"
"add v9.4s, v9.4s, v2.4s\n"
- "add v12.4s, v12.4s, v3.4s\n"
- "add v13.4s, v13.4s, v2.4s\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q0, [x14, #0x30]\n"
"add v10.4s, v10.4s, v1.4s\n"
"add v11.4s, v11.4s, v0.4s\n"
- "add x26, x9, x20\n"
- "add x25, x26, x20\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
"add x24, x25, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add v14.4s, v14.4s, v1.4s\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"add x23, x24, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add v12.4s, v12.4s, v3.4s\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add v13.4s, v13.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v1.4s\n"
+ "add x14, x14, #0x40\n"
"add v15.4s, v15.4s, v0.4s\n"
"add v16.4s, v16.4s, v3.4s\n"
- "add x22, x23, x20\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"add v17.4s, v17.4s, v2.4s\n"
"add v18.4s, v18.4s, v1.4s\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"add v19.4s, v19.4s, v0.4s\n"
"add v20.4s, v20.4s, v3.4s\n"
"add v21.4s, v21.4s, v2.4s\n"
@@ -3153,9 +3152,9 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 151f\n"
"150:" // Height 6: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -3198,11 +3197,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v7.4s\n"
- "and v7.16b, v12.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v6.4s\n"
- "and v6.16b, v13.16b, v1.16b\n"
"sqadd v10.4s, v10.4s, v5.4s\n"
"sqadd v11.4s, v11.4s, v4.4s\n"
+ "and v7.16b, v12.16b, v0.16b\n"
+ "and v6.16b, v13.16b, v1.16b\n"
"and v5.16b, v14.16b, v2.16b\n"
"and v4.16b, v15.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
@@ -3210,11 +3209,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v12.4s, v12.4s, v7.4s\n"
- "and v7.16b, v16.16b, v0.16b\n"
"sqadd v13.4s, v13.4s, v6.4s\n"
- "and v6.16b, v17.16b, v1.16b\n"
"sqadd v14.4s, v14.4s, v5.4s\n"
"sqadd v15.4s, v15.4s, v4.4s\n"
+ "and v7.16b, v16.16b, v0.16b\n"
+ "and v6.16b, v17.16b, v1.16b\n"
"and v5.16b, v18.16b, v2.16b\n"
"and v4.16b, v19.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
@@ -3222,11 +3221,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v7.4s\n"
- "and v7.16b, v20.16b, v0.16b\n"
"sqadd v17.4s, v17.4s, v6.4s\n"
- "and v6.16b, v21.16b, v1.16b\n"
"sqadd v18.4s, v18.4s, v5.4s\n"
"sqadd v19.4s, v19.4s, v4.4s\n"
+ "and v7.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v1.16b\n"
"and v5.16b, v22.16b, v2.16b\n"
"and v4.16b, v23.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
@@ -3234,11 +3233,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v20.4s, v20.4s, v7.4s\n"
- "and v7.16b, v24.16b, v0.16b\n"
"sqadd v21.4s, v21.4s, v6.4s\n"
- "and v6.16b, v25.16b, v1.16b\n"
"sqadd v22.4s, v22.4s, v5.4s\n"
"sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v7.16b, v24.16b, v0.16b\n"
+ "and v6.16b, v25.16b, v1.16b\n"
"and v5.16b, v26.16b, v2.16b\n"
"and v4.16b, v27.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
@@ -3246,11 +3245,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v24.4s, v24.4s, v7.4s\n"
- "and v7.16b, v28.16b, v0.16b\n"
"sqadd v25.4s, v25.4s, v6.4s\n"
- "and v6.16b, v29.16b, v1.16b\n"
"sqadd v26.4s, v26.4s, v5.4s\n"
"sqadd v27.4s, v27.4s, v4.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
+ "and v6.16b, v29.16b, v1.16b\n"
"and v5.16b, v30.16b, v2.16b\n"
"and v4.16b, v31.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
@@ -3262,21 +3261,21 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v30.4s, v30.4s, v5.4s\n"
"sqadd v31.4s, v31.4s, v4.4s\n"
"152:" // Height 6: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v6.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x21]\n"
- "ld1r { v5.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
"ld1r { v4.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v1.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
@@ -3384,136 +3383,136 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v24.16b, v24.16b, v18.16b\n"
"uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 161f\n"
- "tbz x11, #3, 156f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
- "tbz x11, #2, 154f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x22], #0x4\n"
- "tbz x11, #1, 153f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "st1 { v12.h }[6], [x26], #0x2\n"
- "st1 { v16.h }[6], [x25], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "st1 { v28.h }[6], [x22], #0x2\n"
- "tbz x11, #0, 160f\n"
- "st1 { v8.b }[14], [x9]\n"
- "st1 { v12.b }[14], [x26]\n"
- "st1 { v16.b }[14], [x25]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
- "st1 { v28.b }[14], [x22]\n"
+ "tbz x10, #3, 156f\n"
+ "str d8, [x11], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x10, #2, 154f\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x10, #1, 153f\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[14], [x11]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 160f\n"
"153:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x11, #0, 160f\n"
- "st1 { v8.b }[12], [x9]\n"
- "st1 { v12.b }[12], [x26]\n"
- "st1 { v16.b }[12], [x25]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
- "st1 { v28.b }[12], [x22]\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[12], [x11]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 160f\n"
"154:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x11, #1, 155f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "st1 { v12.h }[4], [x26], #0x2\n"
- "st1 { v16.h }[4], [x25], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "st1 { v28.h }[4], [x22], #0x2\n"
- "tbz x11, #0, 160f\n"
- "st1 { v8.b }[10], [x9]\n"
- "st1 { v12.b }[10], [x26]\n"
- "st1 { v16.b }[10], [x25]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
- "st1 { v28.b }[10], [x22]\n"
+ "tbz x10, #1, 155f\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[10], [x11]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 160f\n"
"155:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x11, #0, 160f\n"
- "st1 { v8.b }[8], [x9]\n"
- "st1 { v12.b }[8], [x26]\n"
- "st1 { v16.b }[8], [x25]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
- "st1 { v28.b }[8], [x22]\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[8], [x11]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 160f\n"
"156:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x11, #2, 158f\n"
- "str s8, [x9], #0x4\n"
- "str s12, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x22], #0x4\n"
- "tbz x11, #1, 157f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x26], #0x2\n"
- "st1 { v16.h }[2], [x25], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "st1 { v28.h }[2], [x22], #0x2\n"
- "tbz x11, #0, 160f\n"
- "st1 { v8.b }[6], [x9]\n"
- "st1 { v12.b }[6], [x26]\n"
- "st1 { v16.b }[6], [x25]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
- "st1 { v28.b }[6], [x22]\n"
+ "tbz x10, #2, 158f\n"
+ "str s8, [x11], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x10, #1, 157f\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[6], [x11]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 160f\n"
"157:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x11, #0, 160f\n"
- "st1 { v8.b }[4], [x9]\n"
- "st1 { v12.b }[4], [x26]\n"
- "st1 { v16.b }[4], [x25]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
- "st1 { v28.b }[4], [x22]\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[4], [x11]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 160f\n"
"158:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x11, #1, 159f\n"
- "str h8, [x9], #0x2\n"
- "str h12, [x26], #0x2\n"
- "str h16, [x25], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "str h28, [x22], #0x2\n"
- "tbz x11, #0, 160f\n"
- "st1 { v8.b }[2], [x9]\n"
- "st1 { v12.b }[2], [x26]\n"
- "st1 { v16.b }[2], [x25]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x22]\n"
+ "tbz x10, #1, 159f\n"
+ "str h8, [x11], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[2], [x11]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 160f\n"
"159:" // Height 6: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
- "str b12, [x26, #0x0]\n"
- "str b16, [x25, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "str b28, [x22, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"160:" // Height 6: Partial direct writeback: Done
"b 162f\n"
"161:" // Height 6: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q12, [x26, #0x0]\n"
- "str q16, [x25, #0x0]\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "str q28, [x22, #0x0]\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"162:" // Height 6: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 137b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 164f\n"
@@ -3527,8 +3526,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"164:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
index 494370ade7..65f654012e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
@@ -70,7 +70,7 @@ public:
return false;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 8> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 8> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
index 867bcded1f..5d34a5e9d2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
@@ -47,18 +47,18 @@ void a64_hybrid_s8qs_mmla_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -97,9 +97,9 @@ void a64_hybrid_s8qs_mmla_6x16 (
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -113,8 +113,8 @@ void a64_hybrid_s8qs_mmla_6x16 (
"mov x28, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -129,115 +129,115 @@ void a64_hybrid_s8qs_mmla_6x16 (
"cmp x27, #0x10\n"
"blt 9f\n"
"ldr q1, [x26, #0x0]\n"
- "ldr q7, [x10, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
"cmp x27, #0x20\n"
- "ldr q6, [x10, #0x10]\n"
+ "ldr q6, [x9, #0x10]\n"
"blt 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"trn1 v18.2d, v1.2d, v21.2d\n"
- "trn2 v1.2d, v1.2d, v21.2d\n"
".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
- "ldr q16, [x10, #0x30]\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x40]\n"
+ "ldr q17, [x9, #0x40]\n"
".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q16, [x9, #0x50]\n"
".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x60]\n"
+ "ldr q17, [x9, #0x60]\n"
".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x70]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x80]\n"
+ "ldr q17, [x9, #0x80]\n"
".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x90]\n"
+ "ldr q16, [x9, #0x90]\n"
".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xa0]\n"
+ "ldr q17, [x9, #0xa0]\n"
".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xb0]\n"
+ "ldr q16, [x9, #0xb0]\n"
".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xc0]\n"
+ "ldr q17, [x9, #0xc0]\n"
".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xd0]\n"
+ "ldr q16, [x9, #0xd0]\n"
".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xe0]\n"
+ "ldr q17, [x9, #0xe0]\n"
".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "cmp x27, #0x20\n"
".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
- "ldr q7, [x10, #0x0]\n"
".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
"ldr q1, [x26, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
+ "add x9, x9, #0x100\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "trn1 v18.2d, v1.2d, v16.2d\n"
- "trn2 v1.2d, v1.2d, v16.2d\n"
+ "trn1 v18.2d, v1.2d, v19.2d\n"
".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
- "ldr q16, [x10, #0x30]\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x40]\n"
+ "ldr q17, [x9, #0x40]\n"
".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q16, [x9, #0x50]\n"
".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x60]\n"
+ "ldr q17, [x9, #0x60]\n"
".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x70]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v19.2d\n"
".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x80]\n"
+ "ldr q17, [x9, #0x80]\n"
".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x90]\n"
+ "ldr q16, [x9, #0x90]\n"
".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xa0]\n"
+ "ldr q17, [x9, #0xa0]\n"
".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xb0]\n"
+ "ldr q16, [x9, #0xb0]\n"
".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xc0]\n"
+ "ldr q17, [x9, #0xc0]\n"
".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xd0]\n"
+ "ldr q16, [x9, #0xd0]\n"
".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xe0]\n"
+ "ldr q17, [x9, #0xe0]\n"
".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x9, x9, #0x100\n"
"9:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 16f\n"
"cmp x27, #0x8\n"
"blt 11f\n"
"10:" // Height 1: Multiply loop: Odd block loop
"ldr d18, [x26], #0x8\n"
- "ldr q19, [x10, #0x0]\n"
- "sub x27, x27, #0x8\n"
- "ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x8\n"
- "trn1 v18.2d, v18.2d, v17.2d\n"
- ".inst 0x4e93a648 // smmla v8.4s, v18.16b, v19.16b\n"
- "ldr q17, [x10, #0x20]\n"
- ".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x30]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "trn1 v18.2d, v18.2d, v16.2d\n"
+ "ldr q31, [x9, #0x10]\n"
+ ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e9fa64c // smmla v12.4s, v18.16b, v31.16b\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x40]\n"
+ "ldr q17, [x9, #0x40]\n"
".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q16, [x9, #0x50]\n"
".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x60]\n"
+ "ldr q17, [x9, #0x60]\n"
".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q16, [x9, #0x70]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "add x9, x9, #0x80\n"
"bge 10b\n"
"11:" // Height 1: Multiply loop: Skip odd blocks
"cbz x27, 16f\n"
@@ -261,24 +261,24 @@ void a64_hybrid_s8qs_mmla_6x16 (
"14:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b1, [x26, #0x0]\n"
"15:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q17, [x10, #0x0]\n"
- "ldr q19, [x10, #0x10]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q19, [x9, #0x10]\n"
"trn1 v18.2d, v1.2d, v16.2d\n"
".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4e93a64c // smmla v12.4s, v18.16b, v19.16b\n"
- "ldr q16, [x10, #0x30]\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x40]\n"
+ "ldr q17, [x9, #0x40]\n"
".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q16, [x9, #0x50]\n"
".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x60]\n"
+ "ldr q17, [x9, #0x60]\n"
".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q16, [x9, #0x70]\n"
".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "add x9, x9, #0x80\n"
"16:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -292,11 +292,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr q16, [x14, #0x30]\n"
"uzp1 v10.2d, v10.2d, v14.2d\n"
"uzp1 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x14, x14, #0x40\n"
"mov v15.16b, v8.16b\n"
- "add v9.4s, v9.4s, v18.4s\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"add v15.4s, v15.4s, v19.4s\n"
+ "add x14, x14, #0x40\n"
+ "add v9.4s, v9.4s, v18.4s\n"
"add v10.4s, v10.4s, v17.4s\n"
"add v11.4s, v11.4s, v16.4s\n"
"tbz %x[flags], #4, 17f\n"
@@ -312,9 +312,9 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 18f\n"
"17:" // Height 1: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -328,34 +328,34 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
"sqrdmulh v11.4s, v11.4s, v7.4s\n"
"tbz %x[flags], #5, 19f\n"
- "and v18.16b, v15.16b, v0.16b\n"
- "and v17.16b, v9.16b, v1.16b\n"
- "and v26.16b, v10.16b, v2.16b\n"
- "and v16.16b, v11.16b, v3.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v17.16b, v15.16b, v0.16b\n"
+ "and v16.16b, v9.16b, v1.16b\n"
+ "and v25.16b, v10.16b, v2.16b\n"
+ "and v18.16b, v11.16b, v3.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v18.4s\n"
- "sqadd v9.4s, v9.4s, v17.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "sqadd v11.4s, v11.4s, v16.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v17.4s\n"
+ "sqadd v9.4s, v9.4s, v16.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v18.4s\n"
"19:" // Height 1: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
"srshl v15.4s, v15.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v18.4s }, [x21]\n"
- "ld1r { v17.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
- "ld1r { v16.4s }, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add v15.4s, v15.4s, v18.4s\n"
"add v9.4s, v9.4s, v18.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
"add v10.4s, v10.4s, v18.4s\n"
"add v11.4s, v11.4s, v18.4s\n"
+ "cmp x10, #0x10\n"
"smin v15.4s, v15.4s, v17.4s\n"
"smin v9.4s, v9.4s, v17.4s\n"
"smin v10.4s, v10.4s, v17.4s\n"
@@ -368,65 +368,65 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp1 v16.8h, v10.8h, v11.8h\n"
"uzp1 v15.16b, v15.16b, v16.16b\n"
"bge 28f\n"
- "tbz x11, #3, 23f\n"
- "str d15, [x9], #0x8\n"
- "tbz x11, #2, 21f\n"
- "st1 { v15.s }[2], [x9], #0x4\n"
- "tbz x11, #1, 20f\n"
- "st1 { v15.h }[6], [x9], #0x2\n"
- "tbz x11, #0, 27f\n"
- "st1 { v15.b }[14], [x9]\n"
+ "tbz x10, #3, 23f\n"
+ "str d15, [x11], #0x8\n"
+ "tbz x10, #2, 21f\n"
+ "st1 { v15.s }[2], [x11], #0x4\n"
+ "tbz x10, #1, 20f\n"
+ "st1 { v15.h }[6], [x11], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[14], [x11]\n"
"b 27f\n"
"20:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x11, #0, 27f\n"
- "st1 { v15.b }[12], [x9]\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[12], [x11]\n"
"b 27f\n"
"21:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x11, #1, 22f\n"
- "st1 { v15.h }[4], [x9], #0x2\n"
- "tbz x11, #0, 27f\n"
- "st1 { v15.b }[10], [x9]\n"
+ "tbz x10, #1, 22f\n"
+ "st1 { v15.h }[4], [x11], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[10], [x11]\n"
"b 27f\n"
"22:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x11, #0, 27f\n"
- "st1 { v15.b }[8], [x9]\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[8], [x11]\n"
"b 27f\n"
"23:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x11, #2, 25f\n"
- "str s15, [x9], #0x4\n"
- "tbz x11, #1, 24f\n"
- "st1 { v15.h }[2], [x9], #0x2\n"
- "tbz x11, #0, 27f\n"
- "st1 { v15.b }[6], [x9]\n"
+ "tbz x10, #2, 25f\n"
+ "str s15, [x11], #0x4\n"
+ "tbz x10, #1, 24f\n"
+ "st1 { v15.h }[2], [x11], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[6], [x11]\n"
"b 27f\n"
"24:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x11, #0, 27f\n"
- "st1 { v15.b }[4], [x9]\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[4], [x11]\n"
"b 27f\n"
"25:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x11, #1, 26f\n"
- "str h15, [x9], #0x2\n"
- "tbz x11, #0, 27f\n"
- "st1 { v15.b }[2], [x9]\n"
+ "tbz x10, #1, 26f\n"
+ "str h15, [x11], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[2], [x11]\n"
"b 27f\n"
"26:" // Height 1: Partial direct writeback: partial_1_0
- "str b15, [x9, #0x0]\n"
+ "str b15, [x11, #0x0]\n"
"27:" // Height 1: Partial direct writeback: Done
"b 29f\n"
"28:" // Height 1: Full writeback
- "str q15, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
+ "str q15, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
"29:" // Height 1: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 2b\n"
"b 176f\n"
"30:" // Height 2
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"31:" // Height 2: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -440,8 +440,8 @@ void a64_hybrid_s8qs_mmla_6x16 (
"mov x28, #0x0\n"
"33:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 34f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -461,120 +461,120 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr q1, [x26, #0x0]\n"
"ldr q2, [x25, #0x0]\n"
"cmp x27, #0x20\n"
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
"blt 37f\n"
"36:" // Height 2: Multiply loop: Main loop head
"trn1 v18.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q2, [x25, #0x0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
- "ldr q16, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
- "ldr q17, [x10, #0x30]\n"
- ".inst 0x4e90a649 // smmla v9.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x40]\n"
- ".inst 0x4e91a64d // smmla v13.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x50]\n"
- ".inst 0x4e90a64a // smmla v10.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x60]\n"
- ".inst 0x4e91a64e // smmla v14.4s, v18.16b, v17.16b\n"
- "ldr q31, [x10, #0x70]\n"
- ".inst 0x4e90a64b // smmla v11.4s, v18.16b, v16.16b\n"
- "ldr q17, [x10, #0x80]\n"
- ".inst 0x4e9fa64f // smmla v15.4s, v18.16b, v31.16b\n"
- "ldr q16, [x10, #0x90]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x90]\n"
".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xa0]\n"
+ "ldr q17, [x9, #0xa0]\n"
".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xb0]\n"
+ "ldr q16, [x9, #0xb0]\n"
".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xc0]\n"
+ "ldr q17, [x9, #0xc0]\n"
".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xd0]\n"
+ "ldr q16, [x9, #0xd0]\n"
".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xe0]\n"
+ "ldr q17, [x9, #0xe0]\n"
".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
- "ldr q7, [x10, #0x0]\n"
+ "add x9, x9, #0x100\n"
+ "ldr q7, [x9, #0x0]\n"
".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
"ldr q1, [x26, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"bge 36b\n"
"37:" // Height 2: Multiply loop: Single iteration only
"trn1 v18.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
- "ldr q16, [x10, #0x30]\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x40]\n"
+ "ldr q17, [x9, #0x40]\n"
".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q16, [x9, #0x50]\n"
".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x60]\n"
+ "ldr q17, [x9, #0x60]\n"
".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x70]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x80]\n"
+ "ldr q17, [x9, #0x80]\n"
".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x90]\n"
+ "ldr q16, [x9, #0x90]\n"
".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xa0]\n"
+ "ldr q17, [x9, #0xa0]\n"
".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xb0]\n"
+ "ldr q16, [x9, #0xb0]\n"
".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xc0]\n"
+ "ldr q17, [x9, #0xc0]\n"
".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xd0]\n"
+ "ldr q16, [x9, #0xd0]\n"
".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xe0]\n"
+ "ldr q17, [x9, #0xe0]\n"
".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x9, x9, #0x100\n"
"38:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 45f\n"
"cmp x27, #0x8\n"
"blt 40f\n"
"39:" // Height 2: Multiply loop: Odd block loop
- "ldr d19, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d16, [x25], #0x8\n"
+ "trn1 v18.2d, v17.2d, v16.2d\n"
"sub x27, x27, #0x8\n"
- "ldr q17, [x10, #0x0]\n"
- "ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x8\n"
- "trn1 v18.2d, v19.2d, v18.2d\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x20]\n"
".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x30]\n"
+ "ldr q17, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x40]\n"
".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q17, [x9, #0x40]\n"
+ "ldr q16, [x9, #0x50]\n"
".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x60]\n"
".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q17, [x9, #0x60]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "cmp x27, #0x8\n"
".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "add x9, x9, #0x80\n"
"bge 39b\n"
"40:" // Height 2: Multiply loop: Skip odd blocks
"cbz x27, 45f\n"
@@ -605,24 +605,24 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr b1, [x26, #0x0]\n"
"ldr b2, [x25, #0x0]\n"
"44:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q17, [x10, #0x0]\n"
- "ldr q16, [x10, #0x10]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
"trn1 v18.2d, v1.2d, v2.2d\n"
".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x30]\n"
+ "ldr q16, [x9, #0x30]\n"
".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x40]\n"
+ "ldr q17, [x9, #0x40]\n"
".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x50]\n"
+ "ldr q16, [x9, #0x50]\n"
".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
- "ldr q17, [x10, #0x60]\n"
+ "ldr q17, [x9, #0x60]\n"
".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
- "ldr q16, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q16, [x9, #0x70]\n"
".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "add x9, x9, #0x80\n"
"45:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -639,18 +639,18 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
+ "add x25, x11, x20\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "add x14, x14, #0x40\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"mov v15.16b, v17.16b\n"
+ "add v15.4s, v15.4s, v19.4s\n"
+ "add x14, x14, #0x40\n"
"add v12.4s, v12.4s, v18.4s\n"
- "add x26, x9, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
"add v13.4s, v13.4s, v5.4s\n"
- "add v8.4s, v8.4s, v19.4s\n"
- "add v15.4s, v15.4s, v19.4s\n"
"add v14.4s, v14.4s, v16.4s\n"
+ "add v8.4s, v8.4s, v19.4s\n"
"add v9.4s, v9.4s, v18.4s\n"
"add v10.4s, v10.4s, v5.4s\n"
"add v11.4s, v11.4s, v16.4s\n"
@@ -667,9 +667,9 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 47f\n"
"46:" // Height 2: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -696,11 +696,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v15.4s, v15.4s, v19.4s\n"
- "and v19.16b, v8.16b, v0.16b\n"
"sqadd v12.4s, v12.4s, v18.4s\n"
- "and v18.16b, v9.16b, v1.16b\n"
"sqadd v13.4s, v13.4s, v17.4s\n"
"sqadd v14.4s, v14.4s, v16.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
"and v17.16b, v10.16b, v2.16b\n"
"and v16.16b, v11.16b, v3.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
@@ -712,21 +712,21 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqadd v10.4s, v10.4s, v17.4s\n"
"sqadd v11.4s, v11.4s, v16.4s\n"
"48:" // Height 2: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
"srshl v15.4s, v15.4s, v0.4s\n"
"srshl v12.4s, v12.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v18.4s }, [x21]\n"
- "ld1r { v17.4s }, [x20]\n"
"srshl v13.4s, v13.4s, v2.4s\n"
"srshl v14.4s, v14.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
"ld1r { v16.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
"add v15.4s, v15.4s, v18.4s\n"
"add v12.4s, v12.4s, v18.4s\n"
"add v13.4s, v13.4s, v18.4s\n"
@@ -758,81 +758,81 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp1 v15.16b, v15.16b, v17.16b\n"
"uzp1 v8.16b, v8.16b, v16.16b\n"
"bge 57f\n"
- "tbz x11, #3, 52f\n"
- "str d15, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "tbz x11, #2, 50f\n"
- "st1 { v15.s }[2], [x9], #0x4\n"
- "st1 { v8.s }[2], [x26], #0x4\n"
- "tbz x11, #1, 49f\n"
- "st1 { v15.h }[6], [x9], #0x2\n"
- "st1 { v8.h }[6], [x26], #0x2\n"
- "tbz x11, #0, 56f\n"
- "st1 { v15.b }[14], [x9]\n"
- "st1 { v8.b }[14], [x26]\n"
+ "tbz x10, #3, 52f\n"
+ "str d15, [x11], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "tbz x10, #2, 50f\n"
+ "st1 { v15.s }[2], [x11], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "tbz x10, #1, 49f\n"
+ "st1 { v15.h }[6], [x11], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[14], [x11]\n"
+ "st1 { v8.b }[14], [x25]\n"
"b 56f\n"
"49:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x11, #0, 56f\n"
- "st1 { v15.b }[12], [x9]\n"
- "st1 { v8.b }[12], [x26]\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[12], [x11]\n"
+ "st1 { v8.b }[12], [x25]\n"
"b 56f\n"
"50:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x11, #1, 51f\n"
- "st1 { v15.h }[4], [x9], #0x2\n"
- "st1 { v8.h }[4], [x26], #0x2\n"
- "tbz x11, #0, 56f\n"
- "st1 { v15.b }[10], [x9]\n"
- "st1 { v8.b }[10], [x26]\n"
+ "tbz x10, #1, 51f\n"
+ "st1 { v15.h }[4], [x11], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[10], [x11]\n"
+ "st1 { v8.b }[10], [x25]\n"
"b 56f\n"
"51:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x11, #0, 56f\n"
- "st1 { v15.b }[8], [x9]\n"
- "st1 { v8.b }[8], [x26]\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[8], [x11]\n"
+ "st1 { v8.b }[8], [x25]\n"
"b 56f\n"
"52:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x11, #2, 54f\n"
- "str s15, [x9], #0x4\n"
- "str s8, [x26], #0x4\n"
- "tbz x11, #1, 53f\n"
- "st1 { v15.h }[2], [x9], #0x2\n"
- "st1 { v8.h }[2], [x26], #0x2\n"
- "tbz x11, #0, 56f\n"
- "st1 { v15.b }[6], [x9]\n"
- "st1 { v8.b }[6], [x26]\n"
+ "tbz x10, #2, 54f\n"
+ "str s15, [x11], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "tbz x10, #1, 53f\n"
+ "st1 { v15.h }[2], [x11], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[6], [x11]\n"
+ "st1 { v8.b }[6], [x25]\n"
"b 56f\n"
"53:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x11, #0, 56f\n"
- "st1 { v15.b }[4], [x9]\n"
- "st1 { v8.b }[4], [x26]\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[4], [x11]\n"
+ "st1 { v8.b }[4], [x25]\n"
"b 56f\n"
"54:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x11, #1, 55f\n"
- "str h15, [x9], #0x2\n"
- "str h8, [x26], #0x2\n"
- "tbz x11, #0, 56f\n"
- "st1 { v15.b }[2], [x9]\n"
- "st1 { v8.b }[2], [x26]\n"
+ "tbz x10, #1, 55f\n"
+ "str h15, [x11], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[2], [x11]\n"
+ "st1 { v8.b }[2], [x25]\n"
"b 56f\n"
"55:" // Height 2: Partial direct writeback: partial_1_0
- "str b15, [x9, #0x0]\n"
- "str b8, [x26, #0x0]\n"
+ "str b15, [x11, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
"56:" // Height 2: Partial direct writeback: Done
"b 58f\n"
"57:" // Height 2: Full writeback
- "str q15, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q8, [x26, #0x0]\n"
+ "str q15, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q8, [x25, #0x0]\n"
"58:" // Height 2: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 31b\n"
"b 176f\n"
"59:" // Height 3
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"60:" // Height 3: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -854,8 +854,8 @@ void a64_hybrid_s8qs_mmla_6x16 (
"mov x28, #0x0\n"
"62:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 63f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -879,130 +879,130 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr q2, [x25, #0x0]\n"
"cmp x27, #0x20\n"
"ldr q3, [x24, #0x0]\n"
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
"blt 66f\n"
"65:" // Height 3: Multiply loop: Main loop head
"trn1 v27.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "trn1 v26.2d, v3.2d, v24.2d\n"
".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
- ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "trn2 v3.2d, v3.2d, v24.2d\n"
+ "trn1 v26.2d, v3.2d, v28.2d\n"
".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
- "ldr q24, [x10, #0x30]\n"
+ "ldr q24, [x9, #0x30]\n"
".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ "trn2 v3.2d, v3.2d, v28.2d\n"
".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x40]\n"
+ "ldr q25, [x9, #0x40]\n"
".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x50]\n"
+ "ldr q24, [x9, #0x50]\n"
".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x60]\n"
+ "ldr q25, [x9, #0x60]\n"
".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x70]\n"
+ "ldr q24, [x9, #0x70]\n"
".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x80]\n"
+ "ldr q25, [x9, #0x80]\n"
".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x90]\n"
+ "ldr q24, [x9, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xa0]\n"
+ "ldr q25, [x9, #0xa0]\n"
".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xb0]\n"
+ "ldr q24, [x9, #0xb0]\n"
".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xc0]\n"
+ "ldr q25, [x9, #0xc0]\n"
".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xd0]\n"
+ "ldr q24, [x9, #0xd0]\n"
".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xe0]\n"
+ "ldr q25, [x9, #0xe0]\n"
".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
- "ldr q7, [x10, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
"ldr q1, [x26, #0x0]\n"
".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n"
"ldr q3, [x24, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
+ "ldr q6, [x9, #0x10]\n"
"bge 65b\n"
"66:" // Height 3: Multiply loop: Single iteration only
"trn1 v27.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "trn1 v26.2d, v3.2d, v24.2d\n"
".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
- ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
- "trn2 v3.2d, v3.2d, v24.2d\n"
+ "trn1 v26.2d, v3.2d, v25.2d\n"
".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
- "ldr q24, [x10, #0x20]\n"
+ "ldr q24, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
- "ldr q4, [x10, #0x30]\n"
+ "ldr q0, [x9, #0x30]\n"
".inst 0x4e98a769 // smmla v9.4s, v27.16b, v24.16b\n"
+ "trn2 v3.2d, v3.2d, v25.2d\n"
".inst 0x4e98a751 // smmla v17.4s, v26.16b, v24.16b\n"
- "ldr q25, [x10, #0x40]\n"
- ".inst 0x4e84a76d // smmla v13.4s, v27.16b, v4.16b\n"
- ".inst 0x4e84a755 // smmla v21.4s, v26.16b, v4.16b\n"
- "ldr q24, [x10, #0x50]\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e80a76d // smmla v13.4s, v27.16b, v0.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4e80a755 // smmla v21.4s, v26.16b, v0.16b\n"
+ "ldr q24, [x9, #0x50]\n"
".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x60]\n"
+ "ldr q25, [x9, #0x60]\n"
".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x70]\n"
+ "ldr q24, [x9, #0x70]\n"
".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x80]\n"
+ "ldr q25, [x9, #0x80]\n"
".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x90]\n"
+ "ldr q24, [x9, #0x90]\n"
".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xa0]\n"
+ "ldr q25, [x9, #0xa0]\n"
".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xb0]\n"
+ "ldr q24, [x9, #0xb0]\n"
".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xc0]\n"
+ "ldr q25, [x9, #0xc0]\n"
".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xd0]\n"
+ "ldr q24, [x9, #0xd0]\n"
".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xe0]\n"
+ "ldr q25, [x9, #0xe0]\n"
".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
@@ -1012,35 +1012,35 @@ void a64_hybrid_s8qs_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 69f\n"
"68:" // Height 3: Multiply loop: Odd block loop
- "ldr d29, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "sub x27, x27, #0x8\n"
- "ldr d26, [x24], #0x8\n"
- "ldr q25, [x10, #0x0]\n"
- "cmp x27, #0x8\n"
- "ldr q24, [x10, #0x10]\n"
- "trn1 v27.2d, v29.2d, v27.2d\n"
- "trn1 v26.2d, v26.2d, v28.2d\n"
+ "ldr d25, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "trn1 v27.2d, v25.2d, v24.2d\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr q25, [x9, #0x0]\n"
+ "trn1 v26.2d, v24.2d, v26.2d\n"
".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
+ "ldr q24, [x9, #0x10]\n"
".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n"
".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x30]\n"
+ "ldr q24, [x9, #0x30]\n"
".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ "sub x27, x27, #0x8\n"
".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x40]\n"
+ "ldr q25, [x9, #0x40]\n"
".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ "cmp x27, #0x8\n"
".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x50]\n"
+ "ldr q24, [x9, #0x50]\n"
".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x60]\n"
+ "ldr q25, [x9, #0x60]\n"
".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q24, [x9, #0x70]\n"
".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ "add x9, x9, #0x80\n"
".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
@@ -1081,29 +1081,29 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr b2, [x25, #0x0]\n"
"ldr b3, [x24, #0x0]\n"
"73:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q25, [x10, #0x0]\n"
- "ldr q28, [x10, #0x10]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q28, [x9, #0x10]\n"
"trn1 v27.2d, v1.2d, v2.2d\n"
"trn1 v26.2d, v3.2d, v24.2d\n"
".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
- ".inst 0x4e9ca76c // smmla v12.4s, v27.16b, v28.16b\n"
".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e9ca76c // smmla v12.4s, v27.16b, v28.16b\n"
".inst 0x4e9ca754 // smmla v20.4s, v26.16b, v28.16b\n"
- "ldr q24, [x10, #0x30]\n"
+ "ldr q24, [x9, #0x30]\n"
".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x40]\n"
+ "ldr q25, [x9, #0x40]\n"
".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x50]\n"
+ "ldr q24, [x9, #0x50]\n"
".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x60]\n"
+ "ldr q25, [x9, #0x60]\n"
".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q24, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
@@ -1124,23 +1124,23 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
+ "add x25, x11, x20\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "add x14, x14, #0x40\n"
+ "add x24, x25, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"uzp1 v16.2d, v16.2d, v20.2d\n"
"uzp1 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v18.2d, v18.2d, v22.2d\n"
"uzp1 v19.2d, v19.2d, v23.2d\n"
- "add x26, x9, x20\n"
+ "add x14, x14, #0x40\n"
"mov v23.16b, v26.16b\n"
- "add x25, x26, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "add v23.4s, v23.4s, v28.4s\n"
"add v12.4s, v12.4s, v27.4s\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"add v13.4s, v13.4s, v25.4s\n"
"add v14.4s, v14.4s, v24.4s\n"
- "add v23.4s, v23.4s, v28.4s\n"
"add v8.4s, v8.4s, v28.4s\n"
"add v9.4s, v9.4s, v27.4s\n"
"add v10.4s, v10.4s, v25.4s\n"
@@ -1162,9 +1162,9 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 76f\n"
"75:" // Height 3: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -1195,11 +1195,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sqadd v23.4s, v23.4s, v24.4s\n"
- "and v24.16b, v8.16b, v0.16b\n"
"sqadd v12.4s, v12.4s, v22.4s\n"
- "and v22.16b, v9.16b, v1.16b\n"
"sqadd v13.4s, v13.4s, v21.4s\n"
"sqadd v14.4s, v14.4s, v20.4s\n"
+ "and v24.16b, v8.16b, v0.16b\n"
+ "and v22.16b, v9.16b, v1.16b\n"
"and v21.16b, v10.16b, v2.16b\n"
"and v20.16b, v11.16b, v3.16b\n"
"sshr v24.4s, v24.4s, #0x1f\n"
@@ -1207,11 +1207,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v24.4s\n"
- "and v24.16b, v16.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v22.4s\n"
- "and v22.16b, v17.16b, v1.16b\n"
"sqadd v10.4s, v10.4s, v21.4s\n"
"sqadd v11.4s, v11.4s, v20.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v1.16b\n"
"and v21.16b, v18.16b, v2.16b\n"
"and v20.16b, v19.16b, v3.16b\n"
"sshr v24.4s, v24.4s, #0x1f\n"
@@ -1223,21 +1223,21 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqadd v18.4s, v18.4s, v21.4s\n"
"sqadd v19.4s, v19.4s, v20.4s\n"
"77:" // Height 3: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v12.4s, v12.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v22.4s }, [x21]\n"
- "ld1r { v21.4s }, [x20]\n"
"srshl v13.4s, v13.4s, v2.4s\n"
"srshl v14.4s, v14.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
"ld1r { v20.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v1.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
@@ -1288,97 +1288,97 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp1 v8.16b, v8.16b, v20.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 86f\n"
- "tbz x11, #3, 81f\n"
- "str d23, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d16, [x25], #0x8\n"
- "tbz x11, #2, 79f\n"
- "st1 { v23.s }[2], [x9], #0x4\n"
- "st1 { v8.s }[2], [x26], #0x4\n"
- "st1 { v16.s }[2], [x25], #0x4\n"
- "tbz x11, #1, 78f\n"
- "st1 { v23.h }[6], [x9], #0x2\n"
- "st1 { v8.h }[6], [x26], #0x2\n"
- "st1 { v16.h }[6], [x25], #0x2\n"
- "tbz x11, #0, 85f\n"
- "st1 { v23.b }[14], [x9]\n"
- "st1 { v8.b }[14], [x26]\n"
- "st1 { v16.b }[14], [x25]\n"
+ "tbz x10, #3, 81f\n"
+ "str d23, [x11], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x10, #2, 79f\n"
+ "st1 { v23.s }[2], [x11], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "tbz x10, #1, 78f\n"
+ "st1 { v23.h }[6], [x11], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[14], [x11]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
"b 85f\n"
"78:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x11, #0, 85f\n"
- "st1 { v23.b }[12], [x9]\n"
- "st1 { v8.b }[12], [x26]\n"
- "st1 { v16.b }[12], [x25]\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[12], [x11]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
"b 85f\n"
"79:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x11, #1, 80f\n"
- "st1 { v23.h }[4], [x9], #0x2\n"
- "st1 { v8.h }[4], [x26], #0x2\n"
- "st1 { v16.h }[4], [x25], #0x2\n"
- "tbz x11, #0, 85f\n"
- "st1 { v23.b }[10], [x9]\n"
- "st1 { v8.b }[10], [x26]\n"
- "st1 { v16.b }[10], [x25]\n"
+ "tbz x10, #1, 80f\n"
+ "st1 { v23.h }[4], [x11], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[10], [x11]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
"b 85f\n"
"80:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x11, #0, 85f\n"
- "st1 { v23.b }[8], [x9]\n"
- "st1 { v8.b }[8], [x26]\n"
- "st1 { v16.b }[8], [x25]\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[8], [x11]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
"b 85f\n"
"81:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x11, #2, 83f\n"
- "str s23, [x9], #0x4\n"
- "str s8, [x26], #0x4\n"
- "str s16, [x25], #0x4\n"
- "tbz x11, #1, 82f\n"
- "st1 { v23.h }[2], [x9], #0x2\n"
- "st1 { v8.h }[2], [x26], #0x2\n"
- "st1 { v16.h }[2], [x25], #0x2\n"
- "tbz x11, #0, 85f\n"
- "st1 { v23.b }[6], [x9]\n"
- "st1 { v8.b }[6], [x26]\n"
- "st1 { v16.b }[6], [x25]\n"
+ "tbz x10, #2, 83f\n"
+ "str s23, [x11], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "tbz x10, #1, 82f\n"
+ "st1 { v23.h }[2], [x11], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[6], [x11]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
"b 85f\n"
"82:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x11, #0, 85f\n"
- "st1 { v23.b }[4], [x9]\n"
- "st1 { v8.b }[4], [x26]\n"
- "st1 { v16.b }[4], [x25]\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[4], [x11]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
"b 85f\n"
"83:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x11, #1, 84f\n"
- "str h23, [x9], #0x2\n"
- "str h8, [x26], #0x2\n"
- "str h16, [x25], #0x2\n"
- "tbz x11, #0, 85f\n"
- "st1 { v23.b }[2], [x9]\n"
- "st1 { v8.b }[2], [x26]\n"
- "st1 { v16.b }[2], [x25]\n"
+ "tbz x10, #1, 84f\n"
+ "str h23, [x11], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[2], [x11]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
"b 85f\n"
"84:" // Height 3: Partial direct writeback: partial_1_0
- "str b23, [x9, #0x0]\n"
- "str b8, [x26, #0x0]\n"
- "str b16, [x25, #0x0]\n"
+ "str b23, [x11, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
"85:" // Height 3: Partial direct writeback: Done
"b 87f\n"
"86:" // Height 3: Full writeback
- "str q23, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q8, [x26, #0x0]\n"
- "str q16, [x25, #0x0]\n"
+ "str q23, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q8, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
"87:" // Height 3: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 60b\n"
"b 176f\n"
"88:" // Height 4
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"89:" // Height 4: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -1400,8 +1400,8 @@ void a64_hybrid_s8qs_mmla_6x16 (
"mov x28, #0x0\n"
"91:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 92f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1429,135 +1429,135 @@ void a64_hybrid_s8qs_mmla_6x16 (
"cmp x27, #0x20\n"
"ldr q3, [x24, #0x0]\n"
"ldr q4, [x23, #0x0]\n"
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
"blt 95f\n"
"94:" // Height 4: Multiply loop: Main loop head
"trn1 v27.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
"trn1 v26.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "ldr q4, [x23, #0x0]\n"
- ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
- ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
- "ldr q24, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q24, [x9, #0x30]\n"
".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x40]\n"
+ "ldr q25, [x9, #0x40]\n"
".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x50]\n"
+ "ldr q24, [x9, #0x50]\n"
".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x60]\n"
+ "ldr q25, [x9, #0x60]\n"
".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x70]\n"
+ "ldr q24, [x9, #0x70]\n"
".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ "add x23, x23, #0x10\n"
+ "ldr q4, [x23, #0x0]\n"
".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x80]\n"
+ "ldr q25, [x9, #0x80]\n"
".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x90]\n"
+ "ldr q24, [x9, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xa0]\n"
+ "ldr q25, [x9, #0xa0]\n"
".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xb0]\n"
+ "ldr q24, [x9, #0xb0]\n"
".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xc0]\n"
+ "ldr q25, [x9, #0xc0]\n"
".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xd0]\n"
+ "ldr q24, [x9, #0xd0]\n"
".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xe0]\n"
+ "ldr q25, [x9, #0xe0]\n"
".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
- "ldr q7, [x10, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
"ldr q1, [x26, #0x0]\n"
".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n"
"ldr q3, [x24, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
+ "ldr q6, [x9, #0x10]\n"
"bge 94b\n"
"95:" // Height 4: Multiply loop: Single iteration only
"trn1 v27.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"trn1 v26.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x27, x27, #0x10\n"
- ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
- ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
- "ldr q24, [x10, #0x30]\n"
+ "ldr q24, [x9, #0x30]\n"
".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x40]\n"
+ "ldr q25, [x9, #0x40]\n"
".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x50]\n"
+ "ldr q24, [x9, #0x50]\n"
".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x60]\n"
+ "ldr q25, [x9, #0x60]\n"
".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x70]\n"
+ "ldr q24, [x9, #0x70]\n"
".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x80]\n"
+ "ldr q25, [x9, #0x80]\n"
".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x90]\n"
+ "ldr q24, [x9, #0x90]\n"
".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xa0]\n"
+ "ldr q25, [x9, #0xa0]\n"
".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xb0]\n"
+ "ldr q24, [x9, #0xb0]\n"
".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xc0]\n"
+ "ldr q25, [x9, #0xc0]\n"
".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xd0]\n"
+ "ldr q24, [x9, #0xd0]\n"
".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xe0]\n"
+ "ldr q25, [x9, #0xe0]\n"
".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
- "ldr q24, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
@@ -1567,35 +1567,35 @@ void a64_hybrid_s8qs_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 98f\n"
"97:" // Height 4: Multiply loop: Odd block loop
- "ldr d29, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
+ "ldr d25, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "trn1 v27.2d, v25.2d, v24.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d28, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "trn1 v26.2d, v25.2d, v24.2d\n"
"cmp x27, #0x8\n"
- "ldr q25, [x10, #0x0]\n"
- "ldr q24, [x10, #0x10]\n"
- "trn1 v27.2d, v29.2d, v27.2d\n"
- "trn1 v26.2d, v28.2d, v26.2d\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n"
".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x30]\n"
+ "ldr q24, [x9, #0x30]\n"
".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x40]\n"
+ "ldr q25, [x9, #0x40]\n"
".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x50]\n"
+ "ldr q24, [x9, #0x50]\n"
".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x60]\n"
+ "ldr q25, [x9, #0x60]\n"
".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q24, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
@@ -1644,29 +1644,29 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr b3, [x24, #0x0]\n"
"ldr b4, [x23, #0x0]\n"
"102:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q25, [x10, #0x0]\n"
- "ldr q24, [x10, #0x10]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
"trn1 v27.2d, v1.2d, v2.2d\n"
"trn1 v26.2d, v3.2d, v4.2d\n"
".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n"
".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x30]\n"
+ "ldr q24, [x9, #0x30]\n"
".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x40]\n"
+ "ldr q25, [x9, #0x40]\n"
".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x50]\n"
+ "ldr q24, [x9, #0x50]\n"
".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
- "ldr q25, [x10, #0x60]\n"
+ "ldr q25, [x9, #0x60]\n"
".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
- "ldr q24, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q24, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
@@ -1687,29 +1687,29 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
+ "add x25, x11, x20\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "add x14, x14, #0x40\n"
+ "add x24, x25, x20\n"
+ "add x23, x24, x20\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "add x26, x9, x20\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x25, x26, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "add x14, x14, #0x40\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "add x24, x25, x20\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"mov v23.16b, v26.16b\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "add v23.4s, v23.4s, v28.4s\n"
"add v12.4s, v12.4s, v27.4s\n"
"add v13.4s, v13.4s, v25.4s\n"
"add v14.4s, v14.4s, v24.4s\n"
- "add v23.4s, v23.4s, v28.4s\n"
"add v8.4s, v8.4s, v28.4s\n"
"add v9.4s, v9.4s, v27.4s\n"
"add v10.4s, v10.4s, v25.4s\n"
@@ -1735,9 +1735,9 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 105f\n"
"104:" // Height 4: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -1772,11 +1772,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v25.4s, v25.4s, #0x1f\n"
"sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v23.4s, v23.4s, v27.4s\n"
- "and v27.16b, v8.16b, v0.16b\n"
"sqadd v12.4s, v12.4s, v26.4s\n"
- "and v26.16b, v9.16b, v1.16b\n"
"sqadd v13.4s, v13.4s, v25.4s\n"
"sqadd v14.4s, v14.4s, v24.4s\n"
+ "and v27.16b, v8.16b, v0.16b\n"
+ "and v26.16b, v9.16b, v1.16b\n"
"and v25.16b, v10.16b, v2.16b\n"
"and v24.16b, v11.16b, v3.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
@@ -1784,11 +1784,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v25.4s, v25.4s, #0x1f\n"
"sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v27.4s\n"
- "and v27.16b, v15.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v26.4s\n"
- "and v26.16b, v20.16b, v1.16b\n"
"sqadd v10.4s, v10.4s, v25.4s\n"
"sqadd v11.4s, v11.4s, v24.4s\n"
+ "and v27.16b, v15.16b, v0.16b\n"
+ "and v26.16b, v20.16b, v1.16b\n"
"and v25.16b, v21.16b, v2.16b\n"
"and v24.16b, v22.16b, v3.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
@@ -1796,11 +1796,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v25.4s, v25.4s, #0x1f\n"
"sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v15.4s, v15.4s, v27.4s\n"
- "and v27.16b, v16.16b, v0.16b\n"
"sqadd v20.4s, v20.4s, v26.4s\n"
- "and v26.16b, v17.16b, v1.16b\n"
"sqadd v21.4s, v21.4s, v25.4s\n"
"sqadd v22.4s, v22.4s, v24.4s\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v1.16b\n"
"and v25.16b, v18.16b, v2.16b\n"
"and v24.16b, v19.16b, v3.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
@@ -1812,21 +1812,21 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqadd v18.4s, v18.4s, v25.4s\n"
"sqadd v19.4s, v19.4s, v24.4s\n"
"106:" // Height 4: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v12.4s, v12.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x21]\n"
- "ld1r { v25.4s }, [x20]\n"
"srshl v13.4s, v13.4s, v2.4s\n"
"srshl v14.4s, v14.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
"ld1r { v24.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
"srshl v15.4s, v15.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v1.4s\n"
"srshl v21.4s, v21.4s, v2.4s\n"
@@ -1896,113 +1896,113 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp1 v15.16b, v15.16b, v20.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 115f\n"
- "tbz x11, #3, 110f\n"
- "str d23, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
- "tbz x11, #2, 108f\n"
- "st1 { v23.s }[2], [x9], #0x4\n"
- "st1 { v8.s }[2], [x26], #0x4\n"
- "st1 { v15.s }[2], [x25], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
- "tbz x11, #1, 107f\n"
- "st1 { v23.h }[6], [x9], #0x2\n"
- "st1 { v8.h }[6], [x26], #0x2\n"
- "st1 { v15.h }[6], [x25], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
- "tbz x11, #0, 114f\n"
- "st1 { v23.b }[14], [x9]\n"
- "st1 { v8.b }[14], [x26]\n"
- "st1 { v15.b }[14], [x25]\n"
- "st1 { v16.b }[14], [x24]\n"
+ "tbz x10, #3, 110f\n"
+ "str d23, [x11], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x10, #2, 108f\n"
+ "st1 { v23.s }[2], [x11], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "tbz x10, #1, 107f\n"
+ "st1 { v23.h }[6], [x11], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v15.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[14], [x11]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v15.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
"b 114f\n"
"107:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x11, #0, 114f\n"
- "st1 { v23.b }[12], [x9]\n"
- "st1 { v8.b }[12], [x26]\n"
- "st1 { v15.b }[12], [x25]\n"
- "st1 { v16.b }[12], [x24]\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[12], [x11]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v15.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
"b 114f\n"
"108:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x11, #1, 109f\n"
- "st1 { v23.h }[4], [x9], #0x2\n"
- "st1 { v8.h }[4], [x26], #0x2\n"
- "st1 { v15.h }[4], [x25], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
- "tbz x11, #0, 114f\n"
- "st1 { v23.b }[10], [x9]\n"
- "st1 { v8.b }[10], [x26]\n"
- "st1 { v15.b }[10], [x25]\n"
- "st1 { v16.b }[10], [x24]\n"
+ "tbz x10, #1, 109f\n"
+ "st1 { v23.h }[4], [x11], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v15.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[10], [x11]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v15.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
"b 114f\n"
"109:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x11, #0, 114f\n"
- "st1 { v23.b }[8], [x9]\n"
- "st1 { v8.b }[8], [x26]\n"
- "st1 { v15.b }[8], [x25]\n"
- "st1 { v16.b }[8], [x24]\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[8], [x11]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v15.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
"b 114f\n"
"110:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x11, #2, 112f\n"
- "str s23, [x9], #0x4\n"
- "str s8, [x26], #0x4\n"
- "str s15, [x25], #0x4\n"
- "str s16, [x24], #0x4\n"
- "tbz x11, #1, 111f\n"
- "st1 { v23.h }[2], [x9], #0x2\n"
- "st1 { v8.h }[2], [x26], #0x2\n"
- "st1 { v15.h }[2], [x25], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
- "tbz x11, #0, 114f\n"
- "st1 { v23.b }[6], [x9]\n"
- "st1 { v8.b }[6], [x26]\n"
- "st1 { v15.b }[6], [x25]\n"
- "st1 { v16.b }[6], [x24]\n"
+ "tbz x10, #2, 112f\n"
+ "str s23, [x11], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "tbz x10, #1, 111f\n"
+ "st1 { v23.h }[2], [x11], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v15.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[6], [x11]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v15.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
"b 114f\n"
"111:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x11, #0, 114f\n"
- "st1 { v23.b }[4], [x9]\n"
- "st1 { v8.b }[4], [x26]\n"
- "st1 { v15.b }[4], [x25]\n"
- "st1 { v16.b }[4], [x24]\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[4], [x11]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v15.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
"b 114f\n"
"112:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x11, #1, 113f\n"
- "str h23, [x9], #0x2\n"
- "str h8, [x26], #0x2\n"
- "str h15, [x25], #0x2\n"
- "str h16, [x24], #0x2\n"
- "tbz x11, #0, 114f\n"
- "st1 { v23.b }[2], [x9]\n"
- "st1 { v8.b }[2], [x26]\n"
- "st1 { v15.b }[2], [x25]\n"
- "st1 { v16.b }[2], [x24]\n"
+ "tbz x10, #1, 113f\n"
+ "str h23, [x11], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h15, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[2], [x11]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v15.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
"b 114f\n"
"113:" // Height 4: Partial direct writeback: partial_1_0
- "str b23, [x9, #0x0]\n"
- "str b8, [x26, #0x0]\n"
- "str b15, [x25, #0x0]\n"
- "str b16, [x24, #0x0]\n"
+ "str b23, [x11, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b15, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
"114:" // Height 4: Partial direct writeback: Done
"b 116f\n"
"115:" // Height 4: Full writeback
- "str q23, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q8, [x26, #0x0]\n"
- "str q15, [x25, #0x0]\n"
- "str q16, [x24, #0x0]\n"
+ "str q23, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q8, [x25, #0x0]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
"116:" // Height 4: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 89b\n"
"b 176f\n"
"117:" // Height 5
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"118:" // Height 5: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -2032,8 +2032,8 @@ void a64_hybrid_s8qs_mmla_6x16 (
"mov x28, #0x0\n"
"120:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 121f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2065,91 +2065,91 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr q3, [x24, #0x0]\n"
"ldr q4, [x23, #0x0]\n"
"ldr q5, [x22, #0x0]\n"
- "ldr q7, [x10, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
"blt 124f\n"
"123:" // Height 5: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "add x22, x22, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr q6, [x9, #0x10]\n"
".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
+ "ldr q7, [x9, #0x20]\n"
".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
+ "ldr q6, [x9, #0x30]\n"
".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
+ "ldr q7, [x9, #0x40]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
+ "ldr q6, [x9, #0x50]\n"
".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
+ "ldr q7, [x9, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "ldr q6, [x9, #0x70]\n"
".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
+ "ldr q7, [x9, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
"ldr q2, [x25, #0x0]\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q0, [x10, #0x90]\n"
+ "ldr q0, [x9, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q6, [x10, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xb0]\n"
+ "ldr q0, [x9, #0xb0]\n"
".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n"
".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n"
".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xd0]\n"
+ "ldr q0, [x9, #0xd0]\n"
".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n"
".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n"
".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q0, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n"
".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n"
- "ldr q7, [x10, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
"ldr q1, [x26, #0x0]\n"
".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
@@ -2160,79 +2160,79 @@ void a64_hybrid_s8qs_mmla_6x16 (
"124:" // Height 5: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr q6, [x9, #0x10]\n"
".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
+ "ldr q7, [x9, #0x20]\n"
".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
+ "ldr q6, [x9, #0x30]\n"
".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
+ "ldr q7, [x9, #0x40]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x22, x22, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
+ "ldr q6, [x9, #0x50]\n"
".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
+ "ldr q7, [x9, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "ldr q6, [x9, #0x70]\n"
".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
+ "ldr q7, [x9, #0x80]\n"
".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q0, [x10, #0x90]\n"
+ "ldr q0, [x9, #0x90]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q2, [x10, #0xa0]\n"
+ "ldr q2, [x9, #0xa0]\n"
".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xb0]\n"
+ "ldr q0, [x9, #0xb0]\n"
".inst 0x4e82a429 // smmla v9.4s, v1.16b, v2.16b\n"
".inst 0x4e82a471 // smmla v17.4s, v3.16b, v2.16b\n"
".inst 0x4e82a4b9 // smmla v25.4s, v5.16b, v2.16b\n"
- "ldr q2, [x10, #0xc0]\n"
+ "ldr q2, [x9, #0xc0]\n"
".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xd0]\n"
+ "ldr q0, [x9, #0xd0]\n"
".inst 0x4e82a42a // smmla v10.4s, v1.16b, v2.16b\n"
".inst 0x4e82a472 // smmla v18.4s, v3.16b, v2.16b\n"
".inst 0x4e82a4ba // smmla v26.4s, v5.16b, v2.16b\n"
- "ldr q2, [x10, #0xe0]\n"
+ "ldr q2, [x9, #0xe0]\n"
".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q0, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4e82a42b // smmla v11.4s, v1.16b, v2.16b\n"
".inst 0x4e82a473 // smmla v19.4s, v3.16b, v2.16b\n"
".inst 0x4e82a4bb // smmla v27.4s, v5.16b, v2.16b\n"
@@ -2244,44 +2244,44 @@ void a64_hybrid_s8qs_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 127f\n"
"126:" // Height 5: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "cmp x27, #0x8\n"
"ldr d0, [x22], #0x8\n"
- "ldr q1, [x10, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v3.2d, v2.2d\n"
- "trn1 v2.2d, v0.2d, v5.2d\n"
- "ldr q0, [x10, #0x10]\n"
+ "ldr q1, [x9, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
+ "ldr q0, [x9, #0x10]\n"
".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x20]\n"
+ "ldr q1, [x9, #0x20]\n"
".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ "cmp x27, #0x8\n"
".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x30]\n"
+ "ldr q0, [x9, #0x30]\n"
".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x40]\n"
+ "ldr q1, [x9, #0x40]\n"
".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n"
".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x50]\n"
+ "ldr q0, [x9, #0x50]\n"
".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n"
".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x60]\n"
+ "ldr q1, [x9, #0x60]\n"
".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n"
".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q0, [x9, #0x70]\n"
".inst 0x4e81a48b // smmla v11.4s, v4.16b, v1.16b\n"
+ "add x9, x9, #0x80\n"
".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n"
".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n"
".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n"
@@ -2338,36 +2338,36 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr b4, [x23, #0x0]\n"
"ldr b5, [x22, #0x0]\n"
"131:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
"trn1 v6.2d, v1.2d, v2.2d\n"
"trn1 v3.2d, v3.2d, v4.2d\n"
"trn1 v2.2d, v5.2d, v0.2d\n"
- "ldr q0, [x10, #0x10]\n"
+ "ldr q0, [x9, #0x10]\n"
".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
- "ldr q1, [x10, #0x20]\n"
+ "ldr q1, [x9, #0x20]\n"
".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n"
".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x30]\n"
+ "ldr q0, [x9, #0x30]\n"
".inst 0x4e81a4c9 // smmla v9.4s, v6.16b, v1.16b\n"
".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x40]\n"
+ "ldr q1, [x9, #0x40]\n"
".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n"
".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x50]\n"
+ "ldr q0, [x9, #0x50]\n"
".inst 0x4e81a4ca // smmla v10.4s, v6.16b, v1.16b\n"
".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x60]\n"
+ "ldr q1, [x9, #0x60]\n"
".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n"
".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q0, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
".inst 0x4e81a4cb // smmla v11.4s, v6.16b, v1.16b\n"
".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n"
".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n"
@@ -2388,37 +2388,37 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "add x14, x14, #0x40\n"
+ "add x24, x25, x20\n"
+ "add x23, x24, x20\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "add x26, x9, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "add x25, x26, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x24, x25, x20\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "add x23, x24, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "add x14, x14, #0x40\n"
"uzp1 v24.2d, v24.2d, v28.2d\n"
"uzp1 v25.2d, v25.2d, v29.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v26.2d, v26.2d, v30.2d\n"
"uzp1 v27.2d, v27.2d, v31.2d\n"
"mov v31.16b, v2.16b\n"
+ "add v31.4s, v31.4s, v4.4s\n"
"add v12.4s, v12.4s, v3.4s\n"
"add v13.4s, v13.4s, v1.4s\n"
"add v14.4s, v14.4s, v0.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
"add v8.4s, v8.4s, v4.4s\n"
"add v9.4s, v9.4s, v3.4s\n"
"add v10.4s, v10.4s, v1.4s\n"
@@ -2448,9 +2448,9 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 134f\n"
"133:" // Height 5: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -2489,11 +2489,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v23.4s, v23.4s, #0x1f\n"
"sqadd v31.4s, v31.4s, v30.4s\n"
- "and v30.16b, v8.16b, v0.16b\n"
"sqadd v12.4s, v12.4s, v29.4s\n"
- "and v29.16b, v9.16b, v1.16b\n"
"sqadd v13.4s, v13.4s, v28.4s\n"
"sqadd v14.4s, v14.4s, v23.4s\n"
+ "and v30.16b, v8.16b, v0.16b\n"
+ "and v29.16b, v9.16b, v1.16b\n"
"and v28.16b, v10.16b, v2.16b\n"
"and v23.16b, v11.16b, v3.16b\n"
"sshr v30.4s, v30.4s, #0x1f\n"
@@ -2501,11 +2501,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v23.4s, v23.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v30.4s\n"
- "and v30.16b, v15.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v29.4s\n"
- "and v29.16b, v20.16b, v1.16b\n"
"sqadd v10.4s, v10.4s, v28.4s\n"
"sqadd v11.4s, v11.4s, v23.4s\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "and v29.16b, v20.16b, v1.16b\n"
"and v28.16b, v21.16b, v2.16b\n"
"and v23.16b, v22.16b, v3.16b\n"
"sshr v30.4s, v30.4s, #0x1f\n"
@@ -2513,11 +2513,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v23.4s, v23.4s, #0x1f\n"
"sqadd v15.4s, v15.4s, v30.4s\n"
- "and v30.16b, v16.16b, v0.16b\n"
"sqadd v20.4s, v20.4s, v29.4s\n"
- "and v29.16b, v17.16b, v1.16b\n"
"sqadd v21.4s, v21.4s, v28.4s\n"
"sqadd v22.4s, v22.4s, v23.4s\n"
+ "and v30.16b, v16.16b, v0.16b\n"
+ "and v29.16b, v17.16b, v1.16b\n"
"and v28.16b, v18.16b, v2.16b\n"
"and v23.16b, v19.16b, v3.16b\n"
"sshr v30.4s, v30.4s, #0x1f\n"
@@ -2525,11 +2525,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v23.4s, v23.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v30.4s\n"
- "and v30.16b, v24.16b, v0.16b\n"
"sqadd v17.4s, v17.4s, v29.4s\n"
- "and v29.16b, v25.16b, v1.16b\n"
"sqadd v18.4s, v18.4s, v28.4s\n"
"sqadd v19.4s, v19.4s, v23.4s\n"
+ "and v30.16b, v24.16b, v0.16b\n"
+ "and v29.16b, v25.16b, v1.16b\n"
"and v28.16b, v26.16b, v2.16b\n"
"and v23.16b, v27.16b, v3.16b\n"
"sshr v30.4s, v30.4s, #0x1f\n"
@@ -2541,21 +2541,21 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqadd v26.4s, v26.4s, v28.4s\n"
"sqadd v27.4s, v27.4s, v23.4s\n"
"135:" // Height 5: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v12.4s, v12.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v29.4s }, [x21]\n"
- "ld1r { v28.4s }, [x20]\n"
"srshl v13.4s, v13.4s, v2.4s\n"
"srshl v14.4s, v14.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
"ld1r { v23.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
"srshl v15.4s, v15.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v1.4s\n"
"srshl v21.4s, v21.4s, v2.4s\n"
@@ -2644,133 +2644,132 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp1 v16.16b, v16.16b, v18.16b\n"
"uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 144f\n"
- "tbz x11, #3, 139f\n"
- "str d31, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x11, #2, 137f\n"
- "st1 { v31.s }[2], [x9], #0x4\n"
- "st1 { v8.s }[2], [x26], #0x4\n"
- "st1 { v15.s }[2], [x25], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x11, #1, 136f\n"
- "st1 { v31.h }[6], [x9], #0x2\n"
- "st1 { v8.h }[6], [x26], #0x2\n"
- "st1 { v15.h }[6], [x25], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "tbz x11, #0, 143f\n"
- "st1 { v31.b }[14], [x9]\n"
- "st1 { v8.b }[14], [x26]\n"
- "st1 { v15.b }[14], [x25]\n"
- "st1 { v16.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
+ "tbz x10, #3, 139f\n"
+ "str d31, [x11], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x10, #2, 137f\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x10, #1, 136f\n"
+ "st1 { v31.h }[6], [x11], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v15.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[14], [x11]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v15.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 143f\n"
"136:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x11, #0, 143f\n"
- "st1 { v31.b }[12], [x9]\n"
- "st1 { v8.b }[12], [x26]\n"
- "st1 { v15.b }[12], [x25]\n"
- "st1 { v16.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[12], [x11]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v15.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 143f\n"
"137:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x11, #1, 138f\n"
- "st1 { v31.h }[4], [x9], #0x2\n"
- "st1 { v8.h }[4], [x26], #0x2\n"
- "st1 { v15.h }[4], [x25], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "tbz x11, #0, 143f\n"
- "st1 { v31.b }[10], [x9]\n"
- "st1 { v8.b }[10], [x26]\n"
- "st1 { v15.b }[10], [x25]\n"
- "st1 { v16.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
+ "tbz x10, #1, 138f\n"
+ "st1 { v31.h }[4], [x11], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v15.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[10], [x11]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v15.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 143f\n"
"138:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x11, #0, 143f\n"
- "st1 { v31.b }[8], [x9]\n"
- "st1 { v8.b }[8], [x26]\n"
- "st1 { v15.b }[8], [x25]\n"
- "st1 { v16.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[8], [x11]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v15.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 143f\n"
"139:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x11, #2, 141f\n"
- "str s31, [x9], #0x4\n"
- "str s8, [x26], #0x4\n"
- "str s15, [x25], #0x4\n"
- "str s16, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x11, #1, 140f\n"
- "st1 { v31.h }[2], [x9], #0x2\n"
- "st1 { v8.h }[2], [x26], #0x2\n"
- "st1 { v15.h }[2], [x25], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "tbz x11, #0, 143f\n"
- "st1 { v31.b }[6], [x9]\n"
- "st1 { v8.b }[6], [x26]\n"
- "st1 { v15.b }[6], [x25]\n"
- "st1 { v16.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
+ "tbz x10, #2, 141f\n"
+ "str s31, [x11], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x10, #1, 140f\n"
+ "st1 { v31.h }[2], [x11], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v15.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[6], [x11]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v15.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 143f\n"
"140:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x11, #0, 143f\n"
- "st1 { v31.b }[4], [x9]\n"
- "st1 { v8.b }[4], [x26]\n"
- "st1 { v15.b }[4], [x25]\n"
- "st1 { v16.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[4], [x11]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v15.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 143f\n"
"141:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x11, #1, 142f\n"
- "str h31, [x9], #0x2\n"
- "str h8, [x26], #0x2\n"
- "str h15, [x25], #0x2\n"
- "str h16, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "tbz x11, #0, 143f\n"
- "st1 { v31.b }[2], [x9]\n"
- "st1 { v8.b }[2], [x26]\n"
- "st1 { v15.b }[2], [x25]\n"
- "st1 { v16.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
+ "tbz x10, #1, 142f\n"
+ "str h31, [x11], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h15, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[2], [x11]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v15.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 143f\n"
"142:" // Height 5: Partial direct writeback: partial_1_0
- "str b31, [x9, #0x0]\n"
- "str b8, [x26, #0x0]\n"
- "str b15, [x25, #0x0]\n"
- "str b16, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
+ "str b31, [x11, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b15, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"143:" // Height 5: Partial direct writeback: Done
"b 145f\n"
"144:" // Height 5: Full writeback
- "str q31, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q8, [x26, #0x0]\n"
- "str q15, [x25, #0x0]\n"
- "str q16, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
+ "str q31, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q8, [x25, #0x0]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"145:" // Height 5: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 118b\n"
"b 176f\n"
"146:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x6\n"
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"147:" // Height 6: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -2800,8 +2799,8 @@ void a64_hybrid_s8qs_mmla_6x16 (
"mov x28, #0x0\n"
"149:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 150f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2837,93 +2836,93 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr q4, [x23, #0x0]\n"
"ldr q5, [x22, #0x0]\n"
"ldr q6, [x21, #0x0]\n"
- "ldr q7, [x10, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
"blt 153f\n"
"152:" // Height 6: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x9, #0x10]\n"
".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
+ "ldr q7, [x9, #0x20]\n"
".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "ldr q6, [x9, #0x30]\n"
".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
+ "ldr q7, [x9, #0x40]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
+ "ldr q6, [x9, #0x50]\n"
".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
+ "ldr q7, [x9, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "ldr q6, [x9, #0x70]\n"
".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
+ "ldr q7, [x9, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
"ldr q2, [x25, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q0, [x10, #0x90]\n"
+ "ldr q0, [x9, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q6, [x10, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xb0]\n"
+ "ldr q0, [x9, #0xb0]\n"
".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n"
".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n"
".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xd0]\n"
+ "ldr q0, [x9, #0xd0]\n"
".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n"
".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n"
".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q0, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n"
".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n"
- "ldr q7, [x10, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
"ldr q1, [x26, #0x0]\n"
".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
@@ -2935,81 +2934,81 @@ void a64_hybrid_s8qs_mmla_6x16 (
"153:" // Height 6: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q6, [x9, #0x10]\n"
".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
+ "ldr q7, [x9, #0x20]\n"
".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "ldr q6, [x9, #0x30]\n"
".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
+ "ldr q7, [x9, #0x40]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
+ "ldr q6, [x9, #0x50]\n"
".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
+ "ldr q7, [x9, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "ldr q6, [x9, #0x70]\n"
".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
+ "ldr q7, [x9, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q0, [x10, #0x90]\n"
+ "ldr q0, [x9, #0x90]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q2, [x10, #0xa0]\n"
+ "ldr q2, [x9, #0xa0]\n"
".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xb0]\n"
+ "ldr q0, [x9, #0xb0]\n"
".inst 0x4e82a429 // smmla v9.4s, v1.16b, v2.16b\n"
".inst 0x4e82a471 // smmla v17.4s, v3.16b, v2.16b\n"
".inst 0x4e82a4b9 // smmla v25.4s, v5.16b, v2.16b\n"
- "ldr q2, [x10, #0xc0]\n"
+ "ldr q2, [x9, #0xc0]\n"
".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xd0]\n"
+ "ldr q0, [x9, #0xd0]\n"
".inst 0x4e82a42a // smmla v10.4s, v1.16b, v2.16b\n"
".inst 0x4e82a472 // smmla v18.4s, v3.16b, v2.16b\n"
".inst 0x4e82a4ba // smmla v26.4s, v5.16b, v2.16b\n"
- "ldr q2, [x10, #0xe0]\n"
+ "ldr q2, [x9, #0xe0]\n"
".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "ldr q0, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4e82a42b // smmla v11.4s, v1.16b, v2.16b\n"
".inst 0x4e82a473 // smmla v19.4s, v3.16b, v2.16b\n"
".inst 0x4e82a4bb // smmla v27.4s, v5.16b, v2.16b\n"
@@ -3021,44 +3020,44 @@ void a64_hybrid_s8qs_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 156f\n"
"155:" // Height 6: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d5, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"cmp x27, #0x8\n"
- "ldr d2, [x22], #0x8\n"
+ "ldr d1, [x22], #0x8\n"
"ldr d0, [x21], #0x8\n"
- "ldr q1, [x10, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v5.2d, v3.2d\n"
- "trn1 v2.2d, v2.2d, v0.2d\n"
- "ldr q0, [x10, #0x10]\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x9, #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x20]\n"
+ "ldr q1, [x9, #0x20]\n"
".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x30]\n"
+ "ldr q0, [x9, #0x30]\n"
".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x40]\n"
+ "ldr q1, [x9, #0x40]\n"
".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n"
".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x50]\n"
+ "ldr q0, [x9, #0x50]\n"
".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n"
".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x60]\n"
+ "ldr q1, [x9, #0x60]\n"
".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n"
".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q0, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
".inst 0x4e81a48b // smmla v11.4s, v4.16b, v1.16b\n"
".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n"
".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n"
@@ -3123,37 +3122,37 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr b5, [x22, #0x0]\n"
"ldr b6, [x21, #0x0]\n"
"160:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
"trn1 v2.2d, v1.2d, v2.2d\n"
"trn1 v4.2d, v3.2d, v4.2d\n"
- "trn1 v3.2d, v5.2d, v6.2d\n"
- "ldr q0, [x10, #0x10]\n"
".inst 0x4e87a448 // smmla v8.4s, v2.16b, v7.16b\n"
+ "trn1 v3.2d, v5.2d, v6.2d\n"
+ "ldr q0, [x9, #0x10]\n"
".inst 0x4e87a490 // smmla v16.4s, v4.16b, v7.16b\n"
".inst 0x4e87a478 // smmla v24.4s, v3.16b, v7.16b\n"
- "ldr q1, [x10, #0x20]\n"
+ "ldr q1, [x9, #0x20]\n"
".inst 0x4e80a44c // smmla v12.4s, v2.16b, v0.16b\n"
".inst 0x4e80a494 // smmla v20.4s, v4.16b, v0.16b\n"
".inst 0x4e80a47c // smmla v28.4s, v3.16b, v0.16b\n"
- "ldr q0, [x10, #0x30]\n"
+ "ldr q0, [x9, #0x30]\n"
".inst 0x4e81a449 // smmla v9.4s, v2.16b, v1.16b\n"
".inst 0x4e81a491 // smmla v17.4s, v4.16b, v1.16b\n"
".inst 0x4e81a479 // smmla v25.4s, v3.16b, v1.16b\n"
- "ldr q1, [x10, #0x40]\n"
+ "ldr q1, [x9, #0x40]\n"
".inst 0x4e80a44d // smmla v13.4s, v2.16b, v0.16b\n"
".inst 0x4e80a495 // smmla v21.4s, v4.16b, v0.16b\n"
".inst 0x4e80a47d // smmla v29.4s, v3.16b, v0.16b\n"
- "ldr q0, [x10, #0x50]\n"
+ "ldr q0, [x9, #0x50]\n"
".inst 0x4e81a44a // smmla v10.4s, v2.16b, v1.16b\n"
".inst 0x4e81a492 // smmla v18.4s, v4.16b, v1.16b\n"
".inst 0x4e81a47a // smmla v26.4s, v3.16b, v1.16b\n"
- "ldr q1, [x10, #0x60]\n"
+ "ldr q1, [x9, #0x60]\n"
".inst 0x4e80a44e // smmla v14.4s, v2.16b, v0.16b\n"
".inst 0x4e80a496 // smmla v22.4s, v4.16b, v0.16b\n"
".inst 0x4e80a47e // smmla v30.4s, v3.16b, v0.16b\n"
- "ldr q0, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "ldr q0, [x9, #0x70]\n"
".inst 0x4e81a44b // smmla v11.4s, v2.16b, v1.16b\n"
+ "add x9, x9, #0x80\n"
".inst 0x4e81a493 // smmla v19.4s, v4.16b, v1.16b\n"
".inst 0x4e81a47b // smmla v27.4s, v3.16b, v1.16b\n"
".inst 0x4e80a44f // smmla v15.4s, v2.16b, v0.16b\n"
@@ -3173,32 +3172,32 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
+ "add x23, x24, x20\n"
+ "add x22, x23, x20\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "add x14, x14, #0x40\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "add x26, x9, x20\n"
- "add x25, x26, x20\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "add x24, x25, x20\n"
- "prfm pstl1keep, [x26, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x23, x24, x20\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x14, x14, #0x40\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "add x22, x23, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v23.2d, v24.2d, v28.2d\n"
"uzp2 v24.2d, v24.2d, v28.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v28.2d, v25.2d, v29.2d\n"
"uzp2 v25.2d, v25.2d, v29.2d\n"
"uzp1 v29.2d, v26.2d, v30.2d\n"
@@ -3206,10 +3205,10 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp1 v30.2d, v27.2d, v31.2d\n"
"uzp2 v27.2d, v27.2d, v31.2d\n"
"mov v31.16b, v2.16b\n"
+ "add v31.4s, v31.4s, v4.4s\n"
"add v12.4s, v12.4s, v3.4s\n"
"add v13.4s, v13.4s, v1.4s\n"
"add v14.4s, v14.4s, v0.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
"add v8.4s, v8.4s, v4.4s\n"
"add v9.4s, v9.4s, v3.4s\n"
"add v10.4s, v10.4s, v1.4s\n"
@@ -3243,9 +3242,9 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 163f\n"
"162:" // Height 6: per layer parameters
- "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
@@ -3288,11 +3287,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v31.4s, v31.4s, v7.4s\n"
- "and v7.16b, v8.16b, v0.16b\n"
"sqadd v12.4s, v12.4s, v6.4s\n"
- "and v6.16b, v9.16b, v1.16b\n"
"sqadd v13.4s, v13.4s, v5.4s\n"
"sqadd v14.4s, v14.4s, v4.4s\n"
+ "and v7.16b, v8.16b, v0.16b\n"
+ "and v6.16b, v9.16b, v1.16b\n"
"and v5.16b, v10.16b, v2.16b\n"
"and v4.16b, v11.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
@@ -3300,11 +3299,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v7.4s\n"
- "and v7.16b, v15.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v6.4s\n"
- "and v6.16b, v20.16b, v1.16b\n"
"sqadd v10.4s, v10.4s, v5.4s\n"
"sqadd v11.4s, v11.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v0.16b\n"
+ "and v6.16b, v20.16b, v1.16b\n"
"and v5.16b, v21.16b, v2.16b\n"
"and v4.16b, v22.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
@@ -3312,11 +3311,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v15.4s, v15.4s, v7.4s\n"
- "and v7.16b, v16.16b, v0.16b\n"
"sqadd v20.4s, v20.4s, v6.4s\n"
- "and v6.16b, v17.16b, v1.16b\n"
"sqadd v21.4s, v21.4s, v5.4s\n"
"sqadd v22.4s, v22.4s, v4.4s\n"
+ "and v7.16b, v16.16b, v0.16b\n"
+ "and v6.16b, v17.16b, v1.16b\n"
"and v5.16b, v18.16b, v2.16b\n"
"and v4.16b, v19.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
@@ -3324,11 +3323,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v7.4s\n"
- "and v7.16b, v23.16b, v0.16b\n"
"sqadd v17.4s, v17.4s, v6.4s\n"
- "and v6.16b, v28.16b, v1.16b\n"
"sqadd v18.4s, v18.4s, v5.4s\n"
"sqadd v19.4s, v19.4s, v4.4s\n"
+ "and v7.16b, v23.16b, v0.16b\n"
+ "and v6.16b, v28.16b, v1.16b\n"
"and v5.16b, v29.16b, v2.16b\n"
"and v4.16b, v30.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
@@ -3336,11 +3335,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v23.4s, v23.4s, v7.4s\n"
- "and v7.16b, v24.16b, v0.16b\n"
"sqadd v28.4s, v28.4s, v6.4s\n"
- "and v6.16b, v25.16b, v1.16b\n"
"sqadd v29.4s, v29.4s, v5.4s\n"
"sqadd v30.4s, v30.4s, v4.4s\n"
+ "and v7.16b, v24.16b, v0.16b\n"
+ "and v6.16b, v25.16b, v1.16b\n"
"and v5.16b, v26.16b, v2.16b\n"
"and v4.16b, v27.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
@@ -3352,21 +3351,21 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqadd v26.4s, v26.4s, v5.4s\n"
"sqadd v27.4s, v27.4s, v4.4s\n"
"164:" // Height 6: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v6.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v12.4s, v12.4s, v1.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x21]\n"
- "ld1r { v5.4s }, [x20]\n"
"srshl v13.4s, v13.4s, v2.4s\n"
"srshl v14.4s, v14.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x11, #0x10\n"
"ld1r { v4.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
"srshl v15.4s, v15.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v1.4s\n"
"srshl v21.4s, v21.4s, v2.4s\n"
@@ -3474,136 +3473,136 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp1 v23.16b, v23.16b, v18.16b\n"
"uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 173f\n"
- "tbz x11, #3, 168f\n"
- "str d31, [x9], #0x8\n"
- "str d8, [x26], #0x8\n"
- "str d15, [x25], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
- "str d24, [x22], #0x8\n"
- "tbz x11, #2, 166f\n"
- "st1 { v31.s }[2], [x9], #0x4\n"
- "st1 { v8.s }[2], [x26], #0x4\n"
- "st1 { v15.s }[2], [x25], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
- "st1 { v23.s }[2], [x23], #0x4\n"
- "st1 { v24.s }[2], [x22], #0x4\n"
- "tbz x11, #1, 165f\n"
- "st1 { v31.h }[6], [x9], #0x2\n"
- "st1 { v8.h }[6], [x26], #0x2\n"
- "st1 { v15.h }[6], [x25], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
- "st1 { v23.h }[6], [x23], #0x2\n"
- "st1 { v24.h }[6], [x22], #0x2\n"
- "tbz x11, #0, 172f\n"
- "st1 { v31.b }[14], [x9]\n"
- "st1 { v8.b }[14], [x26]\n"
- "st1 { v15.b }[14], [x25]\n"
- "st1 { v16.b }[14], [x24]\n"
- "st1 { v23.b }[14], [x23]\n"
- "st1 { v24.b }[14], [x22]\n"
+ "tbz x10, #3, 168f\n"
+ "str d31, [x11], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x10, #2, 166f\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
+ "tbz x10, #1, 165f\n"
+ "st1 { v31.h }[6], [x11], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v15.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v23.h }[6], [x22], #0x2\n"
+ "st1 { v24.h }[6], [x21], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[14], [x11]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v15.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v23.b }[14], [x22]\n"
+ "st1 { v24.b }[14], [x21]\n"
"b 172f\n"
"165:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x11, #0, 172f\n"
- "st1 { v31.b }[12], [x9]\n"
- "st1 { v8.b }[12], [x26]\n"
- "st1 { v15.b }[12], [x25]\n"
- "st1 { v16.b }[12], [x24]\n"
- "st1 { v23.b }[12], [x23]\n"
- "st1 { v24.b }[12], [x22]\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[12], [x11]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v15.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v23.b }[12], [x22]\n"
+ "st1 { v24.b }[12], [x21]\n"
"b 172f\n"
"166:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x11, #1, 167f\n"
- "st1 { v31.h }[4], [x9], #0x2\n"
- "st1 { v8.h }[4], [x26], #0x2\n"
- "st1 { v15.h }[4], [x25], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
- "st1 { v23.h }[4], [x23], #0x2\n"
- "st1 { v24.h }[4], [x22], #0x2\n"
- "tbz x11, #0, 172f\n"
- "st1 { v31.b }[10], [x9]\n"
- "st1 { v8.b }[10], [x26]\n"
- "st1 { v15.b }[10], [x25]\n"
- "st1 { v16.b }[10], [x24]\n"
- "st1 { v23.b }[10], [x23]\n"
- "st1 { v24.b }[10], [x22]\n"
+ "tbz x10, #1, 167f\n"
+ "st1 { v31.h }[4], [x11], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v15.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v23.h }[4], [x22], #0x2\n"
+ "st1 { v24.h }[4], [x21], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[10], [x11]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v15.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v23.b }[10], [x22]\n"
+ "st1 { v24.b }[10], [x21]\n"
"b 172f\n"
"167:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x11, #0, 172f\n"
- "st1 { v31.b }[8], [x9]\n"
- "st1 { v8.b }[8], [x26]\n"
- "st1 { v15.b }[8], [x25]\n"
- "st1 { v16.b }[8], [x24]\n"
- "st1 { v23.b }[8], [x23]\n"
- "st1 { v24.b }[8], [x22]\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[8], [x11]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v15.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v23.b }[8], [x22]\n"
+ "st1 { v24.b }[8], [x21]\n"
"b 172f\n"
"168:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x11, #2, 170f\n"
- "str s31, [x9], #0x4\n"
- "str s8, [x26], #0x4\n"
- "str s15, [x25], #0x4\n"
- "str s16, [x24], #0x4\n"
- "str s23, [x23], #0x4\n"
- "str s24, [x22], #0x4\n"
- "tbz x11, #1, 169f\n"
- "st1 { v31.h }[2], [x9], #0x2\n"
- "st1 { v8.h }[2], [x26], #0x2\n"
- "st1 { v15.h }[2], [x25], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
- "st1 { v23.h }[2], [x23], #0x2\n"
- "st1 { v24.h }[2], [x22], #0x2\n"
- "tbz x11, #0, 172f\n"
- "st1 { v31.b }[6], [x9]\n"
- "st1 { v8.b }[6], [x26]\n"
- "st1 { v15.b }[6], [x25]\n"
- "st1 { v16.b }[6], [x24]\n"
- "st1 { v23.b }[6], [x23]\n"
- "st1 { v24.b }[6], [x22]\n"
+ "tbz x10, #2, 170f\n"
+ "str s31, [x11], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
+ "tbz x10, #1, 169f\n"
+ "st1 { v31.h }[2], [x11], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v15.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v23.h }[2], [x22], #0x2\n"
+ "st1 { v24.h }[2], [x21], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[6], [x11]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v15.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v23.b }[6], [x22]\n"
+ "st1 { v24.b }[6], [x21]\n"
"b 172f\n"
"169:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x11, #0, 172f\n"
- "st1 { v31.b }[4], [x9]\n"
- "st1 { v8.b }[4], [x26]\n"
- "st1 { v15.b }[4], [x25]\n"
- "st1 { v16.b }[4], [x24]\n"
- "st1 { v23.b }[4], [x23]\n"
- "st1 { v24.b }[4], [x22]\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[4], [x11]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v15.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v23.b }[4], [x22]\n"
+ "st1 { v24.b }[4], [x21]\n"
"b 172f\n"
"170:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x11, #1, 171f\n"
- "str h31, [x9], #0x2\n"
- "str h8, [x26], #0x2\n"
- "str h15, [x25], #0x2\n"
- "str h16, [x24], #0x2\n"
- "str h23, [x23], #0x2\n"
- "str h24, [x22], #0x2\n"
- "tbz x11, #0, 172f\n"
- "st1 { v31.b }[2], [x9]\n"
- "st1 { v8.b }[2], [x26]\n"
- "st1 { v15.b }[2], [x25]\n"
- "st1 { v16.b }[2], [x24]\n"
- "st1 { v23.b }[2], [x23]\n"
- "st1 { v24.b }[2], [x22]\n"
+ "tbz x10, #1, 171f\n"
+ "str h31, [x11], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h15, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h23, [x22], #0x2\n"
+ "str h24, [x21], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[2], [x11]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v15.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "st1 { v24.b }[2], [x21]\n"
"b 172f\n"
"171:" // Height 6: Partial direct writeback: partial_1_0
- "str b31, [x9, #0x0]\n"
- "str b8, [x26, #0x0]\n"
- "str b15, [x25, #0x0]\n"
- "str b16, [x24, #0x0]\n"
- "str b23, [x23, #0x0]\n"
- "str b24, [x22, #0x0]\n"
+ "str b31, [x11, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b15, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b23, [x22, #0x0]\n"
+ "str b24, [x21, #0x0]\n"
"172:" // Height 6: Partial direct writeback: Done
"b 174f\n"
"173:" // Height 6: Full writeback
- "str q31, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q8, [x26, #0x0]\n"
- "str q15, [x25, #0x0]\n"
- "str q16, [x24, #0x0]\n"
- "str q23, [x23, #0x0]\n"
- "str q24, [x22, #0x0]\n"
+ "str q31, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q8, [x25, #0x0]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q24, [x21, #0x0]\n"
"174:" // Height 6: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x10, x10, #0x10\n"
"bgt 147b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 176f\n"
@@ -3617,8 +3616,8 @@ void a64_hybrid_s8qs_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"176:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
index 47b6861f5b..e13f2fb5eb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
index 55629a38d0..f5545b4357 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
@@ -44,18 +44,18 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -87,72 +87,72 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"bgt 69f\n"
"beq 35f\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"tbz %x[flags], #0, 12f\n"
"cmp x8, #0x10\n"
"bge 11f\n"
"tbz x8, #3, 6f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"tbz x8, #2, 4f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"tbz x8, #1, 3f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"tbz x8, #0, 10f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"b 10f\n"
"3:" // Height 1: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 10f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"b 10f\n"
"4:" // Height 1: Partial accumulate: partial_2_8
"tbz x8, #1, 5f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"tbz x8, #0, 10f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"b 10f\n"
"5:" // Height 1: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 10f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"b 10f\n"
"6:" // Height 1: Partial accumulate: partial_4_0
"tbz x8, #2, 8f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"tbz x8, #1, 7f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"tbz x8, #0, 10f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"b 10f\n"
"7:" // Height 1: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 10f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"b 10f\n"
"8:" // Height 1: Partial accumulate: partial_2_0
"tbz x8, #1, 9f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"tbz x8, #0, 10f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"b 10f\n"
"9:" // Height 1: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"10:" // Height 1: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 13f\n"
"11:" // Height 1: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"b 13f\n"
"12:" // Height 1: no accumulate
"movi v8.4s, #0x0\n"
@@ -163,8 +163,8 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"mov x15, #0x0\n"
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -180,118 +180,118 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"blt 19f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 18f\n"
"17:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d17, [x17, #0x20]\n"
- "ldr x21, [x17, #0x28]\n"
+ "ldr d17, [x16, #0x20]\n"
+ "ldr x20, [x16, #0x28]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr d16, [x17, #0x30]\n"
- "add x13, x13, #0x10\n"
- "ldr x20, [x17, #0x38]\n"
- "sub x14, x14, #0x10\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x48]\n"
- "ldr x22, [x13, #0x8]\n"
- "cmp x14, #0x20\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x38]\n"
"mov v16.d[1], x20\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
- "ldr d17, [x17, #0x40]\n"
+ "ldr d17, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr d16, [x17, #0x50]\n"
- "ldr x20, [x17, #0x58]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x68]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
+ "ldr d16, [x16, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
"mov v16.d[1], x20\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
- "ldr d17, [x17, #0x60]\n"
+ "ldr d17, [x16, #0x60]\n"
+ "ldr x20, [x16, #0x68]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr d16, [x17, #0x70]\n"
- "ldr x20, [x17, #0x78]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x88]\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x78]\n"
"mov v16.d[1], x20\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
- "ldr d17, [x17, #0x80]\n"
+ "ldr d17, [x16, #0x80]\n"
+ "ldr x20, [x16, #0x88]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr d16, [x17, #0x90]\n"
- "ldr x20, [x17, #0x98]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xa8]\n"
+ "ldr d16, [x16, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
"mov v16.d[1], x20\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
- "ldr d17, [x17, #0xa0]\n"
+ "ldr d17, [x16, #0xa0]\n"
+ "ldr x20, [x16, #0xa8]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr d16, [x17, #0xb0]\n"
- "ldr x20, [x17, #0xb8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xc8]\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xb8]\n"
"mov v16.d[1], x20\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
- "ldr d17, [x17, #0xc0]\n"
+ "ldr d17, [x16, #0xc0]\n"
+ "ldr x20, [x16, #0xc8]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr d16, [x17, #0xd0]\n"
- "ldr x20, [x17, #0xd8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xe8]\n"
+ "ldr d16, [x16, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
"mov v16.d[1], x20\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
- "ldr d17, [x17, #0xe0]\n"
+ "ldr d17, [x16, #0xe0]\n"
+ "ldr x20, [x16, #0xe8]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr d16, [x17, #0xf0]\n"
- "ldr x20, [x17, #0xf8]\n"
- "add x17, x17, #0x100\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x8]\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xf8]\n"
"mov v16.d[1], x20\n"
+ "add x13, x13, #0x10\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
+ "ldr x20, [x16, #0x8]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
- "ldr x20, [x17, #0x18]\n"
- "mov v6.d[1], x21\n"
- "mov v0.d[1], x22\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x21\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
- "add x13, x13, #0x10\n"
- "sub x14, x14, #0x10\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x17, #0x40]\n"
+ "ldr q17, [x16, #0x40]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x17, #0x50]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
+ "ldr q16, [x16, #0x50]\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x17, #0x60]\n"
+ "ldr q17, [x16, #0x60]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x17, #0x70]\n"
+ "ldr q16, [x16, #0x70]\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x17, #0x80]\n"
+ "ldr q17, [x16, #0x80]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x17, #0x90]\n"
+ "ldr q16, [x16, #0x90]\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x17, #0xa0]\n"
+ "ldr q17, [x16, #0xa0]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x17, #0xb0]\n"
+ "ldr q16, [x16, #0xb0]\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x17, #0xc0]\n"
+ "ldr q17, [x16, #0xc0]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x17, #0xd0]\n"
+ "ldr q16, [x16, #0xd0]\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
- "ldr q17, [x17, #0xe0]\n"
+ "ldr q17, [x16, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr q16, [x17, #0xf0]\n"
- "add x17, x17, #0x100\n"
+ "ldr q16, [x16, #0xf0]\n"
+ "add x13, x13, #0x10\n"
+ "sub x14, x14, #0x10\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "add x16, x16, #0x100\n"
"19:" // Height 1: Multiply loop: Main loop skip
"cbz x14, 24f\n"
"cmp x14, #0x4\n"
@@ -299,16 +299,16 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"20:" // Height 1: Multiply loop: Odd block loop
"ldr s18, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr q17, [x17, #0x0]\n"
- "cmp x14, #0x4\n"
- "ldr q16, [x17, #0x10]\n"
- ".inst 0x4f92e228 // sdot v8.4s, v17.16b, v18.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
- "add x17, x17, #0x40\n"
+ "ldr q17, [x16, #0x20]\n"
+ "cmp x14, #0x4\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
+ "add x16, x16, #0x40\n"
"bge 20b\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
"cbz x14, 24f\n"
@@ -320,165 +320,165 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q17, [x17, #0x0]\n"
- "ldr q16, [x17, #0x10]\n"
- ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x4f80e208 // sdot v8.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
- "add x17, x17, #0x40\n"
- ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x20]\n"
+ ".inst 0x4f80e20a // sdot v10.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "add x16, x16, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
"cmp x15, x20\n"
"bne 14b\n"
"cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
"bge 33f\n"
"tbz x8, #3, 28f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"tbz x8, #2, 26f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"tbz x8, #1, 25f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"tbz x8, #0, 32f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"b 32f\n"
"25:" // Height 1: Partial direct writeback: partial_1_12
"tbz x8, #0, 32f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"b 32f\n"
"26:" // Height 1: Partial direct writeback: partial_2_8
"tbz x8, #1, 27f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"tbz x8, #0, 32f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"b 32f\n"
"27:" // Height 1: Partial direct writeback: partial_1_8
"tbz x8, #0, 32f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"b 32f\n"
"28:" // Height 1: Partial direct writeback: partial_4_0
"tbz x8, #2, 30f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"tbz x8, #1, 29f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"tbz x8, #0, 32f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"b 32f\n"
"29:" // Height 1: Partial direct writeback: partial_1_4
"tbz x8, #0, 32f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"b 32f\n"
"30:" // Height 1: Partial direct writeback: partial_2_0
"tbz x8, #1, 31f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"tbz x8, #0, 32f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"b 32f\n"
"31:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"32:" // Height 1: Partial direct writeback: Done
"b 34f\n"
"33:" // Height 1: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"34:" // Height 1: Writeback done
"subs x8, x8, #0x10\n"
"bgt 2b\n"
"b 206f\n"
"35:" // Height 2
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"36:" // Height 2: Column loop
"tbz %x[flags], #0, 46f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x8, #0x10\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"bge 45f\n"
"tbz x8, #3, 40f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"tbz x8, #2, 38f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"ld1 { v14.4s }, [x24], #0x10\n"
"tbz x8, #1, 37f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"ldr d15, [x24], #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"ld1 { v15.s }[2], [x24]\n"
"b 44f\n"
"37:" // Height 2: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 44f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"ldr s15, [x24, #0x0]\n"
"b 44f\n"
"38:" // Height 2: Partial accumulate: partial_2_8
"tbz x8, #1, 39f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"ldr d14, [x24], #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"ld1 { v14.s }[2], [x24]\n"
"b 44f\n"
"39:" // Height 2: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 44f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"ldr s14, [x24, #0x0]\n"
"b 44f\n"
"40:" // Height 2: Partial accumulate: partial_4_0
"tbz x8, #2, 42f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"tbz x8, #1, 41f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"ldr d13, [x24], #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"ld1 { v13.s }[2], [x24]\n"
"b 44f\n"
"41:" // Height 2: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 44f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"ldr s13, [x24, #0x0]\n"
"b 44f\n"
"42:" // Height 2: Partial accumulate: partial_2_0
"tbz x8, #1, 43f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"ldr d12, [x24], #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"ld1 { v12.s }[2], [x24]\n"
"b 44f\n"
"43:" // Height 2: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"ldr s12, [x24, #0x0]\n"
"44:" // Height 2: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 47f\n"
"45:" // Height 2: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"ldr q12, [x24, #0x0]\n"
"ldr q13, [x24, #0x10]\n"
"ldr q14, [x24, #0x20]\n"
@@ -497,8 +497,8 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"mov x15, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -518,154 +518,154 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
"ldr q1, [x12, #0x0]\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x17, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr d17, [x17, #0x20]\n"
+ "ldr d17, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d16, [x17, #0x30]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x48]\n"
- "add x13, x13, #0x10\n"
- "add x12, x12, #0x10\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x21\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
- "ldr d17, [x17, #0x40]\n"
+ "ldr d17, [x16, #0x40]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x58]\n"
+ "ldr x20, [x16, #0x48]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
- "ldr d16, [x17, #0x50]\n"
+ "ldr d16, [x16, #0x50]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x68]\n"
- "ldr x23, [x13, #0x8]\n"
- "sub x14, x14, #0x10\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
- "ldr d17, [x17, #0x60]\n"
+ "ldr d17, [x16, #0x60]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr x21, [x17, #0x78]\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
- "ldr d16, [x17, #0x70]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x88]\n"
- "ldr x22, [x12, #0x8]\n"
- "cmp x14, #0x20\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x21\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
- "ldr d17, [x17, #0x80]\n"
+ "ldr d17, [x16, #0x80]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr x21, [x17, #0x98]\n"
+ "ldr x20, [x16, #0x88]\n"
".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
- "ldr d16, [x17, #0x90]\n"
+ "ldr d16, [x16, #0x90]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xa8]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x16, #0x98]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
- "ldr d17, [x17, #0xa0]\n"
+ "ldr d17, [x16, #0xa0]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr x21, [x17, #0xb8]\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
- "ldr d16, [x17, #0xb0]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xc8]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x21\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
- "ldr d17, [x17, #0xc0]\n"
+ "ldr d17, [x16, #0xc0]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr x21, [x17, #0xd8]\n"
+ "ldr x20, [x16, #0xc8]\n"
".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
- "ldr d16, [x17, #0xd0]\n"
+ "ldr d16, [x16, #0xd0]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xe8]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x16, #0xd8]\n"
+ "mov v16.d[1], x20\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
- "ldr d17, [x17, #0xe0]\n"
+ "ldr d17, [x16, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr x21, [x17, #0xf8]\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
- "ldr d16, [x17, #0xf0]\n"
- "mov v17.d[1], x20\n"
- "add x17, x17, #0x100\n"
- "ldr x20, [x17, #0x8]\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x21\n"
+ "add x13, x13, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"ldr d1, [x12, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x20\n"
- "ldr x20, [x17, #0x18]\n"
- "mov v0.d[1], x23\n"
- "mov v1.d[1], x22\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v1.d[1], x21\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x17, #0x40]\n"
+ "ldr q17, [x16, #0x40]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x17, #0x50]\n"
+ "ldr q16, [x16, #0x50]\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x17, #0x60]\n"
+ "ldr q17, [x16, #0x60]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x17, #0x70]\n"
+ "ldr q16, [x16, #0x70]\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x17, #0x80]\n"
+ "ldr q17, [x16, #0x80]\n"
".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x17, #0x90]\n"
+ "ldr q16, [x16, #0x90]\n"
".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x17, #0xa0]\n"
+ "ldr q17, [x16, #0xa0]\n"
".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x17, #0xb0]\n"
+ "ldr q16, [x16, #0xb0]\n"
".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x17, #0xc0]\n"
+ "ldr q17, [x16, #0xc0]\n"
".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x17, #0xd0]\n"
+ "ldr q16, [x16, #0xd0]\n"
".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
- "ldr q17, [x17, #0xe0]\n"
+ "ldr q17, [x16, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
- "ldr q16, [x17, #0xf0]\n"
+ "ldr q16, [x16, #0xf0]\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
- "add x17, x17, #0x100\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
@@ -678,16 +678,16 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"sub x14, x14, #0x4\n"
"ldr s18, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr q17, [x17, #0x0]\n"
- "ldr q16, [x17, #0x10]\n"
+ "ldr q17, [x16, #0x0]\n"
".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
@@ -705,16 +705,16 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr b0, [x13, #0x0]\n"
"ldr b1, [x12, #0x0]\n"
"57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q17, [x17, #0x0]\n"
- "ldr q16, [x17, #0x10]\n"
+ "ldr q17, [x16, #0x0]\n"
".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
@@ -724,79 +724,79 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"cmp x15, x20\n"
"bne 48b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
"cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x24, x16, x20, LSL #2\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
"bge 67f\n"
"tbz x8, #3, 62f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v13.4s }, [x24], #0x10\n"
"tbz x8, #2, 60f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"st1 { v14.4s }, [x24], #0x10\n"
"tbz x8, #1, 59f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"str d15, [x24], #0x8\n"
"tbz x8, #0, 66f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"st1 { v15.s }[2], [x24]\n"
"b 66f\n"
"59:" // Height 2: Partial direct writeback: partial_1_12
"tbz x8, #0, 66f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"str s15, [x24, #0x0]\n"
"b 66f\n"
"60:" // Height 2: Partial direct writeback: partial_2_8
"tbz x8, #1, 61f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"str d14, [x24], #0x8\n"
"tbz x8, #0, 66f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"st1 { v14.s }[2], [x24]\n"
"b 66f\n"
"61:" // Height 2: Partial direct writeback: partial_1_8
"tbz x8, #0, 66f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"str s14, [x24, #0x0]\n"
"b 66f\n"
"62:" // Height 2: Partial direct writeback: partial_4_0
"tbz x8, #2, 64f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"tbz x8, #1, 63f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"str d13, [x24], #0x8\n"
"tbz x8, #0, 66f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"st1 { v13.s }[2], [x24]\n"
"b 66f\n"
"63:" // Height 2: Partial direct writeback: partial_1_4
"tbz x8, #0, 66f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"str s13, [x24, #0x0]\n"
"b 66f\n"
"64:" // Height 2: Partial direct writeback: partial_2_0
"tbz x8, #1, 65f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"str d12, [x24], #0x8\n"
"tbz x8, #0, 66f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"st1 { v12.s }[2], [x24]\n"
"b 66f\n"
"65:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"str s12, [x24, #0x0]\n"
"66:" // Height 2: Partial direct writeback: Done
"b 68f\n"
"67:" // Height 2: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"str q12, [x24, #0x0]\n"
"str q13, [x24, #0x10]\n"
"str q14, [x24, #0x20]\n"
@@ -807,107 +807,107 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 206f\n"
"69:" // Height 3
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"70:" // Height 3: Column loop
"tbz %x[flags], #0, 80f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
"cmp x8, #0x10\n"
- "add x24, x16, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"bge 79f\n"
"tbz x8, #3, 74f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"ld1 { v17.4s }, [x23], #0x10\n"
"tbz x8, #2, 72f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"ld1 { v14.4s }, [x24], #0x10\n"
"ld1 { v18.4s }, [x23], #0x10\n"
"tbz x8, #1, 71f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
"tbz x8, #0, 78f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"ld1 { v15.s }[2], [x24]\n"
"ld1 { v19.s }[2], [x23]\n"
"b 78f\n"
"71:" // Height 3: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 78f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"ldr s15, [x24, #0x0]\n"
"ldr s19, [x23, #0x0]\n"
"b 78f\n"
"72:" // Height 3: Partial accumulate: partial_2_8
"tbz x8, #1, 73f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"ldr d14, [x24], #0x8\n"
"ldr d18, [x23], #0x8\n"
"tbz x8, #0, 78f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"ld1 { v14.s }[2], [x24]\n"
"ld1 { v18.s }[2], [x23]\n"
"b 78f\n"
"73:" // Height 3: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 78f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"ldr s14, [x24, #0x0]\n"
"ldr s18, [x23, #0x0]\n"
"b 78f\n"
"74:" // Height 3: Partial accumulate: partial_4_0
"tbz x8, #2, 76f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"tbz x8, #1, 75f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
"tbz x8, #0, 78f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"ld1 { v13.s }[2], [x24]\n"
"ld1 { v17.s }[2], [x23]\n"
"b 78f\n"
"75:" // Height 3: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 78f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"ldr s13, [x24, #0x0]\n"
"ldr s17, [x23, #0x0]\n"
"b 78f\n"
"76:" // Height 3: Partial accumulate: partial_2_0
"tbz x8, #1, 77f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"ldr d12, [x24], #0x8\n"
"ldr d16, [x23], #0x8\n"
"tbz x8, #0, 78f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"ld1 { v12.s }[2], [x24]\n"
"ld1 { v16.s }[2], [x23]\n"
"b 78f\n"
"77:" // Height 3: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"ldr s12, [x24, #0x0]\n"
"ldr s16, [x23, #0x0]\n"
"78:" // Height 3: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 81f\n"
"79:" // Height 3: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"ldr q12, [x24, #0x0]\n"
"ldr q13, [x24, #0x10]\n"
"ldr q14, [x24, #0x20]\n"
@@ -934,8 +934,8 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"mov x15, #0x0\n"
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -959,123 +959,123 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"cmp x14, #0x20\n"
"ldr q1, [x12, #0x0]\n"
"ldr q2, [x11, #0x0]\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 86f\n"
"85:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x17, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr d21, [x17, #0x20]\n"
+ "ldr d21, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x13, x13, #0x10\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr d20, [x17, #0x30]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x48]\n"
- "add x11, x11, #0x10\n"
- "ldr x24, [x13, #0x8]\n"
+ "ldr d20, [x16, #0x30]\n"
"mov v20.d[1], x20\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
- "ldr x20, [x17, #0x58]\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
- "ldr d21, [x17, #0x40]\n"
+ "ldr d21, [x16, #0x40]\n"
".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
- "ldr x23, [x12, #0x8]\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
- "ldr x22, [x11, #0x8]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
- "ldr d20, [x17, #0x50]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x68]\n"
- "sub x14, x14, #0x10\n"
- "prfm pldl1keep, [x13, #0x80]\n"
+ "ldr d20, [x16, #0x50]\n"
"mov v20.d[1], x20\n"
".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
- "ldr x20, [x17, #0x78]\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
- "ldr d21, [x17, #0x60]\n"
+ "ldr d21, [x16, #0x60]\n"
".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
- "cmp x14, #0x20\n"
+ "mov v21.d[1], x21\n"
".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
- "ldr d20, [x17, #0x70]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x88]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr d20, [x16, #0x70]\n"
"mov v20.d[1], x20\n"
".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
- "ldr x20, [x17, #0x98]\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
- "ldr d21, [x17, #0x80]\n"
+ "ldr d21, [x16, #0x80]\n"
".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
- "ldr d20, [x17, #0x90]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xa8]\n"
+ "ldr d20, [x16, #0x90]\n"
"mov v20.d[1], x20\n"
".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
- "ldr x20, [x17, #0xb8]\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
- "ldr d21, [x17, #0xa0]\n"
+ "ldr d21, [x16, #0xa0]\n"
".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
- "ldr d20, [x17, #0xb0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xc8]\n"
+ "ldr d20, [x16, #0xb0]\n"
"mov v20.d[1], x20\n"
".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
- "ldr x20, [x17, #0xd8]\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
- "ldr d21, [x17, #0xc0]\n"
+ "ldr d21, [x16, #0xc0]\n"
".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
- "ldr d20, [x17, #0xd0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xe8]\n"
+ "ldr d20, [x16, #0xd0]\n"
"mov v20.d[1], x20\n"
".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
- "ldr x20, [x17, #0xf8]\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
- "ldr d21, [x17, #0xe0]\n"
+ "ldr d21, [x16, #0xe0]\n"
".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ "mov v21.d[1], x21\n"
".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ "add x13, x13, #0x10\n"
".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
- "ldr d20, [x17, #0xf0]\n"
- "mov v21.d[1], x21\n"
- "add x17, x17, #0x100\n"
- "ldr x21, [x17, #0x8]\n"
+ "ldr d20, [x16, #0xf0]\n"
"mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0x8]\n"
".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
- "ldr x20, [x17, #0x18]\n"
+ "ldr x23, [x13, #0x8]\n"
".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
"ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"ldr d2, [x11, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x21\n"
- "mov v0.d[1], x24\n"
- "mov v1.d[1], x23\n"
- "mov v2.d[1], x22\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x23\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"mov v7.d[1], x20\n"
"bge 85b\n"
"86:" // Height 3: Multiply loop: Single iteration only
@@ -1084,66 +1084,66 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q21, [x17, #0x20]\n"
+ "ldr q21, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x11, x11, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q20, [x17, #0x30]\n"
+ "ldr q20, [x16, #0x30]\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x17, #0x40]\n"
+ "ldr q21, [x16, #0x40]\n"
".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x17, #0x50]\n"
+ "ldr q20, [x16, #0x50]\n"
".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x17, #0x60]\n"
+ "ldr q21, [x16, #0x60]\n"
".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x17, #0x70]\n"
+ "ldr q20, [x16, #0x70]\n"
".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x17, #0x80]\n"
+ "ldr q21, [x16, #0x80]\n"
".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x17, #0x90]\n"
+ "ldr q20, [x16, #0x90]\n"
".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x17, #0xa0]\n"
+ "ldr q21, [x16, #0xa0]\n"
".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x17, #0xb0]\n"
+ "ldr q20, [x16, #0xb0]\n"
".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x17, #0xc0]\n"
+ "ldr q21, [x16, #0xc0]\n"
".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x17, #0xd0]\n"
+ "ldr q20, [x16, #0xd0]\n"
".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
- "ldr q21, [x17, #0xe0]\n"
+ "ldr q21, [x16, #0xe0]\n"
".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
- "ldr q20, [x17, #0xf0]\n"
+ "ldr q20, [x16, #0xf0]\n"
".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
- "add x17, x17, #0x100\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
@@ -1159,18 +1159,18 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr s23, [x12], #0x4\n"
"cmp x14, #0x4\n"
"ldr s22, [x11], #0x4\n"
- "ldr q21, [x17, #0x0]\n"
- "ldr q20, [x17, #0x10]\n"
+ "ldr q21, [x16, #0x0]\n"
".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
- "ldr q21, [x17, #0x20]\n"
+ "ldr q21, [x16, #0x20]\n"
".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
- "ldr q20, [x17, #0x30]\n"
+ "ldr q20, [x16, #0x30]\n"
".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n"
@@ -1193,18 +1193,18 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr b1, [x12, #0x0]\n"
"ldr b2, [x11, #0x0]\n"
"91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q21, [x17, #0x0]\n"
- "ldr q20, [x17, #0x10]\n"
+ "ldr q21, [x16, #0x0]\n"
".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x17, #0x20]\n"
+ "ldr q21, [x16, #0x20]\n"
".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x17, #0x30]\n"
+ "ldr q20, [x16, #0x30]\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
@@ -1216,97 +1216,97 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"cmp x15, x20\n"
"bne 82b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"bge 101f\n"
"tbz x8, #3, 96f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v13.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
"st1 { v17.4s }, [x23], #0x10\n"
"tbz x8, #2, 94f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"st1 { v14.4s }, [x24], #0x10\n"
"st1 { v18.4s }, [x23], #0x10\n"
"tbz x8, #1, 93f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"str d15, [x24], #0x8\n"
"str d19, [x23], #0x8\n"
"tbz x8, #0, 100f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"st1 { v15.s }[2], [x24]\n"
"st1 { v19.s }[2], [x23]\n"
"b 100f\n"
"93:" // Height 3: Partial direct writeback: partial_1_12
"tbz x8, #0, 100f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"str s15, [x24, #0x0]\n"
"str s19, [x23, #0x0]\n"
"b 100f\n"
"94:" // Height 3: Partial direct writeback: partial_2_8
"tbz x8, #1, 95f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"str d14, [x24], #0x8\n"
"str d18, [x23], #0x8\n"
"tbz x8, #0, 100f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"st1 { v14.s }[2], [x24]\n"
"st1 { v18.s }[2], [x23]\n"
"b 100f\n"
"95:" // Height 3: Partial direct writeback: partial_1_8
"tbz x8, #0, 100f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"str s14, [x24, #0x0]\n"
"str s18, [x23, #0x0]\n"
"b 100f\n"
"96:" // Height 3: Partial direct writeback: partial_4_0
"tbz x8, #2, 98f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
"tbz x8, #1, 97f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"str d13, [x24], #0x8\n"
"str d17, [x23], #0x8\n"
"tbz x8, #0, 100f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"st1 { v13.s }[2], [x24]\n"
"st1 { v17.s }[2], [x23]\n"
"b 100f\n"
"97:" // Height 3: Partial direct writeback: partial_1_4
"tbz x8, #0, 100f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"str s13, [x24, #0x0]\n"
"str s17, [x23, #0x0]\n"
"b 100f\n"
"98:" // Height 3: Partial direct writeback: partial_2_0
"tbz x8, #1, 99f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"str d12, [x24], #0x8\n"
"str d16, [x23], #0x8\n"
"tbz x8, #0, 100f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"st1 { v12.s }[2], [x24]\n"
"st1 { v16.s }[2], [x23]\n"
"b 100f\n"
"99:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"str s12, [x24, #0x0]\n"
"str s16, [x23, #0x0]\n"
"100:" // Height 3: Partial direct writeback: Done
"b 102f\n"
"101:" // Height 3: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"str q12, [x24, #0x0]\n"
"str q13, [x24, #0x10]\n"
"str q14, [x24, #0x20]\n"
@@ -1321,38 +1321,38 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 206f\n"
"103:" // Height 4
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"104:" // Height 4: Column loop
"tbz %x[flags], #0, 114f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
"add x22, x23, x20, LSL #2\n"
"bge 113f\n"
"tbz x8, #3, 108f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"ld1 { v17.4s }, [x23], #0x10\n"
"ld1 { v21.4s }, [x22], #0x10\n"
"tbz x8, #2, 106f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"ld1 { v14.4s }, [x24], #0x10\n"
"ld1 { v18.4s }, [x23], #0x10\n"
"ld1 { v22.4s }, [x22], #0x10\n"
"tbz x8, #1, 105f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
"ldr d23, [x22], #0x8\n"
"tbz x8, #0, 112f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"ld1 { v15.s }[2], [x24]\n"
"ld1 { v19.s }[2], [x23]\n"
"ld1 { v23.s }[2], [x22]\n"
@@ -1360,20 +1360,20 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"105:" // Height 4: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 112f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"ldr s15, [x24, #0x0]\n"
"ldr s19, [x23, #0x0]\n"
"ldr s23, [x22, #0x0]\n"
"b 112f\n"
"106:" // Height 4: Partial accumulate: partial_2_8
"tbz x8, #1, 107f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"ldr d14, [x24], #0x8\n"
"ldr d18, [x23], #0x8\n"
"ldr d22, [x22], #0x8\n"
"tbz x8, #0, 112f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"ld1 { v14.s }[2], [x24]\n"
"ld1 { v18.s }[2], [x23]\n"
"ld1 { v22.s }[2], [x22]\n"
@@ -1381,25 +1381,25 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"107:" // Height 4: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 112f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"ldr s14, [x24, #0x0]\n"
"ldr s18, [x23, #0x0]\n"
"ldr s22, [x22, #0x0]\n"
"b 112f\n"
"108:" // Height 4: Partial accumulate: partial_4_0
"tbz x8, #2, 110f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"tbz x8, #1, 109f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
"ldr d21, [x22], #0x8\n"
"tbz x8, #0, 112f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"ld1 { v13.s }[2], [x24]\n"
"ld1 { v17.s }[2], [x23]\n"
"ld1 { v21.s }[2], [x22]\n"
@@ -1407,38 +1407,38 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"109:" // Height 4: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 112f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"ldr s13, [x24, #0x0]\n"
"ldr s17, [x23, #0x0]\n"
"ldr s21, [x22, #0x0]\n"
"b 112f\n"
"110:" // Height 4: Partial accumulate: partial_2_0
"tbz x8, #1, 111f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"ldr d12, [x24], #0x8\n"
"ldr d16, [x23], #0x8\n"
"ldr d20, [x22], #0x8\n"
"tbz x8, #0, 112f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"ld1 { v12.s }[2], [x24]\n"
"ld1 { v16.s }[2], [x23]\n"
"ld1 { v20.s }[2], [x22]\n"
"b 112f\n"
"111:" // Height 4: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"ldr s12, [x24, #0x0]\n"
"ldr s16, [x23, #0x0]\n"
"ldr s20, [x22, #0x0]\n"
"112:" // Height 4: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 115f\n"
"113:" // Height 4: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"ldr q12, [x24, #0x0]\n"
"ldr q13, [x24, #0x10]\n"
"ldr q14, [x24, #0x20]\n"
@@ -1473,8 +1473,8 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"mov x15, #0x0\n"
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1502,129 +1502,130 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr q1, [x12, #0x0]\n"
"ldr q2, [x11, #0x0]\n"
"ldr q3, [x10, #0x0]\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 120f\n"
"119:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x17, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x21, [x17, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr d25, [x17, #0x20]\n"
+ "ldr d25, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x11, x11, #0x10\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "mov v25.d[1], x20\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr d24, [x17, #0x30]\n"
+ "ldr d24, [x16, #0x30]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
- "ldr x20, [x17, #0x48]\n"
".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
- "ldr d25, [x17, #0x40]\n"
+ "ldr d25, [x16, #0x40]\n"
".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x58]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
- "ldr x25, [x13, #0x8]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
- "mov v25.d[1], x20\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
- "ldr d24, [x17, #0x50]\n"
+ "ldr d24, [x16, #0x50]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
- "ldr x20, [x17, #0x68]\n"
".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
- "ldr x24, [x12, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ "ldr x25, [x13, #0x8]\n"
".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
- "ldr d25, [x17, #0x60]\n"
+ "ldr d25, [x16, #0x60]\n"
".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
- "ldr x21, [x17, #0x78]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
- "ldr x23, [x11, #0x8]\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
- "mov v25.d[1], x20\n"
+ "ldr x24, [x12, #0x8]\n"
".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
- "ldr d24, [x17, #0x70]\n"
+ "ldr d24, [x16, #0x70]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
- "ldr x20, [x17, #0x88]\n"
".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
- "ldr x22, [x10, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ "ldr x23, [x11, #0x8]\n"
".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
- "ldr d25, [x17, #0x80]\n"
+ "ldr d25, [x16, #0x80]\n"
".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
- "ldr x21, [x17, #0x98]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
- "sub x14, x14, #0x10\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
- "mov v25.d[1], x20\n"
+ "ldr x22, [x10, #0x8]\n"
".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
- "ldr d24, [x17, #0x90]\n"
+ "ldr d24, [x16, #0x90]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
- "ldr x20, [x17, #0xa8]\n"
".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
- "cmp x14, #0x20\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ "sub x14, x14, #0x10\n"
".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
- "ldr d25, [x17, #0xa0]\n"
+ "ldr d25, [x16, #0xa0]\n"
".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
- "ldr x21, [x17, #0xb8]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
- "mov v25.d[1], x20\n"
+ "cmp x14, #0x20\n"
".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
- "ldr d24, [x17, #0xb0]\n"
+ "ldr d24, [x16, #0xb0]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
- "ldr x20, [x17, #0xc8]\n"
".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
- "ldr d25, [x17, #0xc0]\n"
+ "ldr d25, [x16, #0xc0]\n"
".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
- "ldr x21, [x17, #0xd8]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
- "mov v25.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
- "ldr d24, [x17, #0xd0]\n"
+ "ldr d24, [x16, #0xd0]\n"
+ "mov v24.d[1], x20\n"
".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
- "ldr x20, [x17, #0xe8]\n"
".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
- "ldr d25, [x17, #0xe0]\n"
+ "ldr d25, [x16, #0xe0]\n"
".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
- "ldr x21, [x17, #0xf8]\n"
+ "mov v25.d[1], x21\n"
".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
- "mov v25.d[1], x20\n"
".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
- "ldr d24, [x17, #0xf0]\n"
- "add x17, x17, #0x100\n"
+ "ldr d24, [x16, #0xf0]\n"
+ "mov v24.d[1], x20\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
- "ldr x20, [x17, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0x18]\n"
".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
@@ -1633,9 +1634,8 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr d2, [x11, #0x0]\n"
".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"ldr d3, [x10, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x20\n"
- "ldr x20, [x17, #0x18]\n"
+ "ldr d7, [x16, #0x10]\n"
+ "mov v6.d[1], x21\n"
"mov v0.d[1], x25\n"
"mov v1.d[1], x24\n"
"mov v2.d[1], x23\n"
@@ -1650,7 +1650,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x11, x11, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q25, [x17, #0x20]\n"
+ "ldr q25, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x10, x10, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1658,7 +1658,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q24, [x17, #0x30]\n"
+ "ldr q24, [x16, #0x30]\n"
".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
@@ -1666,64 +1666,64 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
"prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x17, #0x40]\n"
+ "ldr q25, [x16, #0x40]\n"
".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x17, #0x50]\n"
+ "ldr q24, [x16, #0x50]\n"
".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x17, #0x60]\n"
+ "ldr q25, [x16, #0x60]\n"
".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x17, #0x70]\n"
+ "ldr q24, [x16, #0x70]\n"
".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x17, #0x80]\n"
+ "ldr q25, [x16, #0x80]\n"
".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x17, #0x90]\n"
+ "ldr q24, [x16, #0x90]\n"
".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x17, #0xa0]\n"
+ "ldr q25, [x16, #0xa0]\n"
".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x17, #0xb0]\n"
+ "ldr q24, [x16, #0xb0]\n"
".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x17, #0xc0]\n"
+ "ldr q25, [x16, #0xc0]\n"
".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x17, #0xd0]\n"
+ "ldr q24, [x16, #0xd0]\n"
".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
- "ldr q25, [x17, #0xe0]\n"
+ "ldr q25, [x16, #0xe0]\n"
".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
- "ldr q24, [x17, #0xf0]\n"
+ "ldr q24, [x16, #0xf0]\n"
".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
- "add x17, x17, #0x100\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
@@ -1742,20 +1742,20 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"cmp x14, #0x4\n"
"ldr s27, [x11], #0x4\n"
"ldr s26, [x10], #0x4\n"
- "ldr q25, [x17, #0x0]\n"
- "ldr q24, [x17, #0x10]\n"
+ "ldr q25, [x16, #0x0]\n"
".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
- "ldr q25, [x17, #0x20]\n"
+ "ldr q25, [x16, #0x20]\n"
".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
- "ldr q24, [x17, #0x30]\n"
+ "ldr q24, [x16, #0x30]\n"
".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n"
@@ -1783,20 +1783,20 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr b2, [x11, #0x0]\n"
"ldr b3, [x10, #0x0]\n"
"125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q25, [x17, #0x0]\n"
- "ldr q24, [x17, #0x10]\n"
+ "ldr q25, [x16, #0x0]\n"
".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x17, #0x20]\n"
+ "ldr q25, [x16, #0x20]\n"
".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x17, #0x30]\n"
+ "ldr q24, [x16, #0x30]\n"
".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
@@ -1810,18 +1810,18 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"cmp x15, x20\n"
"bne 116b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
"bge 135f\n"
"tbz x8, #3, 130f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v13.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
@@ -1829,96 +1829,96 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"st1 { v20.4s }, [x22], #0x10\n"
"st1 { v21.4s }, [x22], #0x10\n"
"tbz x8, #2, 128f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"st1 { v14.4s }, [x24], #0x10\n"
"st1 { v18.4s }, [x23], #0x10\n"
"st1 { v22.4s }, [x22], #0x10\n"
"tbz x8, #1, 127f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"str d15, [x24], #0x8\n"
"str d19, [x23], #0x8\n"
"str d23, [x22], #0x8\n"
"tbz x8, #0, 134f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"st1 { v15.s }[2], [x24]\n"
"st1 { v19.s }[2], [x23]\n"
"st1 { v23.s }[2], [x22]\n"
"b 134f\n"
"127:" // Height 4: Partial direct writeback: partial_1_12
"tbz x8, #0, 134f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"str s15, [x24, #0x0]\n"
"str s19, [x23, #0x0]\n"
"str s23, [x22, #0x0]\n"
"b 134f\n"
"128:" // Height 4: Partial direct writeback: partial_2_8
"tbz x8, #1, 129f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"str d14, [x24], #0x8\n"
"str d18, [x23], #0x8\n"
"str d22, [x22], #0x8\n"
"tbz x8, #0, 134f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"st1 { v14.s }[2], [x24]\n"
"st1 { v18.s }[2], [x23]\n"
"st1 { v22.s }[2], [x22]\n"
"b 134f\n"
"129:" // Height 4: Partial direct writeback: partial_1_8
"tbz x8, #0, 134f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"str s14, [x24, #0x0]\n"
"str s18, [x23, #0x0]\n"
"str s22, [x22, #0x0]\n"
"b 134f\n"
"130:" // Height 4: Partial direct writeback: partial_4_0
"tbz x8, #2, 132f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
"st1 { v20.4s }, [x22], #0x10\n"
"tbz x8, #1, 131f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"str d13, [x24], #0x8\n"
"str d17, [x23], #0x8\n"
"str d21, [x22], #0x8\n"
"tbz x8, #0, 134f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"st1 { v13.s }[2], [x24]\n"
"st1 { v17.s }[2], [x23]\n"
"st1 { v21.s }[2], [x22]\n"
"b 134f\n"
"131:" // Height 4: Partial direct writeback: partial_1_4
"tbz x8, #0, 134f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"str s13, [x24, #0x0]\n"
"str s17, [x23, #0x0]\n"
"str s21, [x22, #0x0]\n"
"b 134f\n"
"132:" // Height 4: Partial direct writeback: partial_2_0
"tbz x8, #1, 133f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"str d12, [x24], #0x8\n"
"str d16, [x23], #0x8\n"
"str d20, [x22], #0x8\n"
"tbz x8, #0, 134f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"st1 { v12.s }[2], [x24]\n"
"st1 { v16.s }[2], [x23]\n"
"st1 { v20.s }[2], [x22]\n"
"b 134f\n"
"133:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"str s12, [x24, #0x0]\n"
"str s16, [x23, #0x0]\n"
"str s20, [x22, #0x0]\n"
"134:" // Height 4: Partial direct writeback: Done
"b 136f\n"
"135:" // Height 4: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"str q12, [x24, #0x0]\n"
"str q13, [x24, #0x10]\n"
"str q14, [x24, #0x20]\n"
@@ -1937,43 +1937,43 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 206f\n"
"137:" // Height 5
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"138:" // Height 5: Column loop
"tbz %x[flags], #0, 148f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
"add x21, x22, x20, LSL #2\n"
"bge 147f\n"
"tbz x8, #3, 142f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"ld1 { v17.4s }, [x23], #0x10\n"
"ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v25.4s }, [x21], #0x10\n"
"tbz x8, #2, 140f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"ld1 { v14.4s }, [x24], #0x10\n"
"ld1 { v18.4s }, [x23], #0x10\n"
"ld1 { v22.4s }, [x22], #0x10\n"
"ld1 { v26.4s }, [x21], #0x10\n"
"tbz x8, #1, 139f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
"ldr d23, [x22], #0x8\n"
"ldr d27, [x21], #0x8\n"
"tbz x8, #0, 146f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"ld1 { v15.s }[2], [x24]\n"
"ld1 { v19.s }[2], [x23]\n"
"ld1 { v23.s }[2], [x22]\n"
@@ -1982,7 +1982,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"139:" // Height 5: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 146f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"ldr s15, [x24, #0x0]\n"
"ldr s19, [x23, #0x0]\n"
"ldr s23, [x22, #0x0]\n"
@@ -1990,14 +1990,14 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 146f\n"
"140:" // Height 5: Partial accumulate: partial_2_8
"tbz x8, #1, 141f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"ldr d14, [x24], #0x8\n"
"ldr d18, [x23], #0x8\n"
"ldr d22, [x22], #0x8\n"
"ldr d26, [x21], #0x8\n"
"tbz x8, #0, 146f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"ld1 { v14.s }[2], [x24]\n"
"ld1 { v18.s }[2], [x23]\n"
"ld1 { v22.s }[2], [x22]\n"
@@ -2006,7 +2006,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"141:" // Height 5: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 146f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"ldr s14, [x24, #0x0]\n"
"ldr s18, [x23, #0x0]\n"
"ldr s22, [x22, #0x0]\n"
@@ -2014,20 +2014,20 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 146f\n"
"142:" // Height 5: Partial accumulate: partial_4_0
"tbz x8, #2, 144f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
"tbz x8, #1, 143f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
"ldr d21, [x22], #0x8\n"
"ldr d25, [x21], #0x8\n"
"tbz x8, #0, 146f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"ld1 { v13.s }[2], [x24]\n"
"ld1 { v17.s }[2], [x23]\n"
"ld1 { v21.s }[2], [x22]\n"
@@ -2036,7 +2036,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"143:" // Height 5: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 146f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"ldr s13, [x24, #0x0]\n"
"ldr s17, [x23, #0x0]\n"
"ldr s21, [x22, #0x0]\n"
@@ -2044,34 +2044,34 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 146f\n"
"144:" // Height 5: Partial accumulate: partial_2_0
"tbz x8, #1, 145f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"ldr d12, [x24], #0x8\n"
"ldr d16, [x23], #0x8\n"
"ldr d20, [x22], #0x8\n"
"ldr d24, [x21], #0x8\n"
"tbz x8, #0, 146f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"ld1 { v12.s }[2], [x24]\n"
"ld1 { v16.s }[2], [x23]\n"
"ld1 { v20.s }[2], [x22]\n"
"ld1 { v24.s }[2], [x21]\n"
"b 146f\n"
"145:" // Height 5: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"ldr s12, [x24, #0x0]\n"
"ldr s16, [x23, #0x0]\n"
"ldr s20, [x22, #0x0]\n"
"ldr s24, [x21, #0x0]\n"
"146:" // Height 5: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 149f\n"
"147:" // Height 5: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"ldr q12, [x24, #0x0]\n"
"ldr q13, [x24, #0x10]\n"
"ldr q14, [x24, #0x20]\n"
@@ -2114,8 +2114,8 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"mov x15, #0x0\n"
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2147,148 +2147,148 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr q2, [x11, #0x0]\n"
"ldr q3, [x10, #0x0]\n"
"ldr q4, [x9, #0x0]\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 154f\n"
"153:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x17, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr d29, [x17, #0x20]\n"
+ "ldr d29, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x11, x11, #0x10\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "mov v29.d[1], x21\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr x21, [x17, #0x48]\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr d28, [x17, #0x30]\n"
+ "ldr d28, [x16, #0x30]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
- "add x9, x9, #0x10\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
- "ldr x26, [x13, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
- "ldr x20, [x17, #0x58]\n"
+ "ldr x26, [x13, #0x8]\n"
".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
- "ldr d29, [x17, #0x40]\n"
+ "ldr d29, [x16, #0x40]\n"
".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
- "ldr x25, [x12, #0x8]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
- "ldr x24, [x11, #0x8]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
- "mov v29.d[1], x21\n"
+ "ldr x25, [x12, #0x8]\n"
".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
- "ldr x21, [x17, #0x68]\n"
+ "ldr x24, [x11, #0x8]\n"
".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
- "ldr d28, [x17, #0x50]\n"
+ "ldr d28, [x16, #0x50]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
- "ldr x23, [x10, #0x8]\n"
".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
- "ldr x22, [x9, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ "ldr x23, [x10, #0x8]\n"
".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
- "ldr x20, [x17, #0x78]\n"
+ "ldr x22, [x9, #0x8]\n"
".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
- "ldr d29, [x17, #0x60]\n"
+ "ldr d29, [x16, #0x60]\n"
".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
- "sub x14, x14, #0x10\n"
+ "mov v29.d[1], x21\n"
".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
- "cmp x14, #0x20\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
- "mov v29.d[1], x21\n"
+ "sub x14, x14, #0x10\n"
".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
- "ldr x21, [x17, #0x88]\n"
+ "cmp x14, #0x20\n"
".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
- "ldr d28, [x17, #0x70]\n"
+ "ldr d28, [x16, #0x70]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
- "ldr x20, [x17, #0x98]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
- "ldr d29, [x17, #0x80]\n"
+ "ldr d29, [x16, #0x80]\n"
".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
- "mov v29.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
- "ldr x21, [x17, #0xa8]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
- "ldr d28, [x17, #0x90]\n"
+ "ldr d28, [x16, #0x90]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
- "ldr x20, [x17, #0xb8]\n"
".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
- "ldr d29, [x17, #0xa0]\n"
+ "ldr d29, [x16, #0xa0]\n"
".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
- "mov v29.d[1], x21\n"
".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
- "ldr x21, [x17, #0xc8]\n"
".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
- "ldr d28, [x17, #0xb0]\n"
+ "ldr d28, [x16, #0xb0]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
- "ldr x20, [x17, #0xd8]\n"
".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
- "ldr d29, [x17, #0xc0]\n"
+ "ldr d29, [x16, #0xc0]\n"
".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
- "mov v29.d[1], x21\n"
".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
- "ldr x21, [x17, #0xe8]\n"
".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
- "ldr d28, [x17, #0xd0]\n"
+ "ldr d28, [x16, #0xd0]\n"
+ "mov v28.d[1], x20\n"
".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
- "ldr x20, [x17, #0xf8]\n"
".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
- "ldr d29, [x17, #0xe0]\n"
+ "ldr d29, [x16, #0xe0]\n"
".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
- "mov v29.d[1], x21\n"
".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
- "ldr d28, [x17, #0xf0]\n"
- "add x17, x17, #0x100\n"
+ "ldr d28, [x16, #0xf0]\n"
+ "mov v28.d[1], x20\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
- "ldr x21, [x17, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
- "ldr x20, [x17, #0x18]\n"
".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
@@ -2299,7 +2299,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr d3, [x10, #0x0]\n"
".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"ldr d4, [x9, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr d7, [x16, #0x10]\n"
"mov v6.d[1], x21\n"
"mov v0.d[1], x26\n"
"mov v1.d[1], x25\n"
@@ -2318,7 +2318,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
"add x10, x10, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q29, [x17, #0x20]\n"
+ "ldr q29, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x9, x9, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2328,7 +2328,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q28, [x17, #0x30]\n"
+ "ldr q28, [x16, #0x30]\n"
".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
@@ -2337,75 +2337,75 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x17, #0x40]\n"
+ "ldr q29, [x16, #0x40]\n"
".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x17, #0x50]\n"
+ "ldr q28, [x16, #0x50]\n"
".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x17, #0x60]\n"
+ "ldr q29, [x16, #0x60]\n"
".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x17, #0x70]\n"
+ "ldr q28, [x16, #0x70]\n"
".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x17, #0x80]\n"
+ "ldr q29, [x16, #0x80]\n"
".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x17, #0x90]\n"
+ "ldr q28, [x16, #0x90]\n"
".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x17, #0xa0]\n"
+ "ldr q29, [x16, #0xa0]\n"
".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x17, #0xb0]\n"
+ "ldr q28, [x16, #0xb0]\n"
".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x17, #0xc0]\n"
+ "ldr q29, [x16, #0xc0]\n"
".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x17, #0xd0]\n"
+ "ldr q28, [x16, #0xd0]\n"
".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
- "ldr q29, [x17, #0xe0]\n"
+ "ldr q29, [x16, #0xe0]\n"
".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
- "ldr q28, [x17, #0xf0]\n"
+ "ldr q28, [x16, #0xf0]\n"
".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
- "add x17, x17, #0x100\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
@@ -2427,22 +2427,22 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr s0, [x11], #0x4\n"
"ldr s31, [x10], #0x4\n"
"ldr s30, [x9], #0x4\n"
- "ldr q29, [x17, #0x0]\n"
- "ldr q28, [x17, #0x10]\n"
+ "ldr q29, [x16, #0x0]\n"
".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
- "ldr q29, [x17, #0x20]\n"
+ "ldr q29, [x16, #0x20]\n"
".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
- "ldr q28, [x17, #0x30]\n"
+ "ldr q28, [x16, #0x30]\n"
".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n"
@@ -2475,22 +2475,22 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr b3, [x10, #0x0]\n"
"ldr b4, [x9, #0x0]\n"
"159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q29, [x17, #0x0]\n"
- "ldr q28, [x17, #0x10]\n"
+ "ldr q29, [x16, #0x0]\n"
".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x17, #0x20]\n"
+ "ldr q29, [x16, #0x20]\n"
".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x17, #0x30]\n"
+ "ldr q28, [x16, #0x30]\n"
".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
@@ -2506,20 +2506,20 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"cmp x15, x20\n"
"bne 150b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"add x21, x22, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
"bge 169f\n"
"tbz x8, #3, 164f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v13.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
@@ -2529,19 +2529,19 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"st1 { v24.4s }, [x21], #0x10\n"
"st1 { v25.4s }, [x21], #0x10\n"
"tbz x8, #2, 162f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"st1 { v14.4s }, [x24], #0x10\n"
"st1 { v18.4s }, [x23], #0x10\n"
"st1 { v22.4s }, [x22], #0x10\n"
"st1 { v26.4s }, [x21], #0x10\n"
"tbz x8, #1, 161f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"str d15, [x24], #0x8\n"
"str d19, [x23], #0x8\n"
"str d23, [x22], #0x8\n"
"str d27, [x21], #0x8\n"
"tbz x8, #0, 168f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"st1 { v15.s }[2], [x24]\n"
"st1 { v19.s }[2], [x23]\n"
"st1 { v23.s }[2], [x22]\n"
@@ -2549,7 +2549,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 168f\n"
"161:" // Height 5: Partial direct writeback: partial_1_12
"tbz x8, #0, 168f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"str s15, [x24, #0x0]\n"
"str s19, [x23, #0x0]\n"
"str s23, [x22, #0x0]\n"
@@ -2557,13 +2557,13 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 168f\n"
"162:" // Height 5: Partial direct writeback: partial_2_8
"tbz x8, #1, 163f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"str d14, [x24], #0x8\n"
"str d18, [x23], #0x8\n"
"str d22, [x22], #0x8\n"
"str d26, [x21], #0x8\n"
"tbz x8, #0, 168f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"st1 { v14.s }[2], [x24]\n"
"st1 { v18.s }[2], [x23]\n"
"st1 { v22.s }[2], [x22]\n"
@@ -2571,7 +2571,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 168f\n"
"163:" // Height 5: Partial direct writeback: partial_1_8
"tbz x8, #0, 168f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"str s14, [x24, #0x0]\n"
"str s18, [x23, #0x0]\n"
"str s22, [x22, #0x0]\n"
@@ -2579,19 +2579,19 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 168f\n"
"164:" // Height 5: Partial direct writeback: partial_4_0
"tbz x8, #2, 166f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
"st1 { v20.4s }, [x22], #0x10\n"
"st1 { v24.4s }, [x21], #0x10\n"
"tbz x8, #1, 165f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"str d13, [x24], #0x8\n"
"str d17, [x23], #0x8\n"
"str d21, [x22], #0x8\n"
"str d25, [x21], #0x8\n"
"tbz x8, #0, 168f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"st1 { v13.s }[2], [x24]\n"
"st1 { v17.s }[2], [x23]\n"
"st1 { v21.s }[2], [x22]\n"
@@ -2599,7 +2599,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 168f\n"
"165:" // Height 5: Partial direct writeback: partial_1_4
"tbz x8, #0, 168f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"str s13, [x24, #0x0]\n"
"str s17, [x23, #0x0]\n"
"str s21, [x22, #0x0]\n"
@@ -2607,20 +2607,20 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 168f\n"
"166:" // Height 5: Partial direct writeback: partial_2_0
"tbz x8, #1, 167f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"str d12, [x24], #0x8\n"
"str d16, [x23], #0x8\n"
"str d20, [x22], #0x8\n"
"str d24, [x21], #0x8\n"
"tbz x8, #0, 168f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"st1 { v12.s }[2], [x24]\n"
"st1 { v16.s }[2], [x23]\n"
"st1 { v20.s }[2], [x22]\n"
"st1 { v24.s }[2], [x21]\n"
"b 168f\n"
"167:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"str s12, [x24, #0x0]\n"
"str s16, [x23, #0x0]\n"
"str s20, [x22, #0x0]\n"
@@ -2628,11 +2628,11 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"168:" // Height 5: Partial direct writeback: Done
"b 170f\n"
"169:" // Height 5: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"str q12, [x24, #0x0]\n"
"str q13, [x24, #0x10]\n"
"str q14, [x24, #0x20]\n"
@@ -2656,43 +2656,42 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"171:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x18\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x16\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"172:" // Height 6: Column loop
"tbz %x[flags], #0, 182f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
"add x20, x21, x20, LSL #2\n"
"bge 181f\n"
"tbz x8, #3, 176f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
"ld1 { v28.4s }, [x20], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"ld1 { v17.4s }, [x23], #0x10\n"
"ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v25.4s }, [x21], #0x10\n"
"ld1 { v29.4s }, [x20], #0x10\n"
"tbz x8, #2, 174f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"ld1 { v14.4s }, [x24], #0x10\n"
"ld1 { v18.4s }, [x23], #0x10\n"
"ld1 { v22.4s }, [x22], #0x10\n"
"ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v30.4s }, [x20], #0x10\n"
"tbz x8, #1, 173f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
@@ -2700,7 +2699,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr d27, [x21], #0x8\n"
"ldr d31, [x20], #0x8\n"
"tbz x8, #0, 180f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"ld1 { v15.s }[2], [x24]\n"
"ld1 { v19.s }[2], [x23]\n"
"ld1 { v23.s }[2], [x22]\n"
@@ -2710,7 +2709,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"173:" // Height 6: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 180f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"ldr s15, [x24, #0x0]\n"
"ldr s19, [x23, #0x0]\n"
"ldr s23, [x22, #0x0]\n"
@@ -2719,7 +2718,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 180f\n"
"174:" // Height 6: Partial accumulate: partial_2_8
"tbz x8, #1, 175f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"ldr d14, [x24], #0x8\n"
"ldr d18, [x23], #0x8\n"
@@ -2727,7 +2726,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr d26, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
"tbz x8, #0, 180f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"ld1 { v14.s }[2], [x24]\n"
"ld1 { v18.s }[2], [x23]\n"
"ld1 { v22.s }[2], [x22]\n"
@@ -2737,7 +2736,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"175:" // Height 6: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 180f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"ldr s14, [x24, #0x0]\n"
"ldr s18, [x23, #0x0]\n"
"ldr s22, [x22, #0x0]\n"
@@ -2746,14 +2745,14 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 180f\n"
"176:" // Height 6: Partial accumulate: partial_4_0
"tbz x8, #2, 178f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
"ld1 { v28.4s }, [x20], #0x10\n"
"tbz x8, #1, 177f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
@@ -2761,7 +2760,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr d25, [x21], #0x8\n"
"ldr d29, [x20], #0x8\n"
"tbz x8, #0, 180f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"ld1 { v13.s }[2], [x24]\n"
"ld1 { v17.s }[2], [x23]\n"
"ld1 { v21.s }[2], [x22]\n"
@@ -2771,7 +2770,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"177:" // Height 6: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 180f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"ldr s13, [x24, #0x0]\n"
"ldr s17, [x23, #0x0]\n"
"ldr s21, [x22, #0x0]\n"
@@ -2780,7 +2779,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 180f\n"
"178:" // Height 6: Partial accumulate: partial_2_0
"tbz x8, #1, 179f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"ldr d12, [x24], #0x8\n"
"ldr d16, [x23], #0x8\n"
@@ -2788,7 +2787,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr d24, [x21], #0x8\n"
"ldr d28, [x20], #0x8\n"
"tbz x8, #0, 180f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"ld1 { v12.s }[2], [x24]\n"
"ld1 { v16.s }[2], [x23]\n"
"ld1 { v20.s }[2], [x22]\n"
@@ -2796,7 +2795,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ld1 { v28.s }[2], [x20]\n"
"b 180f\n"
"179:" // Height 6: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"ldr s12, [x24, #0x0]\n"
"ldr s16, [x23, #0x0]\n"
@@ -2804,13 +2803,13 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr s24, [x21, #0x0]\n"
"ldr s28, [x20, #0x0]\n"
"180:" // Height 6: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 183f\n"
"181:" // Height 6: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"ldr q12, [x24, #0x0]\n"
"ldr q13, [x24, #0x10]\n"
"ldr q14, [x24, #0x20]\n"
@@ -2861,8 +2860,8 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"mov x15, #0x0\n"
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2898,14 +2897,14 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr q3, [x10, #0x0]\n"
"ldr q4, [x9, #0x0]\n"
"ldr q5, [x28, #0x0]\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 188f\n"
"187:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x17, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
@@ -2913,151 +2912,151 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
"add x11, x11, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr d6, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x21\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr x21, [x17, #0x48]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
"add x28, x28, #0x10\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr d7, [x17, #0x30]\n"
+ "ldr d7, [x16, #0x30]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x27, [x13, #0x8]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x26, [x12, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr x27, [x13, #0x8]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x20, [x17, #0x58]\n"
+ "ldr x26, [x12, #0x8]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
"ldr x25, [x11, #0x8]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr d6, [x17, #0x40]\n"
+ "ldr d6, [x16, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x24, [x10, #0x8]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x23, [x9, #0x8]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x21\n"
+ "ldr x24, [x10, #0x8]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x21, [x17, #0x68]\n"
+ "ldr x23, [x9, #0x8]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
"ldr x22, [x28, #0x8]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr d7, [x17, #0x50]\n"
+ "ldr d7, [x16, #0x50]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "sub x14, x14, #0x10\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "cmp x14, #0x20\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "sub x14, x14, #0x10\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x20, [x17, #0x78]\n"
+ "cmp x14, #0x20\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr d6, [x17, #0x60]\n"
+ "ldr d6, [x16, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x21\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr x21, [x17, #0x88]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
"prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr d7, [x17, #0x70]\n"
+ "ldr d7, [x16, #0x70]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr x20, [x17, #0x98]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr d6, [x17, #0x80]\n"
+ "ldr d6, [x16, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x21\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr x21, [x17, #0xa8]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr d7, [x17, #0x90]\n"
+ "ldr d7, [x16, #0x90]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr x20, [x17, #0xb8]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr d6, [x17, #0xa0]\n"
+ "ldr d6, [x16, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x21\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr x21, [x17, #0xc8]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr d7, [x17, #0xb0]\n"
+ "ldr d7, [x16, #0xb0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr x20, [x17, #0xd8]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr d6, [x17, #0xc0]\n"
+ "ldr d6, [x16, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x21\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr x21, [x17, #0xe8]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr d7, [x17, #0xd0]\n"
+ "ldr d7, [x16, #0xd0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr x20, [x17, #0xf8]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr d6, [x17, #0xe0]\n"
+ "ldr d6, [x16, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x21\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "add x17, x17, #0x100\n"
+ "ldr d7, [x16, #0xf0]\n"
+ "mov v7.d[1], x20\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x21, [x17, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- "ldr x20, [x17, #0x18]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
@@ -3070,7 +3069,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr d4, [x9, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
"ldr d5, [x28, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr d7, [x16, #0x10]\n"
"mov v6.d[1], x21\n"
"mov v0.d[1], x27\n"
"mov v1.d[1], x26\n"
@@ -3092,7 +3091,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
"add x9, x9, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q6, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x28, x28, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -3104,7 +3103,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x17, #0x30]\n"
+ "ldr q7, [x16, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
"prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
@@ -3114,86 +3113,86 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x17, #0x40]\n"
+ "ldr q6, [x16, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x17, #0x50]\n"
+ "ldr q7, [x16, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x17, #0x60]\n"
+ "ldr q6, [x16, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x17, #0x70]\n"
+ "ldr q7, [x16, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x17, #0x80]\n"
+ "ldr q6, [x16, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x17, #0x90]\n"
+ "ldr q7, [x16, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x17, #0xa0]\n"
+ "ldr q6, [x16, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x17, #0xb0]\n"
+ "ldr q7, [x16, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x17, #0xc0]\n"
+ "ldr q6, [x16, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x17, #0xd0]\n"
+ "ldr q7, [x16, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x17, #0xe0]\n"
+ "ldr q6, [x16, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x17, #0xf0]\n"
+ "ldr q7, [x16, #0xf0]\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "add x17, x17, #0x100\n"
+ "add x16, x16, #0x100\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
@@ -3218,24 +3217,24 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr s4, [x10], #0x4\n"
"ldr s3, [x9], #0x4\n"
"ldr s2, [x28], #0x4\n"
- "ldr q1, [x17, #0x0]\n"
- "ldr q0, [x17, #0x10]\n"
+ "ldr q1, [x16, #0x0]\n"
".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
+ "ldr q0, [x16, #0x10]\n"
".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
- "ldr q1, [x17, #0x20]\n"
+ "ldr q1, [x16, #0x20]\n"
".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
- "ldr q0, [x17, #0x30]\n"
+ "ldr q0, [x16, #0x30]\n"
".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n"
@@ -3273,24 +3272,24 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"ldr b4, [x9, #0x0]\n"
"ldr b5, [x28, #0x0]\n"
"193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q7, [x17, #0x0]\n"
- "ldr q6, [x17, #0x10]\n"
+ "ldr q7, [x16, #0x0]\n"
".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x16, #0x10]\n"
".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x17, #0x20]\n"
+ "ldr q7, [x16, #0x20]\n"
".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x17, #0x30]\n"
+ "ldr q6, [x16, #0x30]\n"
".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n"
@@ -3308,22 +3307,22 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"cmp x15, x20\n"
"bne 184b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"add x21, x22, x20, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"add x20, x21, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
"prfm pstl1keep, [x20, #0x0]\n"
"bge 203f\n"
"tbz x8, #3, 198f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v13.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
@@ -3335,21 +3334,21 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"st1 { v28.4s }, [x20], #0x10\n"
"st1 { v29.4s }, [x20], #0x10\n"
"tbz x8, #2, 196f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"st1 { v14.4s }, [x24], #0x10\n"
"st1 { v18.4s }, [x23], #0x10\n"
"st1 { v22.4s }, [x22], #0x10\n"
"st1 { v26.4s }, [x21], #0x10\n"
"st1 { v30.4s }, [x20], #0x10\n"
"tbz x8, #1, 195f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"str d15, [x24], #0x8\n"
"str d19, [x23], #0x8\n"
"str d23, [x22], #0x8\n"
"str d27, [x21], #0x8\n"
"str d31, [x20], #0x8\n"
"tbz x8, #0, 202f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"st1 { v15.s }[2], [x24]\n"
"st1 { v19.s }[2], [x23]\n"
"st1 { v23.s }[2], [x22]\n"
@@ -3358,7 +3357,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 202f\n"
"195:" // Height 6: Partial direct writeback: partial_1_12
"tbz x8, #0, 202f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"str s15, [x24, #0x0]\n"
"str s19, [x23, #0x0]\n"
"str s23, [x22, #0x0]\n"
@@ -3367,14 +3366,14 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 202f\n"
"196:" // Height 6: Partial direct writeback: partial_2_8
"tbz x8, #1, 197f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"str d14, [x24], #0x8\n"
"str d18, [x23], #0x8\n"
"str d22, [x22], #0x8\n"
"str d26, [x21], #0x8\n"
"str d30, [x20], #0x8\n"
"tbz x8, #0, 202f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"st1 { v14.s }[2], [x24]\n"
"st1 { v18.s }[2], [x23]\n"
"st1 { v22.s }[2], [x22]\n"
@@ -3383,7 +3382,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 202f\n"
"197:" // Height 6: Partial direct writeback: partial_1_8
"tbz x8, #0, 202f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"str s14, [x24, #0x0]\n"
"str s18, [x23, #0x0]\n"
"str s22, [x22, #0x0]\n"
@@ -3392,21 +3391,21 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 202f\n"
"198:" // Height 6: Partial direct writeback: partial_4_0
"tbz x8, #2, 200f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
"st1 { v20.4s }, [x22], #0x10\n"
"st1 { v24.4s }, [x21], #0x10\n"
"st1 { v28.4s }, [x20], #0x10\n"
"tbz x8, #1, 199f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"str d13, [x24], #0x8\n"
"str d17, [x23], #0x8\n"
"str d21, [x22], #0x8\n"
"str d25, [x21], #0x8\n"
"str d29, [x20], #0x8\n"
"tbz x8, #0, 202f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"st1 { v13.s }[2], [x24]\n"
"st1 { v17.s }[2], [x23]\n"
"st1 { v21.s }[2], [x22]\n"
@@ -3415,7 +3414,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 202f\n"
"199:" // Height 6: Partial direct writeback: partial_1_4
"tbz x8, #0, 202f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"str s13, [x24, #0x0]\n"
"str s17, [x23, #0x0]\n"
"str s21, [x22, #0x0]\n"
@@ -3424,14 +3423,14 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"b 202f\n"
"200:" // Height 6: Partial direct writeback: partial_2_0
"tbz x8, #1, 201f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"str d12, [x24], #0x8\n"
"str d16, [x23], #0x8\n"
"str d20, [x22], #0x8\n"
"str d24, [x21], #0x8\n"
"str d28, [x20], #0x8\n"
"tbz x8, #0, 202f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"st1 { v12.s }[2], [x24]\n"
"st1 { v16.s }[2], [x23]\n"
"st1 { v20.s }[2], [x22]\n"
@@ -3439,7 +3438,7 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"st1 { v28.s }[2], [x20]\n"
"b 202f\n"
"201:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"str s12, [x24, #0x0]\n"
"str s16, [x23, #0x0]\n"
"str s20, [x22, #0x0]\n"
@@ -3448,11 +3447,11 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"202:" // Height 6: Partial direct writeback: Done
"b 204f\n"
"203:" // Height 6: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"str q12, [x24, #0x0]\n"
"str q13, [x24, #0x10]\n"
"str q14, [x24, #0x20]\n"
@@ -3488,8 +3487,8 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
index 0950d7d950..ba2f77f541 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
@@ -44,18 +44,18 @@ void a64_hybrid_s8s32_dot_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -88,7 +88,7 @@ void a64_hybrid_s8s32_dot_6x16 (
"beq 35f\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"tbz %x[flags], #0, 12f\n"
"cmp x11, #0x10\n"
@@ -163,8 +163,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"mov x28, #0x0\n"
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -188,10 +188,6 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr q17, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
"ldr q17, [x10, #0x40]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
@@ -216,21 +212,22 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr q17, [x10, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
"ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "add x10, x10, #0x100\n"
+ "ldr q6, [x10, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
"ldr q17, [x10, #0x40]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
@@ -255,26 +252,29 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr q17, [x10, #0xe0]\n"
".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
"ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
"19:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 24f\n"
"cmp x27, #0x4\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Odd block loop
"ldr s18, [x26], #0x4\n"
- "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
"sub x27, x27, #0x4\n"
"ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x4\n"
- ".inst 0x4f92e228 // sdot v8.4s, v17.16b, v18.4b[0]\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
+ "cmp x27, #0x4\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
+ "add x10, x10, #0x40\n"
"bge 20b\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
"cbz x27, 24f\n"
@@ -289,12 +289,12 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr q17, [x10, #0x0]\n"
"ldr q16, [x10, #0x10]\n"
".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "add x10, x10, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -362,7 +362,7 @@ void a64_hybrid_s8s32_dot_6x16 (
"35:" // Height 2
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"36:" // Height 2: Column loop
"tbz %x[flags], #0, 46f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -463,8 +463,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"mov x28, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -496,22 +496,22 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
"ldr q17, [x10, #0x40]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"ldr q16, [x10, #0x50]\n"
+ "cmp x27, #0x20\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
"ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
"ldr q16, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
"ldr q17, [x10, #0x80]\n"
@@ -555,18 +555,18 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
"add x25, x25, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
"ldr q17, [x10, #0x40]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"ldr q16, [x10, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
"ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
"ldr q16, [x10, #0x70]\n"
@@ -607,18 +607,18 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr s19, [x26], #0x4\n"
"ldr s18, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr q17, [x10, #0x0]\n"
"ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x4\n"
".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
"bge 54b\n"
@@ -643,9 +643,9 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"58:" // Height 2: Multiply loop: No odd multiplies
@@ -654,9 +654,9 @@ void a64_hybrid_s8s32_dot_6x16 (
"cmp x28, x20\n"
"bne 48b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
"cmp x11, #0x10\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x24, x9, x20, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
"bge 67f\n"
"tbz x11, #3, 62f\n"
@@ -738,12 +738,12 @@ void a64_hybrid_s8s32_dot_6x16 (
"69:" // Height 3
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"70:" // Height 3: Column loop
"tbz %x[flags], #0, 80f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x23, x24, x20, LSL #2\n"
"bge 79f\n"
"tbz x11, #3, 74f\n"
@@ -864,8 +864,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"mov x28, #0x0\n"
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -905,18 +905,18 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"ldr q20, [x10, #0x30]\n"
"add x24, x24, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
"ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
"ldr q20, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
@@ -983,14 +983,14 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"ldr q20, [x10, #0x30]\n"
"sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
"ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
"ldr q20, [x10, #0x50]\n"
@@ -1049,12 +1049,12 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr s24, [x26], #0x4\n"
"ldr s23, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s22, [x24], #0x4\n"
"ldr q21, [x10, #0x0]\n"
- "cmp x27, #0x4\n"
- "ldr q20, [x10, #0x10]\n"
".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
+ "ldr q20, [x10, #0x10]\n"
".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
"ldr q21, [x10, #0x20]\n"
".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
@@ -1108,11 +1108,11 @@ void a64_hybrid_s8s32_dot_6x16 (
"cmp x28, x20\n"
"bne 82b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"bge 101f\n"
"tbz x11, #3, 96f\n"
@@ -1214,13 +1214,13 @@ void a64_hybrid_s8s32_dot_6x16 (
"103:" // Height 4
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"104:" // Height 4: Column loop
"tbz %x[flags], #0, 114f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x22, x23, x20, LSL #2\n"
"bge 113f\n"
"tbz x11, #3, 108f\n"
@@ -1365,8 +1365,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"mov x28, #0x0\n"
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1506,14 +1506,14 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
"ldr q24, [x10, #0x30]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
@@ -1591,9 +1591,9 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr s29, [x26], #0x4\n"
"ldr s28, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s27, [x24], #0x4\n"
"ldr s26, [x23], #0x4\n"
- "cmp x27, #0x4\n"
"ldr q25, [x10, #0x0]\n"
"ldr q24, [x10, #0x10]\n"
".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
@@ -1662,13 +1662,13 @@ void a64_hybrid_s8s32_dot_6x16 (
"cmp x28, x20\n"
"bne 116b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
"bge 135f\n"
"tbz x11, #3, 130f\n"
@@ -1790,14 +1790,14 @@ void a64_hybrid_s8s32_dot_6x16 (
"137:" // Height 5
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"138:" // Height 5: Column loop
"tbz %x[flags], #0, 148f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x21, x22, x20, LSL #2\n"
"bge 147f\n"
"tbz x11, #3, 142f\n"
@@ -1966,8 +1966,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"mov x28, #0x0\n"
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2133,12 +2133,12 @@ void a64_hybrid_s8s32_dot_6x16 (
"add x22, x22, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "sub x27, x27, #0x10\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
"ldr q28, [x10, #0x30]\n"
- "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
@@ -2233,14 +2233,14 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr s2, [x26], #0x4\n"
"ldr s1, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s0, [x24], #0x4\n"
"ldr s31, [x23], #0x4\n"
- "cmp x27, #0x4\n"
"ldr s30, [x22], #0x4\n"
"ldr q29, [x10, #0x0]\n"
- "ldr q28, [x10, #0x10]\n"
".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ "ldr q28, [x10, #0x10]\n"
".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
@@ -2316,15 +2316,15 @@ void a64_hybrid_s8s32_dot_6x16 (
"cmp x28, x20\n"
"bne 150b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
"bge 169f\n"
"tbz x11, #3, 164f\n"
@@ -2465,20 +2465,19 @@ void a64_hybrid_s8s32_dot_6x16 (
"b 206f\n"
"171:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"172:" // Height 6: Column loop
"tbz %x[flags], #0, 182f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x20, x21, x20, LSL #2\n"
"bge 181f\n"
"tbz x11, #3, 176f\n"
@@ -2671,8 +2670,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"mov x28, #0x0\n"
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2862,18 +2861,18 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
"ldr q7, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -2979,9 +2978,9 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr s7, [x26], #0x4\n"
"ldr s6, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s5, [x24], #0x4\n"
"ldr s4, [x23], #0x4\n"
- "cmp x27, #0x4\n"
"ldr s3, [x22], #0x4\n"
"ldr s2, [x21], #0x4\n"
"ldr q1, [x10, #0x0]\n"
@@ -3074,16 +3073,16 @@ void a64_hybrid_s8s32_dot_6x16 (
"cmp x28, x20\n"
"bne 184b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"add x20, x21, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
"prfm pstl1keep, [x20, #0x0]\n"
"bge 203f\n"
@@ -3254,8 +3253,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
index c5170553d8..0f7481f0da 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
@@ -70,7 +70,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 8> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 8> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
index c6e982e20d..98b4d9b997 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
@@ -44,18 +44,18 @@ void a64_hybrid_s8s32_mmla_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -88,7 +88,7 @@ void a64_hybrid_s8s32_mmla_6x16 (
"beq 38f\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"tbz %x[flags], #0, 13f\n"
"cmp x11, #0x10\n"
@@ -176,8 +176,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
"mov x28, #0x0\n"
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -197,12 +197,7 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 19f\n"
"18:" // Height 1: Multiply loop: Main loop head
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"trn1 v19.2d, v1.2d, v20.2d\n"
- "trn2 v1.2d, v1.2d, v20.2d\n"
".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n"
@@ -215,6 +210,7 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
"ldr q18, [x10, #0x80]\n"
".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
@@ -231,38 +227,39 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "cmp x27, #0x20\n"
".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
- "ldr q7, [x10, #0x0]\n"
".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
"ldr q1, [x26, #0x0]\n"
+ "add x10, x10, #0x100\n"
+ "ldr q7, [x10, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "trn1 v19.2d, v1.2d, v17.2d\n"
- "trn2 v1.2d, v1.2d, v17.2d\n"
- ".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n"
+ "trn1 v20.2d, v1.2d, v21.2d\n"
+ ".inst 0x4e87a688 // smmla v8.4s, v20.16b, v7.16b\n"
"ldr q18, [x10, #0x20]\n"
- ".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n"
+ ".inst 0x4e86a68c // smmla v12.4s, v20.16b, v6.16b\n"
"ldr q17, [x10, #0x30]\n"
- ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e92a689 // smmla v9.4s, v20.16b, v18.16b\n"
"ldr q18, [x10, #0x40]\n"
- ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ ".inst 0x4e91a68d // smmla v13.4s, v20.16b, v17.16b\n"
"ldr q17, [x10, #0x50]\n"
- ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
- "ldr q20, [x10, #0x60]\n"
- ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ ".inst 0x4e92a68a // smmla v10.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x60]\n"
+ ".inst 0x4e91a68e // smmla v14.4s, v20.16b, v17.16b\n"
"ldr q18, [x10, #0x70]\n"
- ".inst 0x4e94a66b // smmla v11.4s, v19.16b, v20.16b\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x4e93a68b // smmla v11.4s, v20.16b, v19.16b\n"
"ldr q17, [x10, #0x80]\n"
- ".inst 0x4e92a66f // smmla v15.4s, v19.16b, v18.16b\n"
- "ldr q20, [x10, #0x90]\n"
+ ".inst 0x4e92a68f // smmla v15.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x90]\n"
".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
"ldr q18, [x10, #0xa0]\n"
- ".inst 0x4e94a42c // smmla v12.4s, v1.16b, v20.16b\n"
+ ".inst 0x4e93a42c // smmla v12.4s, v1.16b, v19.16b\n"
"ldr q17, [x10, #0xb0]\n"
".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n"
"ldr q18, [x10, #0xc0]\n"
@@ -272,21 +269,22 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
"20:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 27f\n"
"cmp x27, #0x8\n"
"blt 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
"ldr d19, [x26], #0x8\n"
- "ldr q20, [x10, #0x0]\n"
- "sub x27, x27, #0x8\n"
+ "ldr q18, [x10, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
"ldr q17, [x10, #0x10]\n"
- "cmp x27, #0x8\n"
- "trn1 v19.2d, v19.2d, v18.2d\n"
- ".inst 0x4e94a668 // smmla v8.4s, v19.16b, v20.16b\n"
+ ".inst 0x4e92a668 // smmla v8.4s, v19.16b, v18.16b\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x4e91a66c // smmla v12.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x30]\n"
@@ -298,9 +296,11 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
"bge 21b\n"
"22:" // Height 1: Multiply loop: Skip odd blocks
"cbz x27, 27f\n"
@@ -324,24 +324,24 @@ void a64_hybrid_s8s32_mmla_6x16 (
"25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b1, [x26, #0x0]\n"
"26:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q24, [x10, #0x0]\n"
- "ldr q20, [x10, #0x10]\n"
+ "ldr q23, [x10, #0x0]\n"
+ "ldr q18, [x10, #0x10]\n"
"trn1 v19.2d, v1.2d, v17.2d\n"
- ".inst 0x4e98a668 // smmla v8.4s, v19.16b, v24.16b\n"
+ ".inst 0x4e97a668 // smmla v8.4s, v19.16b, v23.16b\n"
"ldr q17, [x10, #0x20]\n"
- ".inst 0x4e94a66c // smmla v12.4s, v19.16b, v20.16b\n"
- "ldr q0, [x10, #0x30]\n"
+ ".inst 0x4e92a66c // smmla v12.4s, v19.16b, v18.16b\n"
+ "ldr q31, [x10, #0x30]\n"
".inst 0x4e91a669 // smmla v9.4s, v19.16b, v17.16b\n"
"ldr q20, [x10, #0x40]\n"
- ".inst 0x4e80a66d // smmla v13.4s, v19.16b, v0.16b\n"
+ ".inst 0x4e9fa66d // smmla v13.4s, v19.16b, v31.16b\n"
"ldr q17, [x10, #0x50]\n"
".inst 0x4e94a66a // smmla v10.4s, v19.16b, v20.16b\n"
"ldr q18, [x10, #0x60]\n"
".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
"27:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -413,7 +413,7 @@ void a64_hybrid_s8s32_mmla_6x16 (
"38:" // Height 2
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"39:" // Height 2: Column loop
"tbz %x[flags], #0, 50f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -523,8 +523,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
"mov x28, #0x0\n"
"52:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 53f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -549,14 +549,6 @@ void a64_hybrid_s8s32_mmla_6x16 (
"blt 56f\n"
"55:" // Height 2: Multiply loop: Main loop head
"trn1 v19.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q2, [x25, #0x0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n"
@@ -569,6 +561,7 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
"ldr q18, [x10, #0x80]\n"
".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
@@ -585,21 +578,22 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
+ "add x10, x10, #0x100\n"
"ldr q7, [x10, #0x0]\n"
".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
"ldr q1, [x26, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"bge 55b\n"
"56:" // Height 2: Multiply loop: Single iteration only
"trn1 v19.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n"
@@ -612,6 +606,7 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
"ldr q18, [x10, #0x80]\n"
".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
@@ -628,36 +623,41 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x10, x10, #0x100\n"
"57:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 64f\n"
"cmp x27, #0x8\n"
"blt 59f\n"
"58:" // Height 2: Multiply loop: Odd block loop
- "ldr d20, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
"sub x27, x27, #0x8\n"
- "ldr q18, [x10, #0x0]\n"
- "ldr q17, [x10, #0x10]\n"
- "cmp x27, #0x8\n"
- "trn1 v22.2d, v20.2d, v19.2d\n"
- ".inst 0x4e92a6c8 // smmla v8.4s, v22.16b, v18.16b\n"
- "ldr q2, [x10, #0x20]\n"
- ".inst 0x4e91a6cc // smmla v12.4s, v22.16b, v17.16b\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x10]\n"
+ ".inst 0x4e91a668 // smmla v8.4s, v19.16b, v17.16b\n"
+ ".inst 0x4e96a66c // smmla v12.4s, v19.16b, v22.16b\n"
+ "ldr q1, [x10, #0x20]\n"
"ldr q17, [x10, #0x30]\n"
- ".inst 0x4e82a6c9 // smmla v9.4s, v22.16b, v2.16b\n"
+ ".inst 0x4e81a669 // smmla v9.4s, v19.16b, v1.16b\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
"ldr q18, [x10, #0x40]\n"
- ".inst 0x4e91a6cd // smmla v13.4s, v22.16b, v17.16b\n"
"ldr q17, [x10, #0x50]\n"
- ".inst 0x4e92a6ca // smmla v10.4s, v22.16b, v18.16b\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
"ldr q18, [x10, #0x60]\n"
- ".inst 0x4e91a6ce // smmla v14.4s, v22.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
"add x10, x10, #0x80\n"
- ".inst 0x4e92a6cb // smmla v11.4s, v22.16b, v18.16b\n"
- ".inst 0x4e91a6cf // smmla v15.4s, v22.16b, v17.16b\n"
"bge 58b\n"
"59:" // Height 2: Multiply loop: Skip odd blocks
"cbz x27, 64f\n"
@@ -703,27 +703,27 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
"64:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 52b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
"cmp x11, #0x10\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
- "uzp2 v9.2d, v9.2d, v13.2d\n"
"prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"bge 73f\n"
"tbz x11, #3, 68f\n"
"st1 { v7.4s }, [x9], #0x10\n"
@@ -804,12 +804,12 @@ void a64_hybrid_s8s32_mmla_6x16 (
"75:" // Height 3
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"76:" // Height 3: Column loop
"tbz %x[flags], #0, 87f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x23, x24, x20, LSL #2\n"
"bge 85f\n"
"tbz x11, #3, 80f\n"
@@ -951,8 +951,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
"mov x28, #0x0\n"
"89:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 90f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -982,38 +982,35 @@ void a64_hybrid_s8s32_mmla_6x16 (
"92:" // Height 3: Multiply loop: Main loop head
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "trn2 v3.2d, v3.2d, v25.2d\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
@@ -1021,12 +1018,15 @@ void a64_hybrid_s8s32_mmla_6x16 (
".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xc0]\n"
".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xd0]\n"
".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n"
@@ -1048,43 +1048,43 @@ void a64_hybrid_s8s32_mmla_6x16 (
"93:" // Height 3: Multiply loop: Single iteration only
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
- "trn2 v3.2d, v3.2d, v25.2d\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x90]\n"
".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
@@ -1109,25 +1109,25 @@ void a64_hybrid_s8s32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 96f\n"
"95:" // Height 3: Multiply loop: Odd block loop
- "ldr d30, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "sub x27, x27, #0x8\n"
- "ldr d27, [x24], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
"ldr q26, [x10, #0x0]\n"
- "cmp x27, #0x8\n"
- "ldr q25, [x10, #0x10]\n"
- "trn1 v28.2d, v30.2d, v28.2d\n"
- "trn1 v27.2d, v27.2d, v29.2d\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
+ "ldr q25, [x10, #0x10]\n"
".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x20]\n"
".inst 0x4e99a78c // smmla v12.4s, v28.16b, v25.16b\n"
".inst 0x4e99a774 // smmla v20.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x8\n"
".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ "cmp x27, #0x8\n"
".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
@@ -1136,8 +1136,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ "add x10, x10, #0x80\n"
".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
@@ -1183,9 +1183,9 @@ void a64_hybrid_s8s32_mmla_6x16 (
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9da78c // smmla v12.4s, v28.16b, v29.16b\n"
".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e9da78c // smmla v12.4s, v28.16b, v29.16b\n"
".inst 0x4e9da774 // smmla v20.4s, v27.16b, v29.16b\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
@@ -1211,20 +1211,20 @@ void a64_hybrid_s8s32_mmla_6x16 (
"cmp x28, x20\n"
"bne 89b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "cmp x11, #0x10\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
- "uzp2 v9.2d, v9.2d, v13.2d\n"
"prfm pstl1keep, [x9, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v16.2d, v16.2d, v20.2d\n"
"uzp1 v17.2d, v17.2d, v21.2d\n"
"uzp1 v18.2d, v18.2d, v22.2d\n"
@@ -1329,13 +1329,13 @@ void a64_hybrid_s8s32_mmla_6x16 (
"112:" // Height 4
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"113:" // Height 4: Column loop
"tbz %x[flags], #0, 124f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x22, x23, x20, LSL #2\n"
"bge 122f\n"
"tbz x11, #3, 117f\n"
@@ -1497,8 +1497,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
"mov x28, #0x0\n"
"126:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 127f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1532,38 +1532,33 @@ void a64_hybrid_s8s32_mmla_6x16 (
"129:" // Height 4: Multiply loop: Main loop head
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
"trn1 v27.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "ldr q4, [x23, #0x0]\n"
- ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
"ldr q25, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ "add x23, x23, #0x10\n"
+ "ldr q4, [x23, #0x0]\n"
".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
@@ -1574,18 +1569,23 @@ void a64_hybrid_s8s32_mmla_6x16 (
".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xc0]\n"
".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xd0]\n"
".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xe0]\n"
".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
@@ -1601,48 +1601,48 @@ void a64_hybrid_s8s32_mmla_6x16 (
"130:" // Height 4: Multiply loop: Single iteration only
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"trn1 v27.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x27, x27, #0x10\n"
- ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x90]\n"
".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xc0]\n"
".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
@@ -1664,16 +1664,16 @@ void a64_hybrid_s8s32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 133f\n"
"132:" // Height 4: Multiply loop: Odd block loop
- "ldr d30, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
"cmp x27, #0x8\n"
"ldr q26, [x10, #0x0]\n"
"ldr q25, [x10, #0x10]\n"
- "trn1 v28.2d, v30.2d, v28.2d\n"
- "trn1 v27.2d, v29.2d, v27.2d\n"
".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x20]\n"
@@ -1774,24 +1774,24 @@ void a64_hybrid_s8s32_mmla_6x16 (
"cmp x28, x20\n"
"bne 126b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
@@ -1918,14 +1918,14 @@ void a64_hybrid_s8s32_mmla_6x16 (
"149:" // Height 5
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"150:" // Height 5: Column loop
"tbz %x[flags], #0, 161f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x21, x22, x20, LSL #2\n"
"bge 159f\n"
"tbz x11, #3, 154f\n"
@@ -2123,8 +2123,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
"mov x28, #0x0\n"
"163:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 164f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2161,51 +2161,51 @@ void a64_hybrid_s8s32_mmla_6x16 (
"166:" // Height 5: Multiply loop: Main loop head
"trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
+ ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
"trn1 v4.2d, v5.2d, v0.2d\n"
"trn2 v5.2d, v5.2d, v0.2d\n"
"ldr q0, [x10, #0x10]\n"
- ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "add x22, x22, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n"
".inst 0x4e80a454 // smmla v20.4s, v2.16b, v0.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x4e80a49c // smmla v28.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x30]\n"
".inst 0x4e87a4c9 // smmla v9.4s, v6.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n"
".inst 0x4e80a455 // smmla v21.4s, v2.16b, v0.16b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x50]\n"
".inst 0x4e87a4ca // smmla v10.4s, v6.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n"
".inst 0x4e80a456 // smmla v22.4s, v2.16b, v0.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e80a49e // smmla v30.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x70]\n"
".inst 0x4e87a4cb // smmla v11.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n"
".inst 0x4e80a457 // smmla v23.4s, v2.16b, v0.16b\n"
"ldr q2, [x25, #0x0]\n"
@@ -2251,47 +2251,47 @@ void a64_hybrid_s8s32_mmla_6x16 (
"167:" // Height 5: Multiply loop: Single iteration only
"trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
+ ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
"trn1 v4.2d, v5.2d, v0.2d\n"
"trn2 v5.2d, v5.2d, v0.2d\n"
"ldr q0, [x10, #0x10]\n"
- ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n"
".inst 0x4e80a454 // smmla v20.4s, v2.16b, v0.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e80a49c // smmla v28.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x30]\n"
".inst 0x4e87a4c9 // smmla v9.4s, v6.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n"
".inst 0x4e80a455 // smmla v21.4s, v2.16b, v0.16b\n"
+ "add x22, x22, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x50]\n"
".inst 0x4e87a4ca // smmla v10.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n"
".inst 0x4e80a456 // smmla v22.4s, v2.16b, v0.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e80a49e // smmla v30.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x70]\n"
".inst 0x4e87a4cb // smmla v11.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
@@ -2335,24 +2335,24 @@ void a64_hybrid_s8s32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 170f\n"
"169:" // Height 5: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "cmp x27, #0x8\n"
"ldr d0, [x22], #0x8\n"
"ldr q1, [x10, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v3.2d, v2.2d\n"
- "trn1 v2.2d, v0.2d, v5.2d\n"
- "ldr q0, [x10, #0x10]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
+ "ldr q0, [x10, #0x10]\n"
".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
"ldr q1, [x10, #0x20]\n"
".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ "cmp x27, #0x8\n"
".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
"ldr q0, [x10, #0x30]\n"
".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
@@ -2371,8 +2371,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
"ldr q0, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x4e86a48b // smmla v11.4s, v4.16b, v6.16b\n"
+ "add x10, x10, #0x80\n"
".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
".inst 0x4e86a45b // smmla v27.4s, v2.16b, v6.16b\n"
".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n"
@@ -2471,28 +2471,28 @@ void a64_hybrid_s8s32_mmla_6x16 (
"cmp x28, x20\n"
"bne 163b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "cmp x11, #0x10\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
- "uzp2 v11.2d, v11.2d, v15.2d\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
@@ -2640,20 +2640,19 @@ void a64_hybrid_s8s32_mmla_6x16 (
"b 224f\n"
"186:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"187:" // Height 6: Column loop
"tbz %x[flags], #0, 198f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x20, x21, x20, LSL #2\n"
"bge 196f\n"
"tbz x11, #3, 191f\n"
@@ -2871,8 +2870,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
"mov x28, #0x0\n"
"200:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 201f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2913,56 +2912,56 @@ void a64_hybrid_s8s32_mmla_6x16 (
"203:" // Height 6: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
"ldr q6, [x10, #0x10]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x30]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x50]\n"
".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x70]\n"
".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
"ldr q2, [x25, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
"ldr q0, [x10, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
@@ -3006,52 +3005,52 @@ void a64_hybrid_s8s32_mmla_6x16 (
"204:" // Height 6: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
"ldr q6, [x10, #0x10]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x30]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x23, x23, #0x10\n"
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x50]\n"
".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x70]\n"
".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
@@ -3092,18 +3091,18 @@ void a64_hybrid_s8s32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 207f\n"
"206:" // Height 6: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d5, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"cmp x27, #0x8\n"
- "ldr d2, [x22], #0x8\n"
+ "ldr d1, [x22], #0x8\n"
"ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
"ldr q1, [x10, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v5.2d, v3.2d\n"
- "trn1 v2.2d, v2.2d, v0.2d\n"
"ldr q0, [x10, #0x10]\n"
".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
@@ -3197,9 +3196,9 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q0, [x10, #0x0]\n"
"trn1 v7.2d, v1.2d, v2.2d\n"
"trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e80a4e8 // smmla v8.4s, v7.16b, v0.16b\n"
"trn1 v2.2d, v5.2d, v6.2d\n"
"ldr q1, [x10, #0x10]\n"
- ".inst 0x4e80a4e8 // smmla v8.4s, v7.16b, v0.16b\n"
".inst 0x4e80a470 // smmla v16.4s, v3.16b, v0.16b\n"
".inst 0x4e80a458 // smmla v24.4s, v2.16b, v0.16b\n"
"ldr q0, [x10, #0x20]\n"
@@ -3223,8 +3222,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
".inst 0x4e81a476 // smmla v22.4s, v3.16b, v1.16b\n"
".inst 0x4e81a45e // smmla v30.4s, v2.16b, v1.16b\n"
"ldr q6, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
+ "add x10, x10, #0x80\n"
".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n"
".inst 0x4e80a45b // smmla v27.4s, v2.16b, v0.16b\n"
".inst 0x4e86a4ef // smmla v15.4s, v7.16b, v6.16b\n"
@@ -3236,32 +3235,32 @@ void a64_hybrid_s8s32_mmla_6x16 (
"cmp x28, x20\n"
"bne 200b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x20, x21, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
- "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x21, x22, x20, LSL #2\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
- "uzp2 v16.2d, v16.2d, v20.2d\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x20, x21, x20, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"uzp1 v23.2d, v24.2d, v28.2d\n"
@@ -3440,8 +3439,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"224:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index b81e2e8593..926408855d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -71,7 +71,7 @@ public:
return false;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 16, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
index 90b196735a..b2cec742c4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
@@ -45,18 +45,18 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -84,133 +84,133 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"cmp %x[M], #0x2\n"
"bgt 61f\n"
"beq 31f\n"
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v15.16b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"3:" // Height 1: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 6f\n"
+ "ldr x9, [x20, #0x0]\n"
+ "cbnz x11, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
+ "mov x9, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 11f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d21, [x14, #0x70]\n"
- "ldr x20, [x14, #0x78]\n"
+ "ldr d21, [x12, #0x70]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr d20, [x14, #0x80]\n"
+ "ldr d20, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr d26, [x14, #0x90]\n"
+ "ldr d26, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr d25, [x14, #0xa0]\n"
+ "ldr d25, [x12, #0xa0]\n"
"mov v21.d[1], x20\n"
- "ldr x20, [x14, #0x88]\n"
+ "ldr x20, [x12, #0x88]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr d24, [x14, #0xb0]\n"
+ "ldr d24, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr d23, [x14, #0xc0]\n"
+ "ldr d23, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr d22, [x14, #0xd0]\n"
+ "ldr d22, [x12, #0xd0]\n"
".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n"
- "ldr d21, [x14, #0xe0]\n"
+ "ldr d21, [x12, #0xe0]\n"
"mov v20.d[1], x20\n"
- "ldr x22, [x14, #0x98]\n"
- "add x10, x10, #0x10\n"
- "ldr x21, [x14, #0xa8]\n"
+ "ldr x20, [x12, #0x98]\n"
+ "mov v26.d[1], x20\n"
+ "ldr x20, [x12, #0xa8]\n"
+ "mov v25.d[1], x20\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "mov v24.d[1], x20\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
- "ldr d20, [x14, #0xf0]\n"
- "ldr x20, [x14, #0xb8]\n"
- "mov v26.d[1], x22\n"
- "mov v25.d[1], x21\n"
- "ldr x23, [x14, #0xc8]\n"
- "ldr x22, [x14, #0xd8]\n"
+ "ldr d20, [x12, #0xf0]\n"
".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
- "mov v24.d[1], x20\n"
- "ldr x21, [x14, #0xe8]\n"
- "ldr x20, [x14, #0xf8]\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
+ "ldr x20, [x12, #0xf8]\n"
"mov v23.d[1], x23\n"
"mov v22.d[1], x22\n"
- "add x14, x14, #0x100\n"
+ "add x9, x9, #0x10\n"
"mov v21.d[1], x21\n"
- ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
+ "add x12, x12, #0x100\n"
"mov v20.d[1], x20\n"
+ ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q4, [x14, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q4, [x12, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q21, [x14, #0x70]\n"
+ "ldr q21, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q20, [x14, #0x80]\n"
+ "ldr q20, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q26, [x14, #0x90]\n"
+ "ldr q26, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q25, [x14, #0xa0]\n"
+ "ldr q25, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q24, [x14, #0xb0]\n"
+ "ldr q24, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q23, [x14, #0xc0]\n"
+ "ldr q23, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q22, [x14, #0xd0]\n"
+ "ldr q22, [x12, #0xd0]\n"
".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n"
- "ldr q21, [x14, #0xe0]\n"
+ "ldr q21, [x12, #0xe0]\n"
".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
- "ldr q20, [x14, #0xf0]\n"
+ "ldr q20, [x12, #0xf0]\n"
".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
@@ -218,54 +218,54 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"tbnz %x[flags], #31, 10f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"10:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"11:" // Height 1: Multiply loop: Main loop skip
- "cbz x11, 18f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 18f\n"
+ "cmp x10, #0x4\n"
"blt 14f\n"
"12:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
"tbnz %x[flags], #31, 13f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q23, [x14, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q22, [x14, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q21, [x14, #0x20]\n"
- "ldr q20, [x14, #0x30]\n"
- ".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q22, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q21, [x12, #0x20]\n"
+ ".inst 0x6f80e290 // udot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
- "cbz x11, 18f\n"
- "tbz x11, #1, 15f\n"
- "ldr h0, [x10], #0x2\n"
- "tbz x11, #0, 16f\n"
- "ld1 { v0.b }[2], [x10]\n"
+ "cbz x10, 18f\n"
+ "tbz x10, #1, 15f\n"
+ "ldr h0, [x9], #0x2\n"
+ "tbz x10, #0, 16f\n"
+ "ld1 { v0.b }[2], [x9]\n"
"b 16f\n"
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
"16:" // Height 1: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 17f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q23, [x14, #0x0]\n"
- "ldr q22, [x14, #0x10]\n"
- "ldr q21, [x14, #0x20]\n"
- "ldr q20, [x14, #0x30]\n"
- ".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n"
- ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
- ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x0]\n"
+ ".inst 0x6f80e290 // udot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x10]\n"
+ ".inst 0x6f80e291 // udot v17.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x20]\n"
+ ".inst 0x6f80e292 // udot v18.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
"18:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 4b\n"
"prfm pstl1keep, [x13, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
@@ -276,28 +276,28 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"addp v11.4s, v11.4s, v11.4s\n"
"mul v11.4s, v11.4s, v20.4s\n"
"19:" // Height 1: skip row sum fixup
- "ldr q24, [x16, #0x0]\n"
+ "ldr q23, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q23, [x16, #0x10]\n"
+ "ldr q22, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q22, [x16, #0x20]\n"
+ "ldr q21, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q21, [x16, #0x30]\n"
+ "ldr q20, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
+ "add v16.4s, v16.4s, v23.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v20.4s }, [x20]\n"
- "add v16.4s, v16.4s, v24.4s\n"
- "add v17.4s, v17.4s, v23.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"ld1r { v0.4s }, [x20]\n"
- "add v18.4s, v18.4s, v22.4s\n"
- "add v19.4s, v19.4s, v21.4s\n"
- "add x16, x16, #0x40\n"
"sqrdmulh v16.4s, v16.4s, v20.4s\n"
"sqrdmulh v17.4s, v17.4s, v20.4s\n"
"sqrdmulh v18.4s, v18.4s, v20.4s\n"
"sqrdmulh v19.4s, v19.4s, v20.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 20f\n"
"and v23.16b, v16.16b, v0.16b\n"
"and v22.16b, v17.16b, v0.16b\n"
@@ -317,67 +317,67 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v22.4s }, [x20]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v20.4s\n"
+ "add v17.4s, v17.4s, v20.4s\n"
+ "add v18.4s, v18.4s, v20.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v21.4s }, [x21]\n"
- "cmp x15, #0x10\n"
"ld1r { v20.4s }, [x20]\n"
- "add v16.4s, v16.4s, v22.4s\n"
- "add v17.4s, v17.4s, v22.4s\n"
- "add v18.4s, v18.4s, v22.4s\n"
- "add v19.4s, v19.4s, v22.4s\n"
- "smin v16.4s, v16.4s, v21.4s\n"
- "smin v17.4s, v17.4s, v21.4s\n"
- "smin v18.4s, v18.4s, v21.4s\n"
- "smin v19.4s, v19.4s, v21.4s\n"
"smax v16.4s, v16.4s, v20.4s\n"
"smax v17.4s, v17.4s, v20.4s\n"
"smax v18.4s, v18.4s, v20.4s\n"
"smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 29f\n"
- "tbz x15, #3, 24f\n"
+ "tbz x14, #3, 24f\n"
"str d16, [x13], #0x8\n"
- "tbz x15, #2, 22f\n"
+ "tbz x14, #2, 22f\n"
"st1 { v16.s }[2], [x13], #0x4\n"
- "tbz x15, #1, 21f\n"
+ "tbz x14, #1, 21f\n"
"st1 { v16.h }[6], [x13], #0x2\n"
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[14], [x13]\n"
"b 28f\n"
"21:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[12], [x13]\n"
"b 28f\n"
"22:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x15, #1, 23f\n"
+ "tbz x14, #1, 23f\n"
"st1 { v16.h }[4], [x13], #0x2\n"
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[10], [x13]\n"
"b 28f\n"
"23:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[8], [x13]\n"
"b 28f\n"
"24:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x15, #2, 26f\n"
+ "tbz x14, #2, 26f\n"
"str s16, [x13], #0x4\n"
- "tbz x15, #1, 25f\n"
+ "tbz x14, #1, 25f\n"
"st1 { v16.h }[2], [x13], #0x2\n"
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[6], [x13]\n"
"b 28f\n"
"25:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[4], [x13]\n"
"b 28f\n"
"26:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x15, #1, 27f\n"
+ "tbz x14, #1, 27f\n"
"str h16, [x13], #0x2\n"
- "tbz x15, #0, 28f\n"
+ "tbz x14, #0, 28f\n"
"st1 { v16.b }[2], [x13]\n"
"b 28f\n"
"27:" // Height 1: Partial direct writeback: partial_1_0
@@ -388,18 +388,18 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"str q16, [x13, #0x0]\n"
"add x13, x13, #0x10\n"
"30:" // Height 1: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 2b\n"
"b 122f\n"
"31:" // Height 2
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v15.16b, #0x1\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -410,80 +410,80 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"33:" // Height 2: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
"tbz %x[flags], #3, 35f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "cbnz x12, 36f\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x11, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
"add x9, x9, x20\n"
+ "add x28, x28, x20\n"
"b 36f\n"
"35:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x9, x10, x21\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
"36:" // Height 2: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 41f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 39f\n"
"37:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr d25, [x14, #0x70]\n"
+ "ldr d25, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr x23, [x14, #0x88]\n"
+ "mov v25.d[1], x20\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr d24, [x14, #0x80]\n"
+ "ldr d24, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "mov v25.d[1], x20\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr d30, [x14, #0x90]\n"
+ "ldr d30, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr x22, [x14, #0x98]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr d29, [x14, #0xa0]\n"
- "ldr x21, [x14, #0xa8]\n"
+ "ldr d29, [x12, #0xa0]\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr d28, [x14, #0xb0]\n"
- "ldr x20, [x14, #0xb8]\n"
+ "ldr d28, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr d27, [x14, #0xc0]\n"
+ "ldr d27, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
"mov v24.d[1], x23\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr d26, [x14, #0xd0]\n"
+ "ldr d26, [x12, #0xd0]\n"
".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n"
"mov v30.d[1], x22\n"
".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n"
- "ldr d25, [x14, #0xe0]\n"
+ "ldr d25, [x12, #0xe0]\n"
"mov v29.d[1], x21\n"
- "ldr x23, [x14, #0xc8]\n"
+ "ldr x23, [x12, #0xc8]\n"
"mov v28.d[1], x20\n"
- "ldr x22, [x14, #0xd8]\n"
- "ldr x21, [x14, #0xe8]\n"
+ "ldr x22, [x12, #0xd8]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n"
".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n"
- "ldr d24, [x14, #0xf0]\n"
- "ldr x20, [x14, #0xf8]\n"
+ "ldr d24, [x12, #0xf0]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
"mov v27.d[1], x23\n"
@@ -494,9 +494,9 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n"
"mov v24.d[1], x20\n"
".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n"
- "add x10, x10, #0x10\n"
"add x9, x9, #0x10\n"
- "add x14, x14, #0x100\n"
+ "add x28, x28, #0x10\n"
+ "add x12, x12, #0x100\n"
".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n"
".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n"
".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n"
@@ -509,53 +509,53 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"38:" // Height 2: Multiply loop: unique 5: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"bge 37b\n"
"39:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q25, [x14, #0x70]\n"
+ "ldr q25, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q24, [x14, #0x80]\n"
+ "ldr q24, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q30, [x14, #0x90]\n"
+ "ldr q30, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q29, [x14, #0xa0]\n"
+ "ldr q29, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q28, [x14, #0xb0]\n"
+ "ldr q28, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q27, [x14, #0xc0]\n"
+ "ldr q27, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q26, [x14, #0xd0]\n"
+ "ldr q26, [x12, #0xd0]\n"
".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n"
".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n"
- "ldr q25, [x14, #0xe0]\n"
+ "ldr q25, [x12, #0xe0]\n"
".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n"
".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n"
- "ldr q24, [x14, #0xf0]\n"
+ "ldr q24, [x12, #0xf0]\n"
".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n"
".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n"
@@ -573,29 +573,29 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"40:" // Height 2: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
"prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"41:" // Height 2: Multiply loop: Main loop skip
- "cbz x11, 48f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 48f\n"
+ "cmp x10, #0x4\n"
"blt 44f\n"
"42:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
"tbnz %x[flags], #31, 43f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q27, [x14, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q26, [x14, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q25, [x14, #0x20]\n"
- "ldr q24, [x14, #0x30]\n"
+ "ldr q27, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q26, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q25, [x12, #0x20]\n"
".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
@@ -603,44 +603,44 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"bge 42b\n"
"44:" // Height 2: Multiply loop: Skip odd blocks
- "cbz x11, 48f\n"
- "tbz x11, #1, 45f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "tbz x11, #0, 46f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x9]\n"
+ "cbz x10, 48f\n"
+ "tbz x10, #1, 45f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x10, #0, 46f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
"b 46f\n"
"45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
"46:" // Height 2: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 47f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q27, [x14, #0x0]\n"
- "ldr q26, [x14, #0x10]\n"
- "ldr q25, [x14, #0x20]\n"
- "ldr q24, [x14, #0x30]\n"
- ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n"
- ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
+ "ldr q24, [x12, #0x0]\n"
+ ".inst 0x6f80e310 // udot v16.4s, v24.16b, v0.4b[0]\n"
+ "ldr q26, [x12, #0x10]\n"
+ ".inst 0x6f81e314 // udot v20.4s, v24.16b, v1.4b[0]\n"
+ "ldr q25, [x12, #0x20]\n"
".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 34b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20\n"
"prfm pstl1keep, [x13, #0x0]\n"
- "add x24, x13, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
@@ -652,28 +652,28 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"mul v11.4s, v11.4s, v24.4s\n"
"mul v12.4s, v12.4s, v24.4s\n"
"49:" // Height 2: skip row sum fixup
- "ldr q28, [x16, #0x0]\n"
+ "ldr q27, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q27, [x16, #0x10]\n"
+ "ldr q26, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q26, [x16, #0x20]\n"
+ "ldr q25, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q25, [x16, #0x30]\n"
+ "ldr q24, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add v16.4s, v16.4s, v28.4s\n"
- "add v17.4s, v17.4s, v27.4s\n"
- "add v18.4s, v18.4s, v26.4s\n"
+ "add v16.4s, v16.4s, v27.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"ld1r { v24.4s }, [x20]\n"
- "add v19.4s, v19.4s, v25.4s\n"
- "add v20.4s, v20.4s, v28.4s\n"
- "add v21.4s, v21.4s, v27.4s\n"
- "add v22.4s, v22.4s, v26.4s\n"
- "add v23.4s, v23.4s, v25.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"ld1r { v0.4s }, [x20]\n"
@@ -685,31 +685,31 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"sqrdmulh v21.4s, v21.4s, v24.4s\n"
"sqrdmulh v22.4s, v22.4s, v24.4s\n"
"sqrdmulh v23.4s, v23.4s, v24.4s\n"
- "add x16, x16, #0x40\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 50f\n"
"and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
"and v30.16b, v17.16b, v0.16b\n"
"and v29.16b, v18.16b, v0.16b\n"
"and v28.16b, v19.16b, v0.16b\n"
"and v27.16b, v20.16b, v0.16b\n"
"and v26.16b, v21.16b, v0.16b\n"
"and v25.16b, v22.16b, v0.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v24.16b, v23.16b, v0.16b\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v27.4s, v27.4s, #0x1f\n"
"sshr v26.4s, v26.4s, #0x1f\n"
"sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v24.4s\n"
- "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v17.4s, v17.4s, v30.4s\n"
"sqadd v18.4s, v18.4s, v29.4s\n"
"sqadd v19.4s, v19.4s, v28.4s\n"
"sqadd v20.4s, v20.4s, v27.4s\n"
"sqadd v21.4s, v21.4s, v26.4s\n"
"sqadd v22.4s, v22.4s, v25.4s\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
@@ -721,28 +721,27 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x20]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v24.4s\n"
+ "add v18.4s, v18.4s, v24.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v24.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v24.4s\n"
+ "smin v18.4s, v18.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v24.4s\n"
+ "smin v22.4s, v22.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v24.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v25.4s }, [x21]\n"
- "cmp x15, #0x10\n"
"ld1r { v24.4s }, [x20]\n"
- "add v16.4s, v16.4s, v26.4s\n"
- "add v17.4s, v17.4s, v26.4s\n"
- "add v18.4s, v18.4s, v26.4s\n"
- "add v19.4s, v19.4s, v26.4s\n"
- "add v20.4s, v20.4s, v26.4s\n"
- "add v21.4s, v21.4s, v26.4s\n"
- "add v22.4s, v22.4s, v26.4s\n"
- "add v23.4s, v23.4s, v26.4s\n"
- "smin v16.4s, v16.4s, v25.4s\n"
- "smin v17.4s, v17.4s, v25.4s\n"
- "smin v18.4s, v18.4s, v25.4s\n"
- "smin v19.4s, v19.4s, v25.4s\n"
- "smin v20.4s, v20.4s, v25.4s\n"
- "smin v21.4s, v21.4s, v25.4s\n"
- "smin v22.4s, v22.4s, v25.4s\n"
- "smin v23.4s, v23.4s, v25.4s\n"
"smax v16.4s, v16.4s, v24.4s\n"
"smax v17.4s, v17.4s, v24.4s\n"
"smax v18.4s, v18.4s, v24.4s\n"
@@ -755,87 +754,88 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
"uzp1 v17.8h, v22.8h, v23.8h\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v18.16b\n"
"uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 59f\n"
- "tbz x15, #3, 54f\n"
+ "tbz x14, #3, 54f\n"
"str d16, [x13], #0x8\n"
- "str d20, [x24], #0x8\n"
- "tbz x15, #2, 52f\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x14, #2, 52f\n"
"st1 { v16.s }[2], [x13], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "tbz x15, #1, 51f\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x14, #1, 51f\n"
"st1 { v16.h }[6], [x13], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "tbz x15, #0, 58f\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[14], [x13]\n"
- "st1 { v20.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x15, #0, 58f\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[12], [x13]\n"
- "st1 { v20.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x15, #1, 53f\n"
+ "tbz x14, #1, 53f\n"
"st1 { v16.h }[4], [x13], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "tbz x15, #0, 58f\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[10], [x13]\n"
- "st1 { v20.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x15, #0, 58f\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[8], [x13]\n"
- "st1 { v20.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x15, #2, 56f\n"
+ "tbz x14, #2, 56f\n"
"str s16, [x13], #0x4\n"
- "str s20, [x24], #0x4\n"
- "tbz x15, #1, 55f\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x14, #1, 55f\n"
"st1 { v16.h }[2], [x13], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "tbz x15, #0, 58f\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[6], [x13]\n"
- "st1 { v20.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x15, #0, 58f\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[4], [x13]\n"
- "st1 { v20.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x15, #1, 57f\n"
+ "tbz x14, #1, 57f\n"
"str h16, [x13], #0x2\n"
- "str h20, [x24], #0x2\n"
- "tbz x15, #0, 58f\n"
+ "str h20, [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
"st1 { v16.b }[2], [x13]\n"
- "st1 { v20.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
"str b16, [x13, #0x0]\n"
- "str b20, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
"str q16, [x13, #0x0]\n"
"add x13, x13, #0x10\n"
- "str q20, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 32b\n"
"b 122f\n"
"61:" // Height 3
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v15.16b, #0x1\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -850,105 +850,105 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"63:" // Height 3: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
"tbz %x[flags], #3, 65f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x28, [x20, #0x10]\n"
- "cbnz x12, 66f\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "cbnz x11, 66f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
"add x9, x9, x20\n"
"add x28, x28, x20\n"
+ "add x27, x27, x20\n"
"b 66f\n"
"65:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x9, x10, x21\n"
+ "mov x9, %x[input_ptr]\n"
"add x28, x9, x21\n"
+ "add x27, x28, x21\n"
"66:" // Height 3: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 71f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x28, #0x0]\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 69f\n"
"67:" // Height 3: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x23, [x14, #0x88]\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr d29, [x14, #0x70]\n"
+ "ldr d29, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr x22, [x14, #0x98]\n"
+ "mov v29.d[1], x20\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr x21, [x14, #0xa8]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr d28, [x14, #0x80]\n"
+ "ldr d28, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "mov v29.d[1], x20\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x14, #0xb8]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr d5, [x14, #0x90]\n"
+ "ldr d5, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
"mov v28.d[1], x23\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr x23, [x14, #0xc8]\n"
+ "mov v5.d[1], x22\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr d4, [x14, #0xa0]\n"
+ "ldr d4, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v5.d[1], x22\n"
+ "mov v4.d[1], x21\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr x22, [x14, #0xd8]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr d3, [x14, #0xb0]\n"
+ "ldr d3, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v4.d[1], x21\n"
+ "mov v3.d[1], x20\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr x21, [x14, #0xe8]\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr d31, [x14, #0xc0]\n"
+ "ldr d31, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "mov v3.d[1], x20\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr x20, [x14, #0xf8]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr d30, [x14, #0xd0]\n"
+ "ldr d30, [x12, #0xd0]\n"
".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n"
"mov v31.d[1], x23\n"
".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n"
- "add x10, x10, #0x10\n"
+ "mov v30.d[1], x22\n"
".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n"
- "ldr d29, [x14, #0xe0]\n"
+ "ldr d29, [x12, #0xe0]\n"
".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
- "mov v30.d[1], x22\n"
+ "mov v29.d[1], x21\n"
".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n"
"add x9, x9, #0x10\n"
".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n"
- "ldr d28, [x14, #0xf0]\n"
+ "ldr d28, [x12, #0xf0]\n"
".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n"
- "mov v29.d[1], x21\n"
+ "mov v28.d[1], x20\n"
".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n"
"add x28, x28, #0x10\n"
".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n"
- "mov v28.d[1], x20\n"
+ "add x27, x27, #0x10\n"
".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n"
".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n"
".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n"
@@ -971,65 +971,65 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"68:" // Height 3: Multiply loop: unique 9: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q2, [x28, #0x0]\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"prfm pldl1keep, [x9, #0x80]\n"
"prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 67b\n"
"69:" // Height 3: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q29, [x14, #0x70]\n"
+ "ldr q29, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "add x9, x9, #0x10\n"
- ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
"add x28, x28, #0x10\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "add x27, x27, #0x10\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q28, [x14, #0x80]\n"
+ "ldr q28, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q5, [x14, #0x90]\n"
+ "ldr q5, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q4, [x14, #0xa0]\n"
+ "ldr q4, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q3, [x14, #0xb0]\n"
+ "ldr q3, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q31, [x14, #0xc0]\n"
+ "ldr q31, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q30, [x14, #0xd0]\n"
+ "ldr q30, [x12, #0xd0]\n"
".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n"
".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n"
".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n"
- "ldr q29, [x14, #0xe0]\n"
+ "ldr q29, [x12, #0xe0]\n"
".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n"
".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n"
- "ldr q28, [x14, #0xf0]\n"
+ "ldr q28, [x12, #0xf0]\n"
".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n"
".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n"
".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n"
@@ -1055,32 +1055,32 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"70:" // Height 3: Multiply loop: unique 10: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
"prfm pldl1keep, [x9, #0x80]\n"
"prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"71:" // Height 3: Multiply loop: Main loop skip
- "cbz x11, 78f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 78f\n"
+ "cmp x10, #0x4\n"
"blt 74f\n"
"72:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x28], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
"tbnz %x[flags], #31, 73f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q31, [x14, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q30, [x14, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q29, [x14, #0x20]\n"
- "ldr q28, [x14, #0x30]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q30, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q29, [x12, #0x20]\n"
".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
@@ -1092,36 +1092,36 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n"
"bge 72b\n"
"74:" // Height 3: Multiply loop: Skip odd blocks
- "cbz x11, 78f\n"
- "tbz x11, #1, 75f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x28], #0x2\n"
- "tbz x11, #0, 76f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x28]\n"
+ "cbz x10, 78f\n"
+ "tbz x10, #1, 75f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "tbz x10, #0, 76f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
"b 76f\n"
"75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x28, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
"76:" // Height 3: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 77f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q31, [x14, #0x0]\n"
- "ldr q30, [x14, #0x10]\n"
- "ldr q29, [x14, #0x20]\n"
- "ldr q28, [x14, #0x30]\n"
- ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n"
- ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
- ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x0]\n"
+ ".inst 0x6f80e390 // udot v16.4s, v28.16b, v0.4b[0]\n"
+ "ldr q30, [x12, #0x10]\n"
+ ".inst 0x6f81e394 // udot v20.4s, v28.16b, v1.4b[0]\n"
+ "ldr q29, [x12, #0x20]\n"
+ ".inst 0x6f82e398 // udot v24.4s, v28.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n"
@@ -1131,15 +1131,15 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n"
"78:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 64b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x13, #0x0]\n"
- "add x24, x13, x20\n"
- "add x23, x24, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
@@ -1154,13 +1154,13 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"mul v12.4s, v12.4s, v28.4s\n"
"mul v13.4s, v13.4s, v28.4s\n"
"79:" // Height 3: skip row sum fixup
- "ldr q31, [x16, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q30, [x16, #0x10]\n"
+ "ldr q30, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q29, [x16, #0x20]\n"
+ "ldr q29, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q28, [x16, #0x30]\n"
+ "ldr q28, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1183,11 +1183,10 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"add v26.4s, v26.4s, v29.4s\n"
"add v27.4s, v27.4s, v28.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v28.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"ld1r { v0.4s }, [x20]\n"
- "add x16, x16, #0x40\n"
"sqrdmulh v16.4s, v16.4s, v28.4s\n"
"sqrdmulh v17.4s, v17.4s, v28.4s\n"
"sqrdmulh v18.4s, v18.4s, v28.4s\n"
@@ -1200,38 +1199,39 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 80f\n"
"and v1.16b, v16.16b, v0.16b\n"
"and v31.16b, v17.16b, v0.16b\n"
"and v30.16b, v18.16b, v0.16b\n"
"and v29.16b, v19.16b, v0.16b\n"
"and v28.16b, v20.16b, v0.16b\n"
- "and v3.16b, v21.16b, v0.16b\n"
- "and v2.16b, v22.16b, v0.16b\n"
"sshr v1.4s, v1.4s, #0x1f\n"
"sshr v31.4s, v31.4s, #0x1f\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v1.4s\n"
"sqadd v17.4s, v17.4s, v31.4s\n"
"sqadd v18.4s, v18.4s, v30.4s\n"
"sqadd v19.4s, v19.4s, v29.4s\n"
"sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
"and v1.16b, v23.16b, v0.16b\n"
"and v31.16b, v24.16b, v0.16b\n"
"and v30.16b, v25.16b, v0.16b\n"
"and v29.16b, v26.16b, v0.16b\n"
"and v28.16b, v27.16b, v0.16b\n"
- "sqadd v21.4s, v21.4s, v3.4s\n"
- "sqadd v22.4s, v22.4s, v2.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
"sshr v31.4s, v31.4s, #0x1f\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v3.4s\n"
+ "sqadd v22.4s, v22.4s, v2.4s\n"
"sqadd v23.4s, v23.4s, v1.4s\n"
"sqadd v24.4s, v24.4s, v31.4s\n"
"sqadd v25.4s, v25.4s, v30.4s\n"
@@ -1251,36 +1251,35 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v30.4s }, [x20]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v28.4s\n"
+ "add v18.4s, v18.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v23.4s, v23.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v29.4s }, [x21]\n"
- "cmp x15, #0x10\n"
"ld1r { v28.4s }, [x20]\n"
- "add v16.4s, v16.4s, v30.4s\n"
- "add v17.4s, v17.4s, v30.4s\n"
- "add v18.4s, v18.4s, v30.4s\n"
- "add v19.4s, v19.4s, v30.4s\n"
- "add v20.4s, v20.4s, v30.4s\n"
- "add v21.4s, v21.4s, v30.4s\n"
- "add v22.4s, v22.4s, v30.4s\n"
- "add v23.4s, v23.4s, v30.4s\n"
- "add v24.4s, v24.4s, v30.4s\n"
- "add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "add v27.4s, v27.4s, v30.4s\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v19.4s, v19.4s, v29.4s\n"
- "smin v20.4s, v20.4s, v29.4s\n"
- "smin v21.4s, v21.4s, v29.4s\n"
- "smin v22.4s, v22.4s, v29.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v24.4s, v24.4s, v29.4s\n"
- "smin v25.4s, v25.4s, v29.4s\n"
- "smin v26.4s, v26.4s, v29.4s\n"
- "smin v27.4s, v27.4s, v29.4s\n"
"smax v16.4s, v16.4s, v28.4s\n"
"smax v17.4s, v17.4s, v28.4s\n"
"smax v18.4s, v18.4s, v28.4s\n"
@@ -1299,109 +1298,109 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
"uzp1 v17.8h, v26.8h, v27.8h\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v18.16b\n"
"uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 89f\n"
- "tbz x15, #3, 84f\n"
+ "tbz x14, #3, 84f\n"
"str d16, [x13], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x15, #2, 82f\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #2, 82f\n"
"st1 { v16.s }[2], [x13], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x15, #1, 81f\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x14, #1, 81f\n"
"st1 { v16.h }[6], [x13], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "tbz x15, #0, 88f\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[14], [x13]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x15, #0, 88f\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[12], [x13]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x15, #1, 83f\n"
+ "tbz x14, #1, 83f\n"
"st1 { v16.h }[4], [x13], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "tbz x15, #0, 88f\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[10], [x13]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x15, #0, 88f\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[8], [x13]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x15, #2, 86f\n"
+ "tbz x14, #2, 86f\n"
"str s16, [x13], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x15, #1, 85f\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x14, #1, 85f\n"
"st1 { v16.h }[2], [x13], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "tbz x15, #0, 88f\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[6], [x13]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x15, #0, 88f\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[4], [x13]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x15, #1, 87f\n"
+ "tbz x14, #1, 87f\n"
"str h16, [x13], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "tbz x15, #0, 88f\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
"st1 { v16.b }[2], [x13]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
"str b16, [x13, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
"str q16, [x13, #0x0]\n"
"add x13, x13, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 62b\n"
"b 122f\n"
"91:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x4\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
"movi v12.4s, #0x0\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "madd x20, x21, x20, x13\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v14.4s, #0x0\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"movi v15.16b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -1420,118 +1419,118 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"93:" // Height 4: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
"tbz %x[flags], #3, 95f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x28, [x20, #0x10]\n"
- "ldr x27, [x20, #0x18]\n"
- "cbnz x12, 96f\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x26, [x20, #0x18]\n"
+ "cbnz x11, 96f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
"add x9, x9, x20\n"
"add x28, x28, x20\n"
"add x27, x27, x20\n"
+ "add x26, x26, x20\n"
"b 96f\n"
"95:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x9, x10, x21\n"
+ "mov x9, %x[input_ptr]\n"
"add x28, x9, x21\n"
"add x27, x28, x21\n"
+ "add x26, x27, x21\n"
"96:" // Height 4: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 101f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x28, #0x0]\n"
- "ldr q3, [x27, #0x0]\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 99f\n"
"97:" // Height 4: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x21, [x14, #0x78]\n"
+ "ldr x22, [x12, #0x78]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x20, [x14, #0x88]\n"
+ "ldr x21, [x12, #0x88]\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr x26, [x14, #0x98]\n"
+ "ldr x20, [x12, #0x98]\n"
".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr d4, [x14, #0x70]\n"
+ "ldr d4, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr x25, [x14, #0xa8]\n"
+ "mov v4.d[1], x22\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr x24, [x14, #0xb8]\n"
+ "ldr x25, [x12, #0xa8]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "mov v4.d[1], x21\n"
+ "ldr x24, [x12, #0xb8]\n"
".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr d5, [x14, #0x80]\n"
+ "ldr d5, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr x23, [x14, #0xc8]\n"
+ "mov v5.d[1], x21\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x22, [x14, #0xd8]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "mov v5.d[1], x20\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x14, #0x90]\n"
+ "ldr d6, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr x21, [x14, #0xe8]\n"
+ "mov v6.d[1], x20\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr x20, [x14, #0xf8]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x26\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x14, #0xa0]\n"
+ "ldr d7, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "add x10, x10, #0x10\n"
+ "mov v7.d[1], x25\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
"add x9, x9, #0x10\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "mov v7.d[1], x25\n"
+ "add x28, x28, #0x10\n"
".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr d8, [x14, #0xb0]\n"
+ "ldr d8, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "add x28, x28, #0x10\n"
+ "mov v8.d[1], x24\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
"add x27, x27, #0x10\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "mov v8.d[1], x24\n"
+ "add x26, x26, #0x10\n"
".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr d9, [x14, #0xc0]\n"
+ "ldr d9, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ "mov v9.d[1], x23\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "mov v9.d[1], x23\n"
".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr d10, [x14, #0xd0]\n"
+ "ldr d10, [x12, #0xd0]\n"
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ "mov v10.d[1], x22\n"
".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "mov v10.d[1], x22\n"
".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr d4, [x14, #0xe0]\n"
+ "ldr d4, [x12, #0xe0]\n"
".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ "mov v4.d[1], x21\n"
".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "mov v4.d[1], x21\n"
".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr d5, [x14, #0xf0]\n"
+ "ldr d5, [x12, #0xf0]\n"
".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "mov v5.d[1], x20\n"
".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ "add x12, x12, #0x100\n"
".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
- "mov v5.d[1], x20\n"
".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
@@ -1563,77 +1562,77 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"98:" // Height 4: Multiply loop: unique 13: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q2, [x28, #0x0]\n"
- "ldr q3, [x27, #0x0]\n"
- "ldr q4, [x14, #0x0]\n"
- "ldr q5, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
- "ldr q7, [x14, #0x30]\n"
- "ldr q8, [x14, #0x40]\n"
- "ldr q9, [x14, #0x50]\n"
- "ldr q10, [x14, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"prfm pldl1keep, [x9, #0x80]\n"
"prfm pldl1keep, [x28, #0x80]\n"
"prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 97b\n"
"99:" // Height 4: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
"add x9, x9, #0x10\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q4, [x14, #0x70]\n"
+ "ldr q4, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
"add x27, x27, #0x10\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr q5, [x14, #0x80]\n"
+ "ldr q5, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x90]\n"
+ "ldr q6, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0xa0]\n"
+ "ldr q7, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr q8, [x14, #0xb0]\n"
+ "ldr q8, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr q9, [x14, #0xc0]\n"
+ "ldr q9, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x14, #0xd0]\n"
+ "ldr q10, [x12, #0xd0]\n"
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x14, #0xe0]\n"
+ "ldr q4, [x12, #0xe0]\n"
".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr q5, [x14, #0xf0]\n"
+ "ldr q5, [x12, #0xf0]\n"
".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x14, x14, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
@@ -1667,35 +1666,35 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"100:" // Height 4: Multiply loop: unique 14: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
"prfm pldl1keep, [x9, #0x80]\n"
"prfm pldl1keep, [x28, #0x80]\n"
"prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"101:" // Height 4: Multiply loop: Main loop skip
- "cbz x11, 108f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 108f\n"
+ "cmp x10, #0x4\n"
"blt 104f\n"
"102:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x28], #0x4\n"
- "ldr s3, [x27], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
"tbnz %x[flags], #31, 103f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q7, [x14, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q6, [x14, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q5, [x14, #0x20]\n"
- "ldr q4, [x14, #0x30]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q6, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q5, [x12, #0x20]\n"
".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
@@ -1711,23 +1710,23 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
"bge 102b\n"
"104:" // Height 4: Multiply loop: Skip odd blocks
- "cbz x11, 108f\n"
- "tbz x11, #1, 105f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x28], #0x2\n"
- "ldr h3, [x27], #0x2\n"
- "tbz x11, #0, 106f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x28]\n"
- "ld1 { v3.b }[2], [x27]\n"
+ "cbz x10, 108f\n"
+ "tbz x10, #1, 105f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
+ "ld1 { v3.b }[2], [x26]\n"
"b 106f\n"
"105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x28, #0x0]\n"
- "ldr b3, [x27, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
+ "ldr b3, [x26, #0x0]\n"
"106:" // Height 4: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 107f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
@@ -1735,16 +1734,16 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q7, [x14, #0x0]\n"
- "ldr q6, [x14, #0x10]\n"
- "ldr q5, [x14, #0x20]\n"
- "ldr q4, [x14, #0x30]\n"
+ "ldr q7, [x12, #0x0]\n"
".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x12, #0x10]\n"
".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
- "add x14, x14, #0x40\n"
+ "ldr q5, [x12, #0x20]\n"
".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
@@ -1758,17 +1757,17 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
"108:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 94b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "add x24, x13, x20\n"
- "add x23, x24, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x13, x20\n"
"add x22, x23, x20\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
@@ -1776,9 +1775,9 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"addp v14.4s, v14.4s, v14.4s\n"
"add x20, %x[qp], %[b_offset]\n"
"ld1r { v0.4s }, [x20]\n"
+ "neg v0.4s, v0.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "neg v0.4s, v0.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
"mul v11.4s, v11.4s, v0.4s\n"
@@ -1786,13 +1785,13 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"mul v13.4s, v13.4s, v0.4s\n"
"mul v14.4s, v14.4s, v0.4s\n"
"109:" // Height 4: skip row sum fixup
- "ldr q3, [x16, #0x0]\n"
+ "ldr q3, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q2, [x16, #0x10]\n"
+ "ldr q2, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q1, [x16, #0x20]\n"
+ "ldr q1, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q0, [x16, #0x30]\n"
+ "ldr q0, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1823,11 +1822,10 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"add v30.4s, v30.4s, v1.4s\n"
"add v31.4s, v31.4s, v0.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v1.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"ld1r { v0.4s }, [x20]\n"
- "add x16, x16, #0x40\n"
"sqrdmulh v16.4s, v16.4s, v1.4s\n"
"sqrdmulh v17.4s, v17.4s, v1.4s\n"
"sqrdmulh v18.4s, v18.4s, v1.4s\n"
@@ -1844,51 +1842,52 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"sqrdmulh v29.4s, v29.4s, v1.4s\n"
"sqrdmulh v30.4s, v30.4s, v1.4s\n"
"sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 110f\n"
"and v2.16b, v16.16b, v0.16b\n"
"and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
"and v7.16b, v18.16b, v0.16b\n"
"and v6.16b, v19.16b, v0.16b\n"
"and v5.16b, v20.16b, v0.16b\n"
"and v4.16b, v21.16b, v0.16b\n"
"and v3.16b, v22.16b, v0.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v2.4s\n"
- "sqadd v17.4s, v17.4s, v1.4s\n"
- "and v2.16b, v23.16b, v0.16b\n"
- "and v1.16b, v24.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
"sqadd v18.4s, v18.4s, v7.4s\n"
"sqadd v19.4s, v19.4s, v6.4s\n"
"sqadd v20.4s, v20.4s, v5.4s\n"
"sqadd v21.4s, v21.4s, v4.4s\n"
"sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
"and v7.16b, v25.16b, v0.16b\n"
"and v6.16b, v26.16b, v0.16b\n"
"and v5.16b, v27.16b, v0.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
"and v4.16b, v28.16b, v0.16b\n"
"and v3.16b, v29.16b, v0.16b\n"
+ "and v2.16b, v30.16b, v0.16b\n"
+ "and v1.16b, v31.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v2.4s\n"
- "sqadd v24.4s, v24.4s, v1.4s\n"
- "and v2.16b, v30.16b, v0.16b\n"
- "and v1.16b, v31.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
"sqadd v25.4s, v25.4s, v7.4s\n"
"sqadd v26.4s, v26.4s, v6.4s\n"
"sqadd v27.4s, v27.4s, v5.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v4.4s\n"
"sqadd v29.4s, v29.4s, v3.4s\n"
"sqadd v30.4s, v30.4s, v2.4s\n"
@@ -1911,44 +1910,43 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"srshl v30.4s, v30.4s, v0.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"add x20, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[maxval]\n"
- "ld1r { v2.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v0.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v0.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v0.4s\n"
+ "smin v17.4s, v17.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v0.4s\n"
+ "smin v19.4s, v19.4s, v0.4s\n"
+ "smin v20.4s, v20.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v0.4s\n"
+ "smin v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v0.4s\n"
+ "smin v27.4s, v27.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v0.4s\n"
+ "smin v29.4s, v29.4s, v0.4s\n"
+ "smin v30.4s, v30.4s, v0.4s\n"
+ "smin v31.4s, v31.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1r { v1.4s }, [x21]\n"
- "cmp x15, #0x10\n"
"ld1r { v0.4s }, [x20]\n"
- "add v16.4s, v16.4s, v2.4s\n"
- "add v17.4s, v17.4s, v2.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v2.4s\n"
- "add v20.4s, v20.4s, v2.4s\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v2.4s\n"
- "add v24.4s, v24.4s, v2.4s\n"
- "add v25.4s, v25.4s, v2.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v2.4s\n"
- "add v28.4s, v28.4s, v2.4s\n"
- "add v29.4s, v29.4s, v2.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v1.4s\n"
- "smin v17.4s, v17.4s, v1.4s\n"
- "smin v18.4s, v18.4s, v1.4s\n"
- "smin v19.4s, v19.4s, v1.4s\n"
- "smin v20.4s, v20.4s, v1.4s\n"
- "smin v21.4s, v21.4s, v1.4s\n"
- "smin v22.4s, v22.4s, v1.4s\n"
- "smin v23.4s, v23.4s, v1.4s\n"
- "smin v24.4s, v24.4s, v1.4s\n"
- "smin v25.4s, v25.4s, v1.4s\n"
- "smin v26.4s, v26.4s, v1.4s\n"
- "smin v27.4s, v27.4s, v1.4s\n"
- "smin v28.4s, v28.4s, v1.4s\n"
- "smin v29.4s, v29.4s, v1.4s\n"
- "smin v30.4s, v30.4s, v1.4s\n"
- "smin v31.4s, v31.4s, v1.4s\n"
"smax v16.4s, v16.4s, v0.4s\n"
"smax v17.4s, v17.4s, v0.4s\n"
"smax v18.4s, v18.4s, v0.4s\n"
@@ -1973,109 +1971,110 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
"uzp1 v17.8h, v30.8h, v31.8h\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v0.16b\n"
"uzp1 v20.16b, v20.16b, v19.16b\n"
"uzp1 v24.16b, v24.16b, v18.16b\n"
"uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 119f\n"
- "tbz x15, #3, 114f\n"
+ "tbz x14, #3, 114f\n"
"str d16, [x13], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
- "tbz x15, #2, 112f\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x14, #2, 112f\n"
"st1 { v16.s }[2], [x13], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x22], #0x4\n"
- "tbz x15, #1, 111f\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x14, #1, 111f\n"
"st1 { v16.h }[6], [x13], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "st1 { v28.h }[6], [x22], #0x2\n"
- "tbz x15, #0, 118f\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[14], [x13]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
- "st1 { v28.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x15, #0, 118f\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[12], [x13]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
- "st1 { v28.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x15, #1, 113f\n"
+ "tbz x14, #1, 113f\n"
"st1 { v16.h }[4], [x13], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "st1 { v28.h }[4], [x22], #0x2\n"
- "tbz x15, #0, 118f\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[10], [x13]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
- "st1 { v28.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x15, #0, 118f\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[8], [x13]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
- "st1 { v28.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x15, #2, 116f\n"
+ "tbz x14, #2, 116f\n"
"str s16, [x13], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x22], #0x4\n"
- "tbz x15, #1, 115f\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x14, #1, 115f\n"
"st1 { v16.h }[2], [x13], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "st1 { v28.h }[2], [x22], #0x2\n"
- "tbz x15, #0, 118f\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[6], [x13]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
- "st1 { v28.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x15, #0, 118f\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[4], [x13]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
- "st1 { v28.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x15, #1, 117f\n"
+ "tbz x14, #1, 117f\n"
"str h16, [x13], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "str h28, [x22], #0x2\n"
- "tbz x15, #0, 118f\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
"st1 { v16.b }[2], [x13]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
"str b16, [x13, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "str b28, [x22, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
"str q16, [x13, #0x0]\n"
"add x13, x13, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "str q28, [x22, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 92b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 122f\n"
@@ -2089,9 +2088,9 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
index 23315f3c0c..f07902a559 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
@@ -45,18 +45,18 @@ void a64_hybrid_u8qa_dot_4x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -90,7 +90,7 @@ void a64_hybrid_u8qa_dot_4x16 (
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -100,8 +100,8 @@ void a64_hybrid_u8qa_dot_4x16 (
"mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -134,7 +134,6 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q26, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
"ldr q25, [x28, #0xa0]\n"
- "add x24, x24, #0x10\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
"ldr q24, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
@@ -145,10 +144,11 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q21, [x28, #0xe0]\n"
".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
"ldr q20, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
+ "add x28, x28, #0x100\n"
".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
@@ -159,9 +159,9 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q0, [x24, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q5, [x28, #0x10]\n"
"ldr q6, [x28, #0x20]\n"
- "cmp x25, #0x20\n"
"ldr q7, [x28, #0x30]\n"
"ldr q8, [x28, #0x40]\n"
"ldr q9, [x28, #0x50]\n"
@@ -177,8 +177,6 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q26, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
"ldr q25, [x28, #0xa0]\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
"ldr q24, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
@@ -189,10 +187,12 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q21, [x28, #0xe0]\n"
".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
"ldr q20, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
+ "sub x25, x25, #0x10\n"
".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
+ "add x24, x24, #0x10\n"
+ "add x28, x28, #0x100\n"
".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
@@ -213,14 +213,14 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q23, [x28, #0x0]\n"
"ldr q22, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
"ldr q21, [x28, #0x20]\n"
"ldr q20, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n"
".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n"
".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n"
".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
+ "add x28, x28, #0x40\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
"cbz x25, 18f\n"
@@ -235,15 +235,15 @@ void a64_hybrid_u8qa_dot_4x16 (
"tbnz %x[flags], #31, 17f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q23, [x28, #0x0]\n"
- "ldr q22, [x28, #0x10]\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ ".inst 0x6f80e2b0 // udot v16.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f80e291 // udot v17.4s, v20.16b, v0.4b[0]\n"
"ldr q21, [x28, #0x20]\n"
"ldr q20, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n"
- ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n"
".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n"
".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
+ "add x28, x28, #0x40\n"
"18:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -252,8 +252,8 @@ void a64_hybrid_u8qa_dot_4x16 (
"prfm pstl1keep, [x27, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
"add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
"ld1r { v20.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
"neg v20.4s, v20.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"mul v11.4s, v11.4s, v20.4s\n"
@@ -267,16 +267,16 @@ void a64_hybrid_u8qa_dot_4x16 (
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v20.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add v16.4s, v16.4s, v24.4s\n"
"add v17.4s, v17.4s, v23.4s\n"
+ "add v18.4s, v18.4s, v22.4s\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- "add x10, x10, #0x40\n"
"ld1r { v0.4s }, [x20]\n"
- "add v18.4s, v18.4s, v22.4s\n"
"add v19.4s, v19.4s, v21.4s\n"
"sqrdmulh v16.4s, v16.4s, v20.4s\n"
+ "add x10, x10, #0x40\n"
"sqrdmulh v17.4s, v17.4s, v20.4s\n"
"sqrdmulh v18.4s, v18.4s, v20.4s\n"
"sqrdmulh v19.4s, v19.4s, v20.4s\n"
@@ -294,21 +294,21 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqadd v18.4s, v18.4s, v21.4s\n"
"sqadd v19.4s, v19.4s, v20.4s\n"
"20:" // Height 1: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v22.4s }, [x21]\n"
- "ld1r { v21.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v20.4s }, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
"add v16.4s, v16.4s, v22.4s\n"
"add v17.4s, v17.4s, v22.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
"add v18.4s, v18.4s, v22.4s\n"
"add v19.4s, v19.4s, v22.4s\n"
+ "cmp x9, #0x10\n"
"smin v16.4s, v16.4s, v21.4s\n"
"smin v17.4s, v17.4s, v21.4s\n"
"smin v18.4s, v18.4s, v21.4s\n"
@@ -381,7 +381,7 @@ void a64_hybrid_u8qa_dot_4x16 (
"movi v15.16b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -395,8 +395,8 @@ void a64_hybrid_u8qa_dot_4x16 (
"mov x26, #0x0\n"
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 35f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -476,9 +476,9 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q0, [x24, #0x0]\n"
"ldr q1, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q4, [x28, #0x0]\n"
"ldr q5, [x28, #0x10]\n"
- "cmp x25, #0x20\n"
"ldr q6, [x28, #0x20]\n"
"ldr q7, [x28, #0x30]\n"
"ldr q8, [x28, #0x40]\n"
@@ -553,14 +553,14 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q27, [x28, #0x0]\n"
"ldr q26, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
"ldr q25, [x28, #0x20]\n"
"ldr q24, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n"
".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n"
".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
@@ -583,17 +583,17 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q27, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x0]\n"
"ldr q26, [x28, #0x10]\n"
+ ".inst 0x6f80e310 // udot v16.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e314 // udot v20.4s, v24.16b, v1.4b[0]\n"
"ldr q25, [x28, #0x20]\n"
"ldr q24, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n"
- ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n"
".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
@@ -602,14 +602,14 @@ void a64_hybrid_u8qa_dot_4x16 (
"cmp x26, x20\n"
"bne 34b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "ld1r { v24.4s }, [x20]\n"
"neg v24.4s, v24.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
@@ -627,10 +627,10 @@ void a64_hybrid_u8qa_dot_4x16 (
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v24.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v16.4s, v16.4s, v28.4s\n"
"add v17.4s, v17.4s, v27.4s\n"
@@ -652,45 +652,45 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqrdmulh v23.4s, v23.4s, v24.4s\n"
"tbz %x[flags], #5, 50f\n"
"and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
"and v30.16b, v17.16b, v0.16b\n"
"and v29.16b, v18.16b, v0.16b\n"
"and v28.16b, v19.16b, v0.16b\n"
"and v27.16b, v20.16b, v0.16b\n"
"and v26.16b, v21.16b, v0.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
"and v25.16b, v22.16b, v0.16b\n"
+ "and v24.16b, v23.16b, v0.16b\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v24.4s\n"
- "and v24.16b, v23.16b, v0.16b\n"
"sshr v26.4s, v26.4s, #0x1f\n"
"sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v17.4s, v17.4s, v30.4s\n"
"sqadd v18.4s, v18.4s, v29.4s\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v28.4s\n"
"sqadd v20.4s, v20.4s, v27.4s\n"
"sqadd v21.4s, v21.4s, v26.4s\n"
"sqadd v22.4s, v22.4s, v25.4s\n"
"sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x21]\n"
- "ld1r { v25.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v24.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"add v16.4s, v16.4s, v26.4s\n"
"add v17.4s, v17.4s, v26.4s\n"
"add v18.4s, v18.4s, v26.4s\n"
@@ -724,68 +724,68 @@ void a64_hybrid_u8qa_dot_4x16 (
"bge 59f\n"
"tbz x9, #3, 54f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x9, #2, 52f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x9, #1, 51f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 58f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 53f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 58f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 56f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x9, #1, 55f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 58f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 57f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
"subs x9, x9, #0x10\n"
"bgt 32b\n"
@@ -799,7 +799,7 @@ void a64_hybrid_u8qa_dot_4x16 (
"movi v15.16b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -817,8 +817,8 @@ void a64_hybrid_u8qa_dot_4x16 (
"mov x26, #0x0\n"
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 65f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -920,9 +920,9 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q0, [x24, #0x0]\n"
"ldr q1, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q2, [x22, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
- "cmp x25, #0x20\n"
"ldr q5, [x28, #0x10]\n"
"ldr q6, [x28, #0x20]\n"
"ldr q7, [x28, #0x30]\n"
@@ -1020,14 +1020,14 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q31, [x28, #0x0]\n"
"ldr q30, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
"ldr q29, [x28, #0x20]\n"
"ldr q28, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n"
".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n"
".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n"
".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
@@ -1060,15 +1060,15 @@ void a64_hybrid_u8qa_dot_4x16 (
"77:" // Height 3: Multiply loop: unique 12: skip row sum
"ldr q31, [x28, #0x0]\n"
"ldr q30, [x28, #0x10]\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n"
".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q28, [x28, #0x30]\n"
".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n"
".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n"
".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n"
@@ -1081,16 +1081,16 @@ void a64_hybrid_u8qa_dot_4x16 (
"cmp x26, x20\n"
"bne 64b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "ld1r { v28.4s }, [x20]\n"
"addp v13.4s, v13.4s, v13.4s\n"
"neg v28.4s, v28.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
@@ -1111,10 +1111,10 @@ void a64_hybrid_u8qa_dot_4x16 (
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v28.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
@@ -1152,18 +1152,18 @@ void a64_hybrid_u8qa_dot_4x16 (
"and v30.16b, v18.16b, v0.16b\n"
"and v29.16b, v19.16b, v0.16b\n"
"and v28.16b, v20.16b, v0.16b\n"
- "and v3.16b, v21.16b, v0.16b\n"
"sshr v1.4s, v1.4s, #0x1f\n"
"sshr v31.4s, v31.4s, #0x1f\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "and v2.16b, v22.16b, v0.16b\n"
"sqadd v16.4s, v16.4s, v1.4s\n"
"sqadd v17.4s, v17.4s, v31.4s\n"
"sqadd v18.4s, v18.4s, v30.4s\n"
"sqadd v19.4s, v19.4s, v29.4s\n"
"sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
"and v1.16b, v23.16b, v0.16b\n"
"and v31.16b, v24.16b, v0.16b\n"
"and v30.16b, v25.16b, v0.16b\n"
@@ -1184,21 +1184,21 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqadd v26.4s, v26.4s, v29.4s\n"
"sqadd v27.4s, v27.4s, v28.4s\n"
"80:" // Height 3: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v30.4s }, [x21]\n"
- "ld1r { v29.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v28.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"srshl v24.4s, v24.4s, v0.4s\n"
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
@@ -1251,103 +1251,102 @@ void a64_hybrid_u8qa_dot_4x16 (
"bge 89f\n"
"tbz x9, #3, 84f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #2, 82f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x9, #1, 81f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 88f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 83f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 88f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 86f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x9, #1, 85f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 88f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 87f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
"subs x9, x9, #0x10\n"
"bgt 62b\n"
"b 122f\n"
"91:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x4\n"
"mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "movi v13.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
"movi v15.16b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -1369,8 +1368,8 @@ void a64_hybrid_u8qa_dot_4x16 (
"mov x26, #0x0\n"
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 95f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1494,9 +1493,9 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q0, [x24, #0x0]\n"
"ldr q1, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q2, [x22, #0x0]\n"
"ldr q3, [x21, #0x0]\n"
- "cmp x25, #0x20\n"
"ldr q4, [x28, #0x0]\n"
"ldr q5, [x28, #0x10]\n"
"ldr q6, [x28, #0x20]\n"
@@ -1617,14 +1616,14 @@ void a64_hybrid_u8qa_dot_4x16 (
"ldr q7, [x28, #0x0]\n"
"ldr q6, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
"ldr q5, [x28, #0x20]\n"
"ldr q4, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
@@ -1665,15 +1664,15 @@ void a64_hybrid_u8qa_dot_4x16 (
"107:" // Height 4: Multiply loop: unique 16: skip row sum
"ldr q7, [x28, #0x0]\n"
"ldr q6, [x28, #0x10]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q4, [x28, #0x30]\n"
".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ "add x28, x28, #0x40\n"
".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
@@ -1690,18 +1689,18 @@ void a64_hybrid_u8qa_dot_4x16 (
"cmp x26, x20\n"
"bne 94b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20\n"
+ "add x21, x22, x20\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x20\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "ld1r { v0.4s }, [x20]\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
"neg v0.4s, v0.4s\n"
@@ -1725,10 +1724,10 @@ void a64_hybrid_u8qa_dot_4x16 (
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v1.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
@@ -1775,32 +1774,32 @@ void a64_hybrid_u8qa_dot_4x16 (
"tbz %x[flags], #5, 110f\n"
"and v2.16b, v16.16b, v0.16b\n"
"and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
"and v7.16b, v18.16b, v0.16b\n"
"and v6.16b, v19.16b, v0.16b\n"
"and v5.16b, v20.16b, v0.16b\n"
"and v4.16b, v21.16b, v0.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
"and v3.16b, v22.16b, v0.16b\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v2.4s\n"
- "sqadd v17.4s, v17.4s, v1.4s\n"
- "and v2.16b, v23.16b, v0.16b\n"
- "and v1.16b, v24.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v7.4s\n"
- "sqadd v19.4s, v19.4s, v6.4s\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
"sqadd v20.4s, v20.4s, v5.4s\n"
"sqadd v21.4s, v21.4s, v4.4s\n"
"sqadd v22.4s, v22.4s, v3.4s\n"
- "and v7.16b, v25.16b, v0.16b\n"
"sqadd v23.4s, v23.4s, v2.4s\n"
"sqadd v24.4s, v24.4s, v1.4s\n"
+ "and v7.16b, v25.16b, v0.16b\n"
"and v6.16b, v26.16b, v0.16b\n"
"and v5.16b, v27.16b, v0.16b\n"
"and v4.16b, v28.16b, v0.16b\n"
@@ -1822,21 +1821,21 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqadd v30.4s, v30.4s, v2.4s\n"
"sqadd v31.4s, v31.4s, v1.4s\n"
"110:" // Height 4: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v3.4s }, [x21]\n"
- "ld1r { v2.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v1.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"srshl v24.4s, v24.4s, v0.4s\n"
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
@@ -1908,100 +1907,100 @@ void a64_hybrid_u8qa_dot_4x16 (
"bge 119f\n"
"tbz x9, #3, 114f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x9, #2, 112f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x22], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x9, #1, 111f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "st1 { v28.h }[6], [x22], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
- "st1 { v28.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 118f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
- "st1 { v28.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 113f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "st1 { v28.h }[4], [x22], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
- "st1 { v28.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 118f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
- "st1 { v28.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 116f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x22], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x9, #1, 115f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "st1 { v28.h }[2], [x22], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
- "st1 { v28.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 118f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
- "st1 { v28.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 117f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "str h28, [x22], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "str b28, [x22, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "str q28, [x22, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
"subs x9, x9, #0x10\n"
"bgt 92b\n"
@@ -2017,8 +2016,8 @@ void a64_hybrid_u8qa_dot_4x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
index 84f6ed0553..f8c7f0a549 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
@@ -70,7 +70,7 @@ public:
return false;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 16, 8> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 8> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
index 2d3af7f9c3..9103ef59ce 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
@@ -45,18 +45,18 @@ void a64_hybrid_u8qa_mmla_4x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -90,7 +90,7 @@ void a64_hybrid_u8qa_mmla_4x16 (
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -104,8 +104,8 @@ void a64_hybrid_u8qa_mmla_4x16 (
"mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -130,7 +130,6 @@ void a64_hybrid_u8qa_mmla_4x16 (
"ldr q4, [x28, #0x60]\n"
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
- "add x24, x24, #0x10\n"
"trn1 v0.2d, v1.2d, v27.2d\n"
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
"ldr q25, [x28, #0x70]\n"
@@ -152,8 +151,9 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
"ldr q24, [x28, #0xf0]\n"
".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
+ "add x28, x28, #0x100\n"
".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
@@ -166,9 +166,9 @@ void a64_hybrid_u8qa_mmla_4x16 (
"ldr q1, [x24, #0x0]\n"
"ldr q5, [x28, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q6, [x28, #0x10]\n"
"ldr q7, [x28, #0x20]\n"
- "cmp x25, #0x20\n"
"ldr q8, [x28, #0x30]\n"
"ldr q9, [x28, #0x40]\n"
"ldr q10, [x28, #0x50]\n"
@@ -176,12 +176,10 @@ void a64_hybrid_u8qa_mmla_4x16 (
"prfm pldl1keep, [x24, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
"trn1 v0.2d, v1.2d, v24.2d\n"
- "trn2 v1.2d, v1.2d, v24.2d\n"
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
"ldr q25, [x28, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v24.2d\n"
".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
"ldr q24, [x28, #0x80]\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
@@ -198,9 +196,11 @@ void a64_hybrid_u8qa_mmla_4x16 (
"ldr q25, [x28, #0xe0]\n"
".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
"ldr q24, [x28, #0xf0]\n"
+ "sub x25, x25, #0x10\n"
".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
+ "add x24, x24, #0x10\n"
+ "add x28, x28, #0x100\n"
".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
@@ -222,24 +222,24 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
"ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n"
"sub x25, x25, #0x8\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
"cmp x25, #0x8\n"
+ ".inst 0x6e9aa414 // ummla v20.4s, v0.16b, v26.16b\n"
"ldr q27, [x28, #0x40]\n"
"ldr q26, [x28, #0x50]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
"ldr q25, [x28, #0x60]\n"
- ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n"
"ldr q24, [x28, #0x70]\n"
- ".inst 0x6e9ea414 // ummla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x6e9da411 // ummla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x6e9ca415 // ummla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n"
".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n"
".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
"cbz x25, 20f\n"
@@ -267,23 +267,23 @@ void a64_hybrid_u8qa_mmla_4x16 (
"tbnz %x[flags], #31, 19f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"19:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "ldr q27, [x28, #0x40]\n"
- "ldr q26, [x28, #0x50]\n"
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x6e99a410 // ummla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a414 // ummla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x6e99a412 // ummla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a416 // ummla v22.4s, v0.16b, v24.16b\n"
"ldr q25, [x28, #0x60]\n"
- ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n"
"ldr q24, [x28, #0x70]\n"
- ".inst 0x6e9ea414 // ummla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x6e9da411 // ummla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x6e9ca415 // ummla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n"
- ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n"
".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
"20:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -297,8 +297,8 @@ void a64_hybrid_u8qa_mmla_4x16 (
"mov v23.16b, v16.16b\n"
"tbnz %x[flags], #31, 21f\n"
"add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
"ld1r { v16.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
"neg v16.4s, v16.4s\n"
"dup v11.4s, v11.s[0]\n"
"mul v11.4s, v11.4s, v16.4s\n"
@@ -312,16 +312,16 @@ void a64_hybrid_u8qa_mmla_4x16 (
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v16.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add v23.4s, v23.4s, v24.4s\n"
"add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- "add x10, x10, #0x40\n"
"ld1r { v0.4s }, [x20]\n"
- "add v18.4s, v18.4s, v21.4s\n"
"add v19.4s, v19.4s, v20.4s\n"
"sqrdmulh v23.4s, v23.4s, v16.4s\n"
+ "add x10, x10, #0x40\n"
"sqrdmulh v17.4s, v17.4s, v16.4s\n"
"sqrdmulh v18.4s, v18.4s, v16.4s\n"
"sqrdmulh v19.4s, v19.4s, v16.4s\n"
@@ -339,21 +339,21 @@ void a64_hybrid_u8qa_mmla_4x16 (
"sqadd v18.4s, v18.4s, v20.4s\n"
"sqadd v19.4s, v19.4s, v16.4s\n"
"22:" // Height 1: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v21.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v21.4s }, [x21]\n"
- "ld1r { v20.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v16.4s }, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
"add v23.4s, v23.4s, v21.4s\n"
"add v17.4s, v17.4s, v21.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
"add v18.4s, v18.4s, v21.4s\n"
"add v19.4s, v19.4s, v21.4s\n"
+ "cmp x9, #0x10\n"
"smin v23.4s, v23.4s, v20.4s\n"
"smin v17.4s, v17.4s, v20.4s\n"
"smin v18.4s, v18.4s, v20.4s\n"
@@ -426,7 +426,7 @@ void a64_hybrid_u8qa_mmla_4x16 (
"movi v15.16b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"34:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -440,8 +440,8 @@ void a64_hybrid_u8qa_mmla_4x16 (
"mov x26, #0x0\n"
"36:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 37f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -471,13 +471,11 @@ void a64_hybrid_u8qa_mmla_4x16 (
"blt 41f\n"
"39:" // Height 2: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
"ldr q25, [x28, #0x70]\n"
".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
"ldr q24, [x28, #0x80]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
"ldr q30, [x28, #0x90]\n"
".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
@@ -493,9 +491,11 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
"ldr q24, [x28, #0xf0]\n"
".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
+ "add x28, x28, #0x100\n"
".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n"
@@ -507,9 +507,9 @@ void a64_hybrid_u8qa_mmla_4x16 (
"ldr q1, [x24, #0x0]\n"
"ldr q2, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q5, [x28, #0x0]\n"
"ldr q6, [x28, #0x10]\n"
- "cmp x25, #0x20\n"
"ldr q7, [x28, #0x20]\n"
"ldr q8, [x28, #0x30]\n"
"ldr q9, [x28, #0x40]\n"
@@ -520,14 +520,11 @@ void a64_hybrid_u8qa_mmla_4x16 (
"bge 39b\n"
"41:" // Height 2: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
"ldr q25, [x28, #0x70]\n"
".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
"ldr q24, [x28, #0x80]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
"ldr q30, [x28, #0x90]\n"
".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
@@ -542,11 +539,14 @@ void a64_hybrid_u8qa_mmla_4x16 (
"ldr q25, [x28, #0xe0]\n"
".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
"ldr q24, [x28, #0xf0]\n"
+ "sub x25, x25, #0x10\n"
".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
+ "add x28, x28, #0x100\n"
".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n"
".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n"
@@ -568,24 +568,24 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"45:" // Height 2: Multiply loop: unique 7: skip row sum
"ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n"
"sub x25, x25, #0x8\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
"cmp x25, #0x8\n"
+ ".inst 0x6e9aa414 // ummla v20.4s, v0.16b, v26.16b\n"
"ldr q27, [x28, #0x40]\n"
"ldr q26, [x28, #0x50]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
"ldr q25, [x28, #0x60]\n"
- ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n"
"ldr q24, [x28, #0x70]\n"
- ".inst 0x6e9ea414 // ummla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x6e9da411 // ummla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x6e9ca415 // ummla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n"
".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n"
".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
"bge 44b\n"
"46:" // Height 2: Multiply loop: Skip odd blocks
"cbz x25, 52f\n"
@@ -620,23 +620,23 @@ void a64_hybrid_u8qa_mmla_4x16 (
"tbnz %x[flags], #31, 51f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"51:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "ldr q27, [x28, #0x40]\n"
- "ldr q26, [x28, #0x50]\n"
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x6e99a410 // ummla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a414 // ummla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x6e99a412 // ummla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a416 // ummla v22.4s, v0.16b, v24.16b\n"
"ldr q25, [x28, #0x60]\n"
- ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n"
"ldr q24, [x28, #0x70]\n"
- ".inst 0x6e9ea414 // ummla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x6e9da411 // ummla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x6e9ca415 // ummla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n"
- ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n"
".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
"52:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -644,21 +644,21 @@ void a64_hybrid_u8qa_mmla_4x16 (
"bne 36b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v24.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x24, x27, x20\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"mov v23.16b, v24.16b\n"
"tbnz %x[flags], #31, 53f\n"
"add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
"ld1r { v24.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
"neg v24.4s, v24.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
@@ -676,10 +676,10 @@ void a64_hybrid_u8qa_mmla_4x16 (
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v24.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v23.4s, v23.4s, v28.4s\n"
"add v20.4s, v20.4s, v27.4s\n"
@@ -701,45 +701,45 @@ void a64_hybrid_u8qa_mmla_4x16 (
"sqrdmulh v19.4s, v19.4s, v24.4s\n"
"tbz %x[flags], #5, 54f\n"
"and v24.16b, v23.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"and v30.16b, v20.16b, v0.16b\n"
"and v29.16b, v21.16b, v0.16b\n"
"and v28.16b, v22.16b, v0.16b\n"
"and v27.16b, v16.16b, v0.16b\n"
"and v26.16b, v17.16b, v0.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
"and v25.16b, v18.16b, v0.16b\n"
+ "and v24.16b, v19.16b, v0.16b\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v24.4s\n"
- "and v24.16b, v19.16b, v0.16b\n"
"sshr v26.4s, v26.4s, #0x1f\n"
"sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v20.4s, v20.4s, v30.4s\n"
"sqadd v21.4s, v21.4s, v29.4s\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
"sqadd v22.4s, v22.4s, v28.4s\n"
"sqadd v16.4s, v16.4s, v27.4s\n"
"sqadd v17.4s, v17.4s, v26.4s\n"
"sqadd v18.4s, v18.4s, v25.4s\n"
"sqadd v19.4s, v19.4s, v24.4s\n"
"54:" // Height 2: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x21]\n"
- "ld1r { v25.4s }, [x20]\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v24.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"add v23.4s, v23.4s, v26.4s\n"
"add v20.4s, v20.4s, v26.4s\n"
"add v21.4s, v21.4s, v26.4s\n"
@@ -773,68 +773,68 @@ void a64_hybrid_u8qa_mmla_4x16 (
"bge 63f\n"
"tbz x9, #3, 58f\n"
"str d23, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
"tbz x9, #2, 56f\n"
"st1 { v23.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
"tbz x9, #1, 55f\n"
"st1 { v23.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
"b 62f\n"
"55:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 62f\n"
"st1 { v23.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
"b 62f\n"
"56:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 57f\n"
"st1 { v23.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
"b 62f\n"
"57:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 62f\n"
"st1 { v23.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
"b 62f\n"
"58:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 60f\n"
"str s23, [x27], #0x4\n"
- "str s16, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
"tbz x9, #1, 59f\n"
"st1 { v23.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
"b 62f\n"
"59:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 62f\n"
"st1 { v23.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
"b 62f\n"
"60:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 61f\n"
"str h23, [x27], #0x2\n"
- "str h16, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
"b 62f\n"
"61:" // Height 2: Partial direct writeback: partial_1_0
"str b23, [x27, #0x0]\n"
- "str b16, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
"62:" // Height 2: Partial direct writeback: Done
"b 64f\n"
"63:" // Height 2: Full writeback
"str q23, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
"64:" // Height 2: Writeback done
"subs x9, x9, #0x10\n"
"bgt 34b\n"
@@ -848,7 +848,7 @@ void a64_hybrid_u8qa_mmla_4x16 (
"movi v15.16b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"66:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -870,8 +870,8 @@ void a64_hybrid_u8qa_mmla_4x16 (
"mov x26, #0x0\n"
"68:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 69f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -905,35 +905,35 @@ void a64_hybrid_u8qa_mmla_4x16 (
"71:" // Height 3: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q14, [x28, #0x60]\n"
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q14, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x60]\n"
".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
"ldr q4, [x28, #0x80]\n"
- ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
"ldr q8, [x28, #0xa0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
"ldr q9, [x28, #0xb0]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
- ".inst 0x6e8ea413 // ummla v19.4s, v0.16b, v14.16b\n"
- ".inst 0x6e8ea45b // ummla v27.4s, v2.16b, v14.16b\n"
+ ".inst 0x6e85a413 // ummla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45b // ummla v27.4s, v2.16b, v5.16b\n"
"ldr q6, [x28, #0xd0]\n"
- ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e8ea417 // ummla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x6e8ea45f // ummla v31.4s, v2.16b, v14.16b\n"
"ldr q5, [x28, #0xe0]\n"
".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
".inst 0x6e84a478 // ummla v24.4s, v3.16b, v4.16b\n"
@@ -962,9 +962,9 @@ void a64_hybrid_u8qa_mmla_4x16 (
"ldr q1, [x24, #0x0]\n"
"ldr q2, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q3, [x22, #0x0]\n"
"ldr q5, [x28, #0x0]\n"
- "cmp x25, #0x20\n"
"ldr q6, [x28, #0x10]\n"
"ldr q7, [x28, #0x20]\n"
"ldr q8, [x28, #0x30]\n"
@@ -977,36 +977,36 @@ void a64_hybrid_u8qa_mmla_4x16 (
"73:" // Height 3: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q14, [x28, #0x60]\n"
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q14, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x60]\n"
".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
"ldr q4, [x28, #0x80]\n"
- ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
+ "sub x25, x25, #0x10\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
"ldr q8, [x28, #0xa0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
"ldr q9, [x28, #0xb0]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
- ".inst 0x6e8ea413 // ummla v19.4s, v0.16b, v14.16b\n"
- ".inst 0x6e8ea45b // ummla v27.4s, v2.16b, v14.16b\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e85a413 // ummla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45b // ummla v27.4s, v2.16b, v5.16b\n"
"ldr q6, [x28, #0xd0]\n"
- ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e8ea417 // ummla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x6e8ea45f // ummla v31.4s, v2.16b, v14.16b\n"
"ldr q5, [x28, #0xe0]\n"
".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
".inst 0x6e84a478 // ummla v24.4s, v3.16b, v4.16b\n"
@@ -1040,34 +1040,34 @@ void a64_hybrid_u8qa_mmla_4x16 (
"cmp x25, #0x8\n"
"blt 78f\n"
"76:" // Height 3: Multiply loop: Odd block loop
- "ldr d3, [x24], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
"ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
"ldr d1, [x22], #0x8\n"
- "trn1 v0.2d, v3.2d, v0.2d\n"
"trn1 v2.2d, v1.2d, v2.2d\n"
"tbnz %x[flags], #31, 77f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "sub x25, x25, #0x8\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x6e83a410 // ummla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n"
"ldr q7, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
+ "sub x25, x25, #0x8\n"
"cmp x25, #0x8\n"
"ldr q5, [x28, #0x40]\n"
"ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a414 // ummla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n"
"ldr q3, [x28, #0x60]\n"
- ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
"ldr q1, [x28, #0x70]\n"
- ".inst 0x6e88a414 // ummla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x6e88a45c // ummla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
@@ -1120,24 +1120,24 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"83:" // Height 3: Multiply loop: unique 12: skip row sum
"ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
+ ".inst 0x6e83a414 // ummla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45c // ummla v28.4s, v2.16b, v3.16b\n"
"ldr q5, [x28, #0x40]\n"
"ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a411 // ummla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n"
"ldr q3, [x28, #0x60]\n"
- ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
"ldr q1, [x28, #0x70]\n"
- ".inst 0x6e88a414 // ummla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x6e88a45c // ummla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
+ "add x28, x28, #0x80\n"
".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n"
".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n"
@@ -1151,18 +1151,18 @@ void a64_hybrid_u8qa_mmla_4x16 (
"bne 68b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v24.2d, v24.2d, v28.2d\n"
"uzp1 v25.2d, v25.2d, v29.2d\n"
"uzp1 v26.2d, v26.2d, v30.2d\n"
@@ -1170,9 +1170,9 @@ void a64_hybrid_u8qa_mmla_4x16 (
"mov v31.16b, v0.16b\n"
"tbnz %x[flags], #31, 85f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "ld1r { v23.4s }, [x20]\n"
"neg v23.4s, v23.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
@@ -1192,10 +1192,10 @@ void a64_hybrid_u8qa_mmla_4x16 (
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v23.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
@@ -1233,18 +1233,18 @@ void a64_hybrid_u8qa_mmla_4x16 (
"and v29.16b, v21.16b, v0.16b\n"
"and v28.16b, v22.16b, v0.16b\n"
"and v23.16b, v16.16b, v0.16b\n"
- "and v3.16b, v17.16b, v0.16b\n"
"sshr v1.4s, v1.4s, #0x1f\n"
"sshr v30.4s, v30.4s, #0x1f\n"
"sshr v29.4s, v29.4s, #0x1f\n"
"sshr v28.4s, v28.4s, #0x1f\n"
"sshr v23.4s, v23.4s, #0x1f\n"
- "and v2.16b, v18.16b, v0.16b\n"
"sqadd v31.4s, v31.4s, v1.4s\n"
"sqadd v20.4s, v20.4s, v30.4s\n"
"sqadd v21.4s, v21.4s, v29.4s\n"
"sqadd v22.4s, v22.4s, v28.4s\n"
"sqadd v16.4s, v16.4s, v23.4s\n"
+ "and v3.16b, v17.16b, v0.16b\n"
+ "and v2.16b, v18.16b, v0.16b\n"
"and v1.16b, v19.16b, v0.16b\n"
"and v30.16b, v24.16b, v0.16b\n"
"and v29.16b, v25.16b, v0.16b\n"
@@ -1265,21 +1265,21 @@ void a64_hybrid_u8qa_mmla_4x16 (
"sqadd v26.4s, v26.4s, v28.4s\n"
"sqadd v27.4s, v27.4s, v23.4s\n"
"86:" // Height 3: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v29.4s }, [x21]\n"
- "ld1r { v28.4s }, [x20]\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v23.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"srshl v24.4s, v24.4s, v0.4s\n"
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
@@ -1332,103 +1332,102 @@ void a64_hybrid_u8qa_mmla_4x16 (
"bge 95f\n"
"tbz x9, #3, 90f\n"
"str d31, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #2, 88f\n"
"st1 { v31.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x9, #1, 87f\n"
"st1 { v31.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 94f\n"
"87:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 94f\n"
"st1 { v31.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 94f\n"
"88:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 89f\n"
"st1 { v31.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 94f\n"
"89:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 94f\n"
"st1 { v31.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 94f\n"
"90:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 92f\n"
"str s31, [x27], #0x4\n"
- "str s16, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x9, #1, 91f\n"
"st1 { v31.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 94f\n"
"91:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 94f\n"
"st1 { v31.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 94f\n"
"92:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 93f\n"
"str h31, [x27], #0x2\n"
- "str h16, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 94f\n"
"93:" // Height 3: Partial direct writeback: partial_1_0
"str b31, [x27, #0x0]\n"
- "str b16, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"94:" // Height 3: Partial direct writeback: Done
"b 96f\n"
"95:" // Height 3: Full writeback
"str q31, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"96:" // Height 3: Writeback done
"subs x9, x9, #0x10\n"
"bgt 66b\n"
"b 130f\n"
"97:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x4\n"
"mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "movi v13.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
"movi v15.16b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"98:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -1450,8 +1449,8 @@ void a64_hybrid_u8qa_mmla_4x16 (
"mov x26, #0x0\n"
"100:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 101f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1489,28 +1488,28 @@ void a64_hybrid_u8qa_mmla_4x16 (
"103:" // Height 4: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
"ldr q4, [x28, #0x60]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
"ldr q5, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
"ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
"ldr q8, [x28, #0xa0]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
"ldr q9, [x28, #0xb0]\n"
+ "add x21, x21, #0x10\n"
".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
@@ -1547,9 +1546,9 @@ void a64_hybrid_u8qa_mmla_4x16 (
"ldr q1, [x24, #0x0]\n"
"ldr q2, [x23, #0x0]\n"
"sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q3, [x22, #0x0]\n"
"ldr q4, [x21, #0x0]\n"
- "cmp x25, #0x20\n"
"ldr q5, [x28, #0x0]\n"
"ldr q6, [x28, #0x10]\n"
"ldr q7, [x28, #0x20]\n"
@@ -1564,32 +1563,32 @@ void a64_hybrid_u8qa_mmla_4x16 (
"105:" // Height 4: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
"sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
"ldr q4, [x28, #0x60]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
"ldr q5, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
"ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
"ldr q8, [x28, #0xa0]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
"ldr q9, [x28, #0xb0]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
+ "add x21, x21, #0x10\n"
".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n"
"ldr q4, [x28, #0xd0]\n"
@@ -1629,35 +1628,35 @@ void a64_hybrid_u8qa_mmla_4x16 (
"cmp x25, #0x8\n"
"blt 110f\n"
"108:" // Height 4: Multiply loop: Odd block loop
- "ldr d3, [x24], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
"ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
"ldr d2, [x22], #0x8\n"
"ldr d1, [x21], #0x8\n"
- "trn1 v0.2d, v3.2d, v0.2d\n"
"trn1 v2.2d, v2.2d, v1.2d\n"
"tbnz %x[flags], #31, 109f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"109:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "sub x25, x25, #0x8\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x6e83a410 // ummla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n"
"ldr q7, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
+ "sub x25, x25, #0x8\n"
"cmp x25, #0x8\n"
"ldr q5, [x28, #0x40]\n"
"ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a414 // ummla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n"
"ldr q3, [x28, #0x60]\n"
- ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
"ldr q1, [x28, #0x70]\n"
- ".inst 0x6e88a414 // ummla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x6e88a45c // ummla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
@@ -1717,24 +1716,24 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"115:" // Height 4: Multiply loop: unique 16: skip row sum
"ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
+ ".inst 0x6e83a414 // ummla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45c // ummla v28.4s, v2.16b, v3.16b\n"
"ldr q5, [x28, #0x40]\n"
"ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a411 // ummla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n"
"ldr q3, [x28, #0x60]\n"
- ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
"ldr q1, [x28, #0x70]\n"
- ".inst 0x6e88a414 // ummla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x6e88a45c // ummla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
+ "add x28, x28, #0x80\n"
".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n"
".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n"
@@ -1748,22 +1747,22 @@ void a64_hybrid_u8qa_mmla_4x16 (
"bne 100b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "add x21, x22, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v23.2d, v24.2d, v28.2d\n"
"uzp2 v24.2d, v24.2d, v28.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v28.2d, v25.2d, v29.2d\n"
"uzp2 v25.2d, v25.2d, v29.2d\n"
"uzp1 v29.2d, v26.2d, v30.2d\n"
@@ -1773,9 +1772,9 @@ void a64_hybrid_u8qa_mmla_4x16 (
"mov v31.16b, v0.16b\n"
"tbnz %x[flags], #31, 117f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "ld1r { v0.4s }, [x20]\n"
"neg v0.4s, v0.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
@@ -1783,8 +1782,8 @@ void a64_hybrid_u8qa_mmla_4x16 (
"dup v13.4s, v13.s[0]\n"
"mul v11.4s, v11.4s, v0.4s\n"
"mul v12.4s, v12.4s, v0.4s\n"
- "mul v14.4s, v14.4s, v0.4s\n"
"mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"117:" // Height 4: skip row sum fixup
"ldr q0, [x10, #0x0]\n"
"ldr q4, [x10, #0x10]\n"
@@ -1797,10 +1796,10 @@ void a64_hybrid_u8qa_mmla_4x16 (
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"ld1r { v1.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add v23.4s, v23.4s, v13.4s\n"
"add v28.4s, v28.4s, v13.4s\n"
@@ -1847,32 +1846,32 @@ void a64_hybrid_u8qa_mmla_4x16 (
"tbz %x[flags], #5, 118f\n"
"and v2.16b, v31.16b, v0.16b\n"
"and v1.16b, v20.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v2.4s\n"
+ "sqadd v20.4s, v20.4s, v1.4s\n"
"and v7.16b, v21.16b, v0.16b\n"
"and v6.16b, v22.16b, v0.16b\n"
"and v5.16b, v16.16b, v0.16b\n"
"and v4.16b, v17.16b, v0.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
"and v3.16b, v18.16b, v0.16b\n"
+ "and v2.16b, v19.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v31.4s, v31.4s, v2.4s\n"
- "sqadd v20.4s, v20.4s, v1.4s\n"
- "and v2.16b, v19.16b, v0.16b\n"
- "and v1.16b, v23.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v7.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v7.4s\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
"sqadd v16.4s, v16.4s, v5.4s\n"
"sqadd v17.4s, v17.4s, v4.4s\n"
"sqadd v18.4s, v18.4s, v3.4s\n"
- "and v7.16b, v28.16b, v0.16b\n"
"sqadd v19.4s, v19.4s, v2.4s\n"
"sqadd v23.4s, v23.4s, v1.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
"and v6.16b, v29.16b, v0.16b\n"
"and v5.16b, v30.16b, v0.16b\n"
"and v4.16b, v24.16b, v0.16b\n"
@@ -1894,21 +1893,21 @@ void a64_hybrid_u8qa_mmla_4x16 (
"sqadd v26.4s, v26.4s, v2.4s\n"
"sqadd v27.4s, v27.4s, v1.4s\n"
"118:" // Height 4: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v3.4s }, [x21]\n"
- "ld1r { v2.4s }, [x20]\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
"ld1r { v1.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v28.4s, v28.4s, v0.4s\n"
"srshl v29.4s, v29.4s, v0.4s\n"
@@ -1980,100 +1979,100 @@ void a64_hybrid_u8qa_mmla_4x16 (
"bge 127f\n"
"tbz x9, #3, 122f\n"
"str d31, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
- "str d24, [x22], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
"tbz x9, #2, 120f\n"
"st1 { v31.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
- "st1 { v23.s }[2], [x23], #0x4\n"
- "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
"tbz x9, #1, 119f\n"
"st1 { v31.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
- "st1 { v23.h }[6], [x23], #0x2\n"
- "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v23.h }[6], [x22], #0x2\n"
+ "st1 { v24.h }[6], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x24]\n"
- "st1 { v23.b }[14], [x23]\n"
- "st1 { v24.b }[14], [x22]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v23.b }[14], [x22]\n"
+ "st1 { v24.b }[14], [x21]\n"
"b 126f\n"
"119:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 126f\n"
"st1 { v31.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x24]\n"
- "st1 { v23.b }[12], [x23]\n"
- "st1 { v24.b }[12], [x22]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v23.b }[12], [x22]\n"
+ "st1 { v24.b }[12], [x21]\n"
"b 126f\n"
"120:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 121f\n"
"st1 { v31.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
- "st1 { v23.h }[4], [x23], #0x2\n"
- "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v23.h }[4], [x22], #0x2\n"
+ "st1 { v24.h }[4], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x24]\n"
- "st1 { v23.b }[10], [x23]\n"
- "st1 { v24.b }[10], [x22]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v23.b }[10], [x22]\n"
+ "st1 { v24.b }[10], [x21]\n"
"b 126f\n"
"121:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 126f\n"
"st1 { v31.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x24]\n"
- "st1 { v23.b }[8], [x23]\n"
- "st1 { v24.b }[8], [x22]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v23.b }[8], [x22]\n"
+ "st1 { v24.b }[8], [x21]\n"
"b 126f\n"
"122:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 124f\n"
"str s31, [x27], #0x4\n"
- "str s16, [x24], #0x4\n"
- "str s23, [x23], #0x4\n"
- "str s24, [x22], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
"tbz x9, #1, 123f\n"
"st1 { v31.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
- "st1 { v23.h }[2], [x23], #0x2\n"
- "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v23.h }[2], [x22], #0x2\n"
+ "st1 { v24.h }[2], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x24]\n"
- "st1 { v23.b }[6], [x23]\n"
- "st1 { v24.b }[6], [x22]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v23.b }[6], [x22]\n"
+ "st1 { v24.b }[6], [x21]\n"
"b 126f\n"
"123:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 126f\n"
"st1 { v31.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x24]\n"
- "st1 { v23.b }[4], [x23]\n"
- "st1 { v24.b }[4], [x22]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v23.b }[4], [x22]\n"
+ "st1 { v24.b }[4], [x21]\n"
"b 126f\n"
"124:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 125f\n"
"str h31, [x27], #0x2\n"
- "str h16, [x24], #0x2\n"
- "str h23, [x23], #0x2\n"
- "str h24, [x22], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h23, [x22], #0x2\n"
+ "str h24, [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x24]\n"
- "st1 { v23.b }[2], [x23]\n"
- "st1 { v24.b }[2], [x22]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "st1 { v24.b }[2], [x21]\n"
"b 126f\n"
"125:" // Height 4: Partial direct writeback: partial_1_0
"str b31, [x27, #0x0]\n"
- "str b16, [x24, #0x0]\n"
- "str b23, [x23, #0x0]\n"
- "str b24, [x22, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b23, [x22, #0x0]\n"
+ "str b24, [x21, #0x0]\n"
"126:" // Height 4: Partial direct writeback: Done
"b 128f\n"
"127:" // Height 4: Full writeback
"str q31, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x24, #0x0]\n"
- "str q23, [x23, #0x0]\n"
- "str q24, [x22, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q24, [x21, #0x0]\n"
"128:" // Height 4: Writeback done
"subs x9, x9, #0x10\n"
"bgt 98b\n"
@@ -2089,8 +2088,8 @@ void a64_hybrid_u8qa_mmla_4x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"130:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16.hpp
deleted file mode 100644
index be1947effc..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#ifdef __aarch64__
-
-#include "../std_transforms_fixed.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- unsigned int, const unsigned int *, \
- IndirectInputArg<uint8_t>, \
- size_t, size_t, \
- const int8_t *, \
- IndirectOutputArg<uint8_t>, \
- const Requantize32 *, const int32_t *, unsigned int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void a64_hybrid_u8s8qa_dot_4x16( ARGLIST );
-
-class cls_a64_hybrid_u8s8qa_dot_4x16
-{
-public:
- typedef uint8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
- typedef uint8_t result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 4;
- }
-
- static unsigned int out_width()
- {
- return 16;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 16, 4> transforms = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
- if (std::is_same<T, uint8_t>::value) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 7.5301 };
- default:
- return { 27.5482 };
- case CPUModel::A510:
- return { 14.81 };
- case CPUModel::V1:
- return { 44.54 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=a64_hybrid_u8s8qa_dot_4x16;
- cls_a64_hybrid_u8s8qa_dot_4x16(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16/generic.cpp
deleted file mode 100644
index e5ca848fb9..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16/generic.cpp
+++ /dev/null
@@ -1,2027 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void a64_hybrid_u8s8qa_dot_4x16 (
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
- size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
- const Requantize32 *qp, const int32_t *col_bias, unsigned int
-)
-{
- struct KernelArgs {
- unsigned int num_strings = {};
- const unsigned int *string_lengths = {};
- size_t N = {};
- const int8_t *B_ptr = {};
- size_t output_offset = {};
- size_t input_initial_col = {};
- size_t input_offset = {};
- void *output_ptr = {};
- } ka;
-
- unsigned long flags=0;
- void *input_ptr;
-
- if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
- ka.output_offset=output_arg.indirect.offset;
- flags |= 0x4;
- } else {
- ka.output_ptr=(void *)(output_arg.direct.base);
- ka.output_offset=output_arg.direct.stride;
- }
-
- if (A_arg.is_indirect) {
- input_ptr=(void *)(A_arg.indirect.ptr);
- ka.input_offset=A_arg.indirect.start_row;
- ka.input_initial_col=A_arg.indirect.start_col;
- flags |= 0x8;
- } else {
- assert(num_strings==1);
- input_ptr=(void *)(A_arg.direct.base);
- ka.input_offset=A_arg.direct.stride;
- }
- ka.num_strings = num_strings;
- ka.string_lengths = string_lengths;
- ka.N = N;
- ka.B_ptr = B_ptr;
- if (qp->c_offset > qp->minval) {
- flags |= 0x20;
- }
- __asm__ __volatile__(
- "1:" // Row loop
- "cmp %x[M], #0x4\n"
- "bge 91f\n"
- "cmp %x[M], #0x2\n"
- "bgt 61f\n"
- "beq 31f\n"
- "mov x10, %x[col_bias]\n"
- "movi v11.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "2:" // Height 1: Column loop
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "3:" // Height 1: setup done
- "mov x26, #0x0\n"
- "4:" // Height 1: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "cbnz x26, 6f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "b 6f\n"
- "5:" // Height 1: setup direct input
- "mov x24, %x[input_ptr]\n"
- "6:" // Height 1: input setup done
- "cmp x25, #0x10\n"
- "blt 11f\n"
- "ldr q0, [x24, #0x0]\n"
- "ldr q4, [x28, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q5, [x28, #0x10]\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q8, [x28, #0x40]\n"
- "ldr q9, [x28, #0x50]\n"
- "ldr q10, [x28, #0x60]\n"
- "blt 9f\n"
- "7:" // Height 1: Multiply loop: Main loop head
- ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q21, [x28, #0x70]\n"
- ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q20, [x28, #0x80]\n"
- ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q26, [x28, #0x90]\n"
- ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q25, [x28, #0xa0]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q24, [x28, #0xb0]\n"
- ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q23, [x28, #0xc0]\n"
- ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q22, [x28, #0xd0]\n"
- ".inst 0x4f20f2b3 // sudot v19.4s, v21.16b, v0.4b[1]\n"
- "ldr q21, [x28, #0xe0]\n"
- ".inst 0x4f00fa90 // sudot v16.4s, v20.16b, v0.4b[2]\n"
- "ldr q20, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4f00fb51 // sudot v17.4s, v26.16b, v0.4b[2]\n"
- ".inst 0x4f00fb32 // sudot v18.4s, v25.16b, v0.4b[2]\n"
- ".inst 0x4f00fb13 // sudot v19.4s, v24.16b, v0.4b[2]\n"
- ".inst 0x4f20faf0 // sudot v16.4s, v23.16b, v0.4b[3]\n"
- ".inst 0x4f20fad1 // sudot v17.4s, v22.16b, v0.4b[3]\n"
- ".inst 0x4f20fab2 // sudot v18.4s, v21.16b, v0.4b[3]\n"
- ".inst 0x4f20fa93 // sudot v19.4s, v20.16b, v0.4b[3]\n"
- "tbnz %x[flags], #31, 8f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "8:" // Height 1: Multiply loop: unique 1: skip row sum
- "ldr q0, [x24, #0x0]\n"
- "ldr q4, [x28, #0x0]\n"
- "sub x25, x25, #0x10\n"
- "ldr q5, [x28, #0x10]\n"
- "ldr q6, [x28, #0x20]\n"
- "cmp x25, #0x20\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q8, [x28, #0x40]\n"
- "ldr q9, [x28, #0x50]\n"
- "ldr q10, [x28, #0x60]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "bge 7b\n"
- "9:" // Height 1: Multiply loop: Single iteration only
- ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q21, [x28, #0x70]\n"
- ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q20, [x28, #0x80]\n"
- ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q26, [x28, #0x90]\n"
- ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q25, [x28, #0xa0]\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q24, [x28, #0xb0]\n"
- ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q23, [x28, #0xc0]\n"
- ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q22, [x28, #0xd0]\n"
- ".inst 0x4f20f2b3 // sudot v19.4s, v21.16b, v0.4b[1]\n"
- "ldr q21, [x28, #0xe0]\n"
- ".inst 0x4f00fa90 // sudot v16.4s, v20.16b, v0.4b[2]\n"
- "ldr q20, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4f00fb51 // sudot v17.4s, v26.16b, v0.4b[2]\n"
- ".inst 0x4f00fb32 // sudot v18.4s, v25.16b, v0.4b[2]\n"
- ".inst 0x4f00fb13 // sudot v19.4s, v24.16b, v0.4b[2]\n"
- ".inst 0x4f20faf0 // sudot v16.4s, v23.16b, v0.4b[3]\n"
- ".inst 0x4f20fad1 // sudot v17.4s, v22.16b, v0.4b[3]\n"
- ".inst 0x4f20fab2 // sudot v18.4s, v21.16b, v0.4b[3]\n"
- ".inst 0x4f20fa93 // sudot v19.4s, v20.16b, v0.4b[3]\n"
- "tbnz %x[flags], #31, 10f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "10:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x24, #0x80]\n"
- "11:" // Height 1: Multiply loop: Main loop skip
- "cbz x25, 18f\n"
- "cmp x25, #0x4\n"
- "blt 14f\n"
- "12:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x24], #0x4\n"
- "tbnz %x[flags], #31, 13f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q23, [x28, #0x0]\n"
- "ldr q22, [x28, #0x10]\n"
- "sub x25, x25, #0x4\n"
- "ldr q21, [x28, #0x20]\n"
- "ldr q20, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f00f2f0 // sudot v16.4s, v23.16b, v0.4b[0]\n"
- ".inst 0x4f00f2d1 // sudot v17.4s, v22.16b, v0.4b[0]\n"
- ".inst 0x4f00f2b2 // sudot v18.4s, v21.16b, v0.4b[0]\n"
- ".inst 0x4f00f293 // sudot v19.4s, v20.16b, v0.4b[0]\n"
- "bge 12b\n"
- "14:" // Height 1: Multiply loop: Skip odd blocks
- "cbz x25, 18f\n"
- "tbz x25, #1, 15f\n"
- "ldr h0, [x24], #0x2\n"
- "tbz x25, #0, 16f\n"
- "ld1 { v0.b }[2], [x24]\n"
- "b 16f\n"
- "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x24, #0x0]\n"
- "16:" // Height 1: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 17f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q23, [x28, #0x0]\n"
- "ldr q22, [x28, #0x10]\n"
- "ldr q21, [x28, #0x20]\n"
- "ldr q20, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f00f2f0 // sudot v16.4s, v23.16b, v0.4b[0]\n"
- ".inst 0x4f00f2d1 // sudot v17.4s, v22.16b, v0.4b[0]\n"
- ".inst 0x4f00f2b2 // sudot v18.4s, v21.16b, v0.4b[0]\n"
- ".inst 0x4f00f293 // sudot v19.4s, v20.16b, v0.4b[0]\n"
- "18:" // Height 1: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 4b\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "tbnz %x[flags], #31, 19f\n"
- "add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "ld1r { v20.4s }, [x20]\n"
- "neg v20.4s, v20.4s\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "mul v11.4s, v11.4s, v20.4s\n"
- "19:" // Height 1: skip row sum fixup
- "ldr q24, [x10, #0x0]\n"
- "ldr q23, [x10, #0x10]\n"
- "add v16.4s, v16.4s, v11.4s\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "ldr q22, [x10, #0x20]\n"
- "ldr q21, [x10, #0x30]\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "add v19.4s, v19.4s, v11.4s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "ld1r { v20.4s }, [x20]\n"
- "add v16.4s, v16.4s, v24.4s\n"
- "add v17.4s, v17.4s, v23.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add x10, x10, #0x40\n"
- "ld1r { v0.4s }, [x20]\n"
- "add v18.4s, v18.4s, v22.4s\n"
- "add v19.4s, v19.4s, v21.4s\n"
- "sqrdmulh v16.4s, v16.4s, v20.4s\n"
- "sqrdmulh v17.4s, v17.4s, v20.4s\n"
- "sqrdmulh v18.4s, v18.4s, v20.4s\n"
- "sqrdmulh v19.4s, v19.4s, v20.4s\n"
- "tbz %x[flags], #5, 20f\n"
- "and v23.16b, v16.16b, v0.16b\n"
- "and v22.16b, v17.16b, v0.16b\n"
- "and v21.16b, v18.16b, v0.16b\n"
- "and v20.16b, v19.16b, v0.16b\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v23.4s\n"
- "sqadd v17.4s, v17.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v20.4s\n"
- "20:" // Height 1: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
- "srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v22.4s }, [x21]\n"
- "ld1r { v21.4s }, [x20]\n"
- "srshl v18.4s, v18.4s, v0.4s\n"
- "srshl v19.4s, v19.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v20.4s }, [x20]\n"
- "add v16.4s, v16.4s, v22.4s\n"
- "add v17.4s, v17.4s, v22.4s\n"
- "add v18.4s, v18.4s, v22.4s\n"
- "add v19.4s, v19.4s, v22.4s\n"
- "smin v16.4s, v16.4s, v21.4s\n"
- "smin v17.4s, v17.4s, v21.4s\n"
- "smin v18.4s, v18.4s, v21.4s\n"
- "smin v19.4s, v19.4s, v21.4s\n"
- "smax v16.4s, v16.4s, v20.4s\n"
- "smax v17.4s, v17.4s, v20.4s\n"
- "smax v18.4s, v18.4s, v20.4s\n"
- "smax v19.4s, v19.4s, v20.4s\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "bge 29f\n"
- "tbz x9, #3, 24f\n"
- "str d16, [x27], #0x8\n"
- "tbz x9, #2, 22f\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "tbz x9, #1, 21f\n"
- "st1 { v16.h }[6], [x27], #0x2\n"
- "tbz x9, #0, 28f\n"
- "st1 { v16.b }[14], [x27]\n"
- "b 28f\n"
- "21:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x9, #0, 28f\n"
- "st1 { v16.b }[12], [x27]\n"
- "b 28f\n"
- "22:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x9, #1, 23f\n"
- "st1 { v16.h }[4], [x27], #0x2\n"
- "tbz x9, #0, 28f\n"
- "st1 { v16.b }[10], [x27]\n"
- "b 28f\n"
- "23:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x9, #0, 28f\n"
- "st1 { v16.b }[8], [x27]\n"
- "b 28f\n"
- "24:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x9, #2, 26f\n"
- "str s16, [x27], #0x4\n"
- "tbz x9, #1, 25f\n"
- "st1 { v16.h }[2], [x27], #0x2\n"
- "tbz x9, #0, 28f\n"
- "st1 { v16.b }[6], [x27]\n"
- "b 28f\n"
- "25:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x9, #0, 28f\n"
- "st1 { v16.b }[4], [x27]\n"
- "b 28f\n"
- "26:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x9, #1, 27f\n"
- "str h16, [x27], #0x2\n"
- "tbz x9, #0, 28f\n"
- "st1 { v16.b }[2], [x27]\n"
- "b 28f\n"
- "27:" // Height 1: Partial direct writeback: partial_1_0
- "str b16, [x27, #0x0]\n"
- "28:" // Height 1: Partial direct writeback: Done
- "b 30f\n"
- "29:" // Height 1: Full writeback
- "str q16, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "30:" // Height 1: Writeback done
- "subs x9, x9, #0x10\n"
- "bgt 2b\n"
- "b 122f\n"
- "31:" // Height 2
- "mov x10, %x[col_bias]\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "movi v15.16b, #0x1\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "32:" // Height 2: Column loop
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "33:" // Height 2: setup done
- "mov x26, #0x0\n"
- "34:" // Height 2: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 35f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "cbnz x26, 36f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "b 36f\n"
- "35:" // Height 2: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "36:" // Height 2: input setup done
- "cmp x25, #0x10\n"
- "blt 41f\n"
- "ldr q0, [x24, #0x0]\n"
- "ldr q1, [x23, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q8, [x28, #0x40]\n"
- "ldr q9, [x28, #0x50]\n"
- "ldr q10, [x28, #0x60]\n"
- "blt 39f\n"
- "37:" // Height 2: Multiply loop: Main loop head
- ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q25, [x28, #0x70]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q24, [x28, #0x80]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q30, [x28, #0x90]\n"
- ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q29, [x28, #0xa0]\n"
- ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n"
- ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q28, [x28, #0xb0]\n"
- ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n"
- ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q27, [x28, #0xc0]\n"
- ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q26, [x28, #0xd0]\n"
- ".inst 0x4f20f333 // sudot v19.4s, v25.16b, v0.4b[1]\n"
- ".inst 0x4f21f337 // sudot v23.4s, v25.16b, v1.4b[1]\n"
- "ldr q25, [x28, #0xe0]\n"
- ".inst 0x4f00fb10 // sudot v16.4s, v24.16b, v0.4b[2]\n"
- ".inst 0x4f01fb14 // sudot v20.4s, v24.16b, v1.4b[2]\n"
- "ldr q24, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4f00fbd1 // sudot v17.4s, v30.16b, v0.4b[2]\n"
- ".inst 0x4f01fbd5 // sudot v21.4s, v30.16b, v1.4b[2]\n"
- ".inst 0x4f00fbb2 // sudot v18.4s, v29.16b, v0.4b[2]\n"
- ".inst 0x4f01fbb6 // sudot v22.4s, v29.16b, v1.4b[2]\n"
- ".inst 0x4f00fb93 // sudot v19.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x4f01fb97 // sudot v23.4s, v28.16b, v1.4b[2]\n"
- ".inst 0x4f20fb70 // sudot v16.4s, v27.16b, v0.4b[3]\n"
- ".inst 0x4f21fb74 // sudot v20.4s, v27.16b, v1.4b[3]\n"
- ".inst 0x4f20fb51 // sudot v17.4s, v26.16b, v0.4b[3]\n"
- ".inst 0x4f21fb55 // sudot v21.4s, v26.16b, v1.4b[3]\n"
- ".inst 0x4f20fb32 // sudot v18.4s, v25.16b, v0.4b[3]\n"
- ".inst 0x4f21fb36 // sudot v22.4s, v25.16b, v1.4b[3]\n"
- ".inst 0x4f20fb13 // sudot v19.4s, v24.16b, v0.4b[3]\n"
- ".inst 0x4f21fb17 // sudot v23.4s, v24.16b, v1.4b[3]\n"
- "tbnz %x[flags], #31, 38f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- "38:" // Height 2: Multiply loop: unique 5: skip row sum
- "ldr q0, [x24, #0x0]\n"
- "ldr q1, [x23, #0x0]\n"
- "sub x25, x25, #0x10\n"
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
- "cmp x25, #0x20\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q8, [x28, #0x40]\n"
- "ldr q9, [x28, #0x50]\n"
- "ldr q10, [x28, #0x60]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "bge 37b\n"
- "39:" // Height 2: Multiply loop: Single iteration only
- ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q25, [x28, #0x70]\n"
- "sub x25, x25, #0x10\n"
- ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q24, [x28, #0x80]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q30, [x28, #0x90]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q29, [x28, #0xa0]\n"
- ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n"
- ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q28, [x28, #0xb0]\n"
- ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n"
- ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q27, [x28, #0xc0]\n"
- ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q26, [x28, #0xd0]\n"
- ".inst 0x4f20f333 // sudot v19.4s, v25.16b, v0.4b[1]\n"
- ".inst 0x4f21f337 // sudot v23.4s, v25.16b, v1.4b[1]\n"
- "ldr q25, [x28, #0xe0]\n"
- ".inst 0x4f00fb10 // sudot v16.4s, v24.16b, v0.4b[2]\n"
- ".inst 0x4f01fb14 // sudot v20.4s, v24.16b, v1.4b[2]\n"
- "ldr q24, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4f00fbd1 // sudot v17.4s, v30.16b, v0.4b[2]\n"
- ".inst 0x4f01fbd5 // sudot v21.4s, v30.16b, v1.4b[2]\n"
- ".inst 0x4f00fbb2 // sudot v18.4s, v29.16b, v0.4b[2]\n"
- ".inst 0x4f01fbb6 // sudot v22.4s, v29.16b, v1.4b[2]\n"
- ".inst 0x4f00fb93 // sudot v19.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x4f01fb97 // sudot v23.4s, v28.16b, v1.4b[2]\n"
- ".inst 0x4f20fb70 // sudot v16.4s, v27.16b, v0.4b[3]\n"
- ".inst 0x4f21fb74 // sudot v20.4s, v27.16b, v1.4b[3]\n"
- ".inst 0x4f20fb51 // sudot v17.4s, v26.16b, v0.4b[3]\n"
- ".inst 0x4f21fb55 // sudot v21.4s, v26.16b, v1.4b[3]\n"
- ".inst 0x4f20fb32 // sudot v18.4s, v25.16b, v0.4b[3]\n"
- ".inst 0x4f21fb36 // sudot v22.4s, v25.16b, v1.4b[3]\n"
- ".inst 0x4f20fb13 // sudot v19.4s, v24.16b, v0.4b[3]\n"
- ".inst 0x4f21fb17 // sudot v23.4s, v24.16b, v1.4b[3]\n"
- "tbnz %x[flags], #31, 40f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- "40:" // Height 2: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "41:" // Height 2: Multiply loop: Main loop skip
- "cbz x25, 48f\n"
- "cmp x25, #0x4\n"
- "blt 44f\n"
- "42:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x24], #0x4\n"
- "ldr s1, [x23], #0x4\n"
- "tbnz %x[flags], #31, 43f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- "43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q27, [x28, #0x0]\n"
- "ldr q26, [x28, #0x10]\n"
- "sub x25, x25, #0x4\n"
- "ldr q25, [x28, #0x20]\n"
- "ldr q24, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f00f370 // sudot v16.4s, v27.16b, v0.4b[0]\n"
- ".inst 0x4f01f374 // sudot v20.4s, v27.16b, v1.4b[0]\n"
- ".inst 0x4f00f351 // sudot v17.4s, v26.16b, v0.4b[0]\n"
- ".inst 0x4f01f355 // sudot v21.4s, v26.16b, v1.4b[0]\n"
- ".inst 0x4f00f332 // sudot v18.4s, v25.16b, v0.4b[0]\n"
- ".inst 0x4f01f336 // sudot v22.4s, v25.16b, v1.4b[0]\n"
- ".inst 0x4f00f313 // sudot v19.4s, v24.16b, v0.4b[0]\n"
- ".inst 0x4f01f317 // sudot v23.4s, v24.16b, v1.4b[0]\n"
- "bge 42b\n"
- "44:" // Height 2: Multiply loop: Skip odd blocks
- "cbz x25, 48f\n"
- "tbz x25, #1, 45f\n"
- "ldr h0, [x24], #0x2\n"
- "ldr h1, [x23], #0x2\n"
- "tbz x25, #0, 46f\n"
- "ld1 { v0.b }[2], [x24]\n"
- "ld1 { v1.b }[2], [x23]\n"
- "b 46f\n"
- "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x24, #0x0]\n"
- "ldr b1, [x23, #0x0]\n"
- "46:" // Height 2: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 47f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- "47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q27, [x28, #0x0]\n"
- "ldr q26, [x28, #0x10]\n"
- "ldr q25, [x28, #0x20]\n"
- "ldr q24, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f00f370 // sudot v16.4s, v27.16b, v0.4b[0]\n"
- ".inst 0x4f01f374 // sudot v20.4s, v27.16b, v1.4b[0]\n"
- ".inst 0x4f00f351 // sudot v17.4s, v26.16b, v0.4b[0]\n"
- ".inst 0x4f01f355 // sudot v21.4s, v26.16b, v1.4b[0]\n"
- ".inst 0x4f00f332 // sudot v18.4s, v25.16b, v0.4b[0]\n"
- ".inst 0x4f01f336 // sudot v22.4s, v25.16b, v1.4b[0]\n"
- ".inst 0x4f00f313 // sudot v19.4s, v24.16b, v0.4b[0]\n"
- ".inst 0x4f01f317 // sudot v23.4s, v24.16b, v1.4b[0]\n"
- "48:" // Height 2: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 34b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "tbnz %x[flags], #31, 49f\n"
- "add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "addp v12.4s, v12.4s, v12.4s\n"
- "ld1r { v24.4s }, [x20]\n"
- "neg v24.4s, v24.4s\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "addp v12.4s, v12.4s, v12.4s\n"
- "mul v11.4s, v11.4s, v24.4s\n"
- "mul v12.4s, v12.4s, v24.4s\n"
- "49:" // Height 2: skip row sum fixup
- "ldr q28, [x10, #0x0]\n"
- "ldr q27, [x10, #0x10]\n"
- "add v16.4s, v16.4s, v11.4s\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "ldr q26, [x10, #0x20]\n"
- "ldr q25, [x10, #0x30]\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "add v19.4s, v19.4s, v11.4s\n"
- "add v20.4s, v20.4s, v12.4s\n"
- "add v21.4s, v21.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "ld1r { v24.4s }, [x20]\n"
- "add v22.4s, v22.4s, v12.4s\n"
- "add v23.4s, v23.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add v16.4s, v16.4s, v28.4s\n"
- "add v17.4s, v17.4s, v27.4s\n"
- "add x10, x10, #0x40\n"
- "add v18.4s, v18.4s, v26.4s\n"
- "add v19.4s, v19.4s, v25.4s\n"
- "add v20.4s, v20.4s, v28.4s\n"
- "ld1r { v0.4s }, [x20]\n"
- "add v21.4s, v21.4s, v27.4s\n"
- "add v22.4s, v22.4s, v26.4s\n"
- "add v23.4s, v23.4s, v25.4s\n"
- "sqrdmulh v16.4s, v16.4s, v24.4s\n"
- "sqrdmulh v17.4s, v17.4s, v24.4s\n"
- "sqrdmulh v18.4s, v18.4s, v24.4s\n"
- "sqrdmulh v19.4s, v19.4s, v24.4s\n"
- "sqrdmulh v20.4s, v20.4s, v24.4s\n"
- "sqrdmulh v21.4s, v21.4s, v24.4s\n"
- "sqrdmulh v22.4s, v22.4s, v24.4s\n"
- "sqrdmulh v23.4s, v23.4s, v24.4s\n"
- "tbz %x[flags], #5, 50f\n"
- "and v24.16b, v16.16b, v0.16b\n"
- "and v30.16b, v17.16b, v0.16b\n"
- "and v29.16b, v18.16b, v0.16b\n"
- "and v28.16b, v19.16b, v0.16b\n"
- "and v27.16b, v20.16b, v0.16b\n"
- "and v26.16b, v21.16b, v0.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "and v25.16b, v22.16b, v0.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v24.4s\n"
- "and v24.16b, v23.16b, v0.16b\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sqadd v18.4s, v18.4s, v29.4s\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v28.4s\n"
- "sqadd v20.4s, v20.4s, v27.4s\n"
- "sqadd v21.4s, v21.4s, v26.4s\n"
- "sqadd v22.4s, v22.4s, v25.4s\n"
- "sqadd v23.4s, v23.4s, v24.4s\n"
- "50:" // Height 2: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
- "srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x21]\n"
- "ld1r { v25.4s }, [x20]\n"
- "srshl v18.4s, v18.4s, v0.4s\n"
- "srshl v19.4s, v19.4s, v0.4s\n"
- "srshl v20.4s, v20.4s, v0.4s\n"
- "srshl v21.4s, v21.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v24.4s }, [x20]\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "add v16.4s, v16.4s, v26.4s\n"
- "add v17.4s, v17.4s, v26.4s\n"
- "add v18.4s, v18.4s, v26.4s\n"
- "add v19.4s, v19.4s, v26.4s\n"
- "add v20.4s, v20.4s, v26.4s\n"
- "add v21.4s, v21.4s, v26.4s\n"
- "add v22.4s, v22.4s, v26.4s\n"
- "add v23.4s, v23.4s, v26.4s\n"
- "smin v16.4s, v16.4s, v25.4s\n"
- "smin v17.4s, v17.4s, v25.4s\n"
- "smin v18.4s, v18.4s, v25.4s\n"
- "smin v19.4s, v19.4s, v25.4s\n"
- "smin v20.4s, v20.4s, v25.4s\n"
- "smin v21.4s, v21.4s, v25.4s\n"
- "smin v22.4s, v22.4s, v25.4s\n"
- "smin v23.4s, v23.4s, v25.4s\n"
- "smax v16.4s, v16.4s, v24.4s\n"
- "smax v17.4s, v17.4s, v24.4s\n"
- "smax v18.4s, v18.4s, v24.4s\n"
- "smax v19.4s, v19.4s, v24.4s\n"
- "smax v20.4s, v20.4s, v24.4s\n"
- "smax v21.4s, v21.4s, v24.4s\n"
- "smax v22.4s, v22.4s, v24.4s\n"
- "smax v23.4s, v23.4s, v24.4s\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v18.8h, v18.8h, v19.8h\n"
- "uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v17.8h, v22.8h, v23.8h\n"
- "uzp1 v16.16b, v16.16b, v18.16b\n"
- "uzp1 v20.16b, v20.16b, v17.16b\n"
- "bge 59f\n"
- "tbz x9, #3, 54f\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x24], #0x8\n"
- "tbz x9, #2, 52f\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "tbz x9, #1, 51f\n"
- "st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "tbz x9, #0, 58f\n"
- "st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x24]\n"
- "b 58f\n"
- "51:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x9, #0, 58f\n"
- "st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x24]\n"
- "b 58f\n"
- "52:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x9, #1, 53f\n"
- "st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "tbz x9, #0, 58f\n"
- "st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x24]\n"
- "b 58f\n"
- "53:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x9, #0, 58f\n"
- "st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x24]\n"
- "b 58f\n"
- "54:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x9, #2, 56f\n"
- "str s16, [x27], #0x4\n"
- "str s20, [x24], #0x4\n"
- "tbz x9, #1, 55f\n"
- "st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "tbz x9, #0, 58f\n"
- "st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x24]\n"
- "b 58f\n"
- "55:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x9, #0, 58f\n"
- "st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x24]\n"
- "b 58f\n"
- "56:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x9, #1, 57f\n"
- "str h16, [x27], #0x2\n"
- "str h20, [x24], #0x2\n"
- "tbz x9, #0, 58f\n"
- "st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x24]\n"
- "b 58f\n"
- "57:" // Height 2: Partial direct writeback: partial_1_0
- "str b16, [x27, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "58:" // Height 2: Partial direct writeback: Done
- "b 60f\n"
- "59:" // Height 2: Full writeback
- "str q16, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "60:" // Height 2: Writeback done
- "subs x9, x9, #0x10\n"
- "bgt 32b\n"
- "b 122f\n"
- "61:" // Height 3
- "mov x10, %x[col_bias]\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "movi v13.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "62:" // Height 3: Column loop
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "63:" // Height 3: setup done
- "mov x26, #0x0\n"
- "64:" // Height 3: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 65f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "cbnz x26, 66f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "b 66f\n"
- "65:" // Height 3: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "66:" // Height 3: input setup done
- "cmp x25, #0x10\n"
- "blt 71f\n"
- "ldr q0, [x24, #0x0]\n"
- "ldr q1, [x23, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q8, [x28, #0x40]\n"
- "ldr q9, [x28, #0x50]\n"
- "ldr q10, [x28, #0x60]\n"
- "blt 69f\n"
- "67:" // Height 3: Multiply loop: Main loop head
- ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f02f098 // sudot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q29, [x28, #0x70]\n"
- ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f02f0b9 // sudot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q28, [x28, #0x80]\n"
- ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f02f0da // sudot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q5, [x28, #0x90]\n"
- ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0fb // sudot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q4, [x28, #0xa0]\n"
- ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n"
- ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n"
- ".inst 0x4f22f118 // sudot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q3, [x28, #0xb0]\n"
- ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n"
- ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x4f22f139 // sudot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q31, [x28, #0xc0]\n"
- ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n"
- ".inst 0x4f22f15a // sudot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q30, [x28, #0xd0]\n"
- ".inst 0x4f20f3b3 // sudot v19.4s, v29.16b, v0.4b[1]\n"
- ".inst 0x4f21f3b7 // sudot v23.4s, v29.16b, v1.4b[1]\n"
- ".inst 0x4f22f3bb // sudot v27.4s, v29.16b, v2.4b[1]\n"
- "ldr q29, [x28, #0xe0]\n"
- ".inst 0x4f00fb90 // sudot v16.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x4f01fb94 // sudot v20.4s, v28.16b, v1.4b[2]\n"
- ".inst 0x4f02fb98 // sudot v24.4s, v28.16b, v2.4b[2]\n"
- "ldr q28, [x28, #0xf0]\n"
- ".inst 0x4f00f8b1 // sudot v17.4s, v5.16b, v0.4b[2]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4f01f8b5 // sudot v21.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4f02f8b9 // sudot v25.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x4f00f892 // sudot v18.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x4f01f896 // sudot v22.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x4f02f89a // sudot v26.4s, v4.16b, v2.4b[2]\n"
- ".inst 0x4f00f873 // sudot v19.4s, v3.16b, v0.4b[2]\n"
- ".inst 0x4f01f877 // sudot v23.4s, v3.16b, v1.4b[2]\n"
- ".inst 0x4f02f87b // sudot v27.4s, v3.16b, v2.4b[2]\n"
- ".inst 0x4f20fbf0 // sudot v16.4s, v31.16b, v0.4b[3]\n"
- ".inst 0x4f21fbf4 // sudot v20.4s, v31.16b, v1.4b[3]\n"
- ".inst 0x4f22fbf8 // sudot v24.4s, v31.16b, v2.4b[3]\n"
- ".inst 0x4f20fbd1 // sudot v17.4s, v30.16b, v0.4b[3]\n"
- ".inst 0x4f21fbd5 // sudot v21.4s, v30.16b, v1.4b[3]\n"
- ".inst 0x4f22fbd9 // sudot v25.4s, v30.16b, v2.4b[3]\n"
- ".inst 0x4f20fbb2 // sudot v18.4s, v29.16b, v0.4b[3]\n"
- ".inst 0x4f21fbb6 // sudot v22.4s, v29.16b, v1.4b[3]\n"
- ".inst 0x4f22fbba // sudot v26.4s, v29.16b, v2.4b[3]\n"
- ".inst 0x4f20fb93 // sudot v19.4s, v28.16b, v0.4b[3]\n"
- ".inst 0x4f21fb97 // sudot v23.4s, v28.16b, v1.4b[3]\n"
- ".inst 0x4f22fb9b // sudot v27.4s, v28.16b, v2.4b[3]\n"
- "tbnz %x[flags], #31, 68f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "68:" // Height 3: Multiply loop: unique 9: skip row sum
- "ldr q0, [x24, #0x0]\n"
- "ldr q1, [x23, #0x0]\n"
- "sub x25, x25, #0x10\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q4, [x28, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q5, [x28, #0x10]\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q8, [x28, #0x40]\n"
- "ldr q9, [x28, #0x50]\n"
- "ldr q10, [x28, #0x60]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "bge 67b\n"
- "69:" // Height 3: Multiply loop: Single iteration only
- ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f02f098 // sudot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q29, [x28, #0x70]\n"
- ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f02f0b9 // sudot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q28, [x28, #0x80]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f02f0da // sudot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q5, [x28, #0x90]\n"
- ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0fb // sudot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q4, [x28, #0xa0]\n"
- ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n"
- ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n"
- ".inst 0x4f22f118 // sudot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q3, [x28, #0xb0]\n"
- ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n"
- ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x4f22f139 // sudot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q31, [x28, #0xc0]\n"
- ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n"
- ".inst 0x4f22f15a // sudot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q30, [x28, #0xd0]\n"
- ".inst 0x4f20f3b3 // sudot v19.4s, v29.16b, v0.4b[1]\n"
- ".inst 0x4f21f3b7 // sudot v23.4s, v29.16b, v1.4b[1]\n"
- ".inst 0x4f22f3bb // sudot v27.4s, v29.16b, v2.4b[1]\n"
- "ldr q29, [x28, #0xe0]\n"
- ".inst 0x4f00fb90 // sudot v16.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x4f01fb94 // sudot v20.4s, v28.16b, v1.4b[2]\n"
- ".inst 0x4f02fb98 // sudot v24.4s, v28.16b, v2.4b[2]\n"
- "ldr q28, [x28, #0xf0]\n"
- ".inst 0x4f00f8b1 // sudot v17.4s, v5.16b, v0.4b[2]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4f01f8b5 // sudot v21.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4f02f8b9 // sudot v25.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x4f00f892 // sudot v18.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x4f01f896 // sudot v22.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x4f02f89a // sudot v26.4s, v4.16b, v2.4b[2]\n"
- ".inst 0x4f00f873 // sudot v19.4s, v3.16b, v0.4b[2]\n"
- ".inst 0x4f01f877 // sudot v23.4s, v3.16b, v1.4b[2]\n"
- ".inst 0x4f02f87b // sudot v27.4s, v3.16b, v2.4b[2]\n"
- ".inst 0x4f20fbf0 // sudot v16.4s, v31.16b, v0.4b[3]\n"
- ".inst 0x4f21fbf4 // sudot v20.4s, v31.16b, v1.4b[3]\n"
- ".inst 0x4f22fbf8 // sudot v24.4s, v31.16b, v2.4b[3]\n"
- ".inst 0x4f20fbd1 // sudot v17.4s, v30.16b, v0.4b[3]\n"
- ".inst 0x4f21fbd5 // sudot v21.4s, v30.16b, v1.4b[3]\n"
- ".inst 0x4f22fbd9 // sudot v25.4s, v30.16b, v2.4b[3]\n"
- ".inst 0x4f20fbb2 // sudot v18.4s, v29.16b, v0.4b[3]\n"
- ".inst 0x4f21fbb6 // sudot v22.4s, v29.16b, v1.4b[3]\n"
- ".inst 0x4f22fbba // sudot v26.4s, v29.16b, v2.4b[3]\n"
- ".inst 0x4f20fb93 // sudot v19.4s, v28.16b, v0.4b[3]\n"
- ".inst 0x4f21fb97 // sudot v23.4s, v28.16b, v1.4b[3]\n"
- ".inst 0x4f22fb9b // sudot v27.4s, v28.16b, v2.4b[3]\n"
- "tbnz %x[flags], #31, 70f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "70:" // Height 3: Multiply loop: unique 10: skip row sum
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "71:" // Height 3: Multiply loop: Main loop skip
- "cbz x25, 78f\n"
- "cmp x25, #0x4\n"
- "blt 74f\n"
- "72:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x24], #0x4\n"
- "ldr s1, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "tbnz %x[flags], #31, 73f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q31, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
- "sub x25, x25, #0x4\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f00f3f0 // sudot v16.4s, v31.16b, v0.4b[0]\n"
- ".inst 0x4f01f3f4 // sudot v20.4s, v31.16b, v1.4b[0]\n"
- ".inst 0x4f02f3f8 // sudot v24.4s, v31.16b, v2.4b[0]\n"
- ".inst 0x4f00f3d1 // sudot v17.4s, v30.16b, v0.4b[0]\n"
- ".inst 0x4f01f3d5 // sudot v21.4s, v30.16b, v1.4b[0]\n"
- ".inst 0x4f02f3d9 // sudot v25.4s, v30.16b, v2.4b[0]\n"
- ".inst 0x4f00f3b2 // sudot v18.4s, v29.16b, v0.4b[0]\n"
- ".inst 0x4f01f3b6 // sudot v22.4s, v29.16b, v1.4b[0]\n"
- ".inst 0x4f02f3ba // sudot v26.4s, v29.16b, v2.4b[0]\n"
- ".inst 0x4f00f393 // sudot v19.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x4f01f397 // sudot v23.4s, v28.16b, v1.4b[0]\n"
- ".inst 0x4f02f39b // sudot v27.4s, v28.16b, v2.4b[0]\n"
- "bge 72b\n"
- "74:" // Height 3: Multiply loop: Skip odd blocks
- "cbz x25, 78f\n"
- "tbz x25, #1, 75f\n"
- "ldr h0, [x24], #0x2\n"
- "ldr h1, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "tbz x25, #0, 76f\n"
- "ld1 { v0.b }[2], [x24]\n"
- "ld1 { v1.b }[2], [x23]\n"
- "ld1 { v2.b }[2], [x22]\n"
- "b 76f\n"
- "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x24, #0x0]\n"
- "ldr b1, [x23, #0x0]\n"
- "ldr b2, [x22, #0x0]\n"
- "76:" // Height 3: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 77f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q31, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f00f3f0 // sudot v16.4s, v31.16b, v0.4b[0]\n"
- ".inst 0x4f01f3f4 // sudot v20.4s, v31.16b, v1.4b[0]\n"
- ".inst 0x4f02f3f8 // sudot v24.4s, v31.16b, v2.4b[0]\n"
- ".inst 0x4f00f3d1 // sudot v17.4s, v30.16b, v0.4b[0]\n"
- ".inst 0x4f01f3d5 // sudot v21.4s, v30.16b, v1.4b[0]\n"
- ".inst 0x4f02f3d9 // sudot v25.4s, v30.16b, v2.4b[0]\n"
- ".inst 0x4f00f3b2 // sudot v18.4s, v29.16b, v0.4b[0]\n"
- ".inst 0x4f01f3b6 // sudot v22.4s, v29.16b, v1.4b[0]\n"
- ".inst 0x4f02f3ba // sudot v26.4s, v29.16b, v2.4b[0]\n"
- ".inst 0x4f00f393 // sudot v19.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x4f01f397 // sudot v23.4s, v28.16b, v1.4b[0]\n"
- ".inst 0x4f02f39b // sudot v27.4s, v28.16b, v2.4b[0]\n"
- "78:" // Height 3: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 64b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "tbnz %x[flags], #31, 79f\n"
- "add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "addp v12.4s, v12.4s, v12.4s\n"
- "ld1r { v28.4s }, [x20]\n"
- "addp v13.4s, v13.4s, v13.4s\n"
- "neg v28.4s, v28.4s\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "addp v12.4s, v12.4s, v12.4s\n"
- "addp v13.4s, v13.4s, v13.4s\n"
- "mul v11.4s, v11.4s, v28.4s\n"
- "mul v12.4s, v12.4s, v28.4s\n"
- "mul v13.4s, v13.4s, v28.4s\n"
- "79:" // Height 3: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q31, [x10, #0x10]\n"
- "add v16.4s, v16.4s, v11.4s\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "ldr q30, [x10, #0x20]\n"
- "ldr q29, [x10, #0x30]\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "add v19.4s, v19.4s, v11.4s\n"
- "add v20.4s, v20.4s, v12.4s\n"
- "add v21.4s, v21.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "ld1r { v28.4s }, [x20]\n"
- "add v22.4s, v22.4s, v12.4s\n"
- "add v23.4s, v23.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add v24.4s, v24.4s, v13.4s\n"
- "add v25.4s, v25.4s, v13.4s\n"
- "add x10, x10, #0x40\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "add v27.4s, v27.4s, v13.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v31.4s\n"
- "add v18.4s, v18.4s, v30.4s\n"
- "add v19.4s, v19.4s, v29.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v31.4s\n"
- "add v22.4s, v22.4s, v30.4s\n"
- "add v23.4s, v23.4s, v29.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x20]\n"
- "add v25.4s, v25.4s, v31.4s\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "add v27.4s, v27.4s, v29.4s\n"
- "sqrdmulh v16.4s, v16.4s, v28.4s\n"
- "sqrdmulh v17.4s, v17.4s, v28.4s\n"
- "sqrdmulh v18.4s, v18.4s, v28.4s\n"
- "sqrdmulh v19.4s, v19.4s, v28.4s\n"
- "sqrdmulh v20.4s, v20.4s, v28.4s\n"
- "sqrdmulh v21.4s, v21.4s, v28.4s\n"
- "sqrdmulh v22.4s, v22.4s, v28.4s\n"
- "sqrdmulh v23.4s, v23.4s, v28.4s\n"
- "sqrdmulh v24.4s, v24.4s, v28.4s\n"
- "sqrdmulh v25.4s, v25.4s, v28.4s\n"
- "sqrdmulh v26.4s, v26.4s, v28.4s\n"
- "sqrdmulh v27.4s, v27.4s, v28.4s\n"
- "tbz %x[flags], #5, 80f\n"
- "and v1.16b, v16.16b, v0.16b\n"
- "and v31.16b, v17.16b, v0.16b\n"
- "and v30.16b, v18.16b, v0.16b\n"
- "and v29.16b, v19.16b, v0.16b\n"
- "and v28.16b, v20.16b, v0.16b\n"
- "and v3.16b, v21.16b, v0.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v2.16b, v22.16b, v0.16b\n"
- "sqadd v16.4s, v16.4s, v1.4s\n"
- "sqadd v17.4s, v17.4s, v31.4s\n"
- "sqadd v18.4s, v18.4s, v30.4s\n"
- "sqadd v19.4s, v19.4s, v29.4s\n"
- "sqadd v20.4s, v20.4s, v28.4s\n"
- "and v1.16b, v23.16b, v0.16b\n"
- "and v31.16b, v24.16b, v0.16b\n"
- "and v30.16b, v25.16b, v0.16b\n"
- "and v29.16b, v26.16b, v0.16b\n"
- "and v28.16b, v27.16b, v0.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v3.4s\n"
- "sqadd v22.4s, v22.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v1.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "sqadd v25.4s, v25.4s, v30.4s\n"
- "sqadd v26.4s, v26.4s, v29.4s\n"
- "sqadd v27.4s, v27.4s, v28.4s\n"
- "80:" // Height 3: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
- "srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v30.4s }, [x21]\n"
- "ld1r { v29.4s }, [x20]\n"
- "srshl v18.4s, v18.4s, v0.4s\n"
- "srshl v19.4s, v19.4s, v0.4s\n"
- "srshl v20.4s, v20.4s, v0.4s\n"
- "srshl v21.4s, v21.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v28.4s }, [x20]\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
- "srshl v25.4s, v25.4s, v0.4s\n"
- "srshl v26.4s, v26.4s, v0.4s\n"
- "srshl v27.4s, v27.4s, v0.4s\n"
- "add v16.4s, v16.4s, v30.4s\n"
- "add v17.4s, v17.4s, v30.4s\n"
- "add v18.4s, v18.4s, v30.4s\n"
- "add v19.4s, v19.4s, v30.4s\n"
- "add v20.4s, v20.4s, v30.4s\n"
- "add v21.4s, v21.4s, v30.4s\n"
- "add v22.4s, v22.4s, v30.4s\n"
- "add v23.4s, v23.4s, v30.4s\n"
- "add v24.4s, v24.4s, v30.4s\n"
- "add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "add v27.4s, v27.4s, v30.4s\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v19.4s, v19.4s, v29.4s\n"
- "smin v20.4s, v20.4s, v29.4s\n"
- "smin v21.4s, v21.4s, v29.4s\n"
- "smin v22.4s, v22.4s, v29.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v24.4s, v24.4s, v29.4s\n"
- "smin v25.4s, v25.4s, v29.4s\n"
- "smin v26.4s, v26.4s, v29.4s\n"
- "smin v27.4s, v27.4s, v29.4s\n"
- "smax v16.4s, v16.4s, v28.4s\n"
- "smax v17.4s, v17.4s, v28.4s\n"
- "smax v18.4s, v18.4s, v28.4s\n"
- "smax v19.4s, v19.4s, v28.4s\n"
- "smax v20.4s, v20.4s, v28.4s\n"
- "smax v21.4s, v21.4s, v28.4s\n"
- "smax v22.4s, v22.4s, v28.4s\n"
- "smax v23.4s, v23.4s, v28.4s\n"
- "smax v24.4s, v24.4s, v28.4s\n"
- "smax v25.4s, v25.4s, v28.4s\n"
- "smax v26.4s, v26.4s, v28.4s\n"
- "smax v27.4s, v27.4s, v28.4s\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v19.8h, v18.8h, v19.8h\n"
- "uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v18.8h, v22.8h, v23.8h\n"
- "uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v17.8h, v26.8h, v27.8h\n"
- "uzp1 v16.16b, v16.16b, v19.16b\n"
- "uzp1 v20.16b, v20.16b, v18.16b\n"
- "uzp1 v24.16b, v24.16b, v17.16b\n"
- "bge 89f\n"
- "tbz x9, #3, 84f\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x9, #2, 82f\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x9, #1, 81f\n"
- "st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "tbz x9, #0, 88f\n"
- "st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
- "b 88f\n"
- "81:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x9, #0, 88f\n"
- "st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
- "b 88f\n"
- "82:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x9, #1, 83f\n"
- "st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "tbz x9, #0, 88f\n"
- "st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
- "b 88f\n"
- "83:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x9, #0, 88f\n"
- "st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
- "b 88f\n"
- "84:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x9, #2, 86f\n"
- "str s16, [x27], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x9, #1, 85f\n"
- "st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "tbz x9, #0, 88f\n"
- "st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
- "b 88f\n"
- "85:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x9, #0, 88f\n"
- "st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
- "b 88f\n"
- "86:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x9, #1, 87f\n"
- "str h16, [x27], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "tbz x9, #0, 88f\n"
- "st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
- "b 88f\n"
- "87:" // Height 3: Partial direct writeback: partial_1_0
- "str b16, [x27, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "88:" // Height 3: Partial direct writeback: Done
- "b 90f\n"
- "89:" // Height 3: Full writeback
- "str q16, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "90:" // Height 3: Writeback done
- "subs x9, x9, #0x10\n"
- "bgt 62b\n"
- "b 122f\n"
- "91:" // Height 4
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "mov x20, #0x4\n"
- "mov x10, %x[col_bias]\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
- "movi v15.16b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "92:" // Height 4: Column loop
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "93:" // Height 4: setup done
- "mov x26, #0x0\n"
- "94:" // Height 4: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 95f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "ldr x21, [x20, #0x18]\n"
- "cbnz x26, 96f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "add x21, x21, x20\n"
- "b 96f\n"
- "95:" // Height 4: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "add x21, x22, x21\n"
- "96:" // Height 4: input setup done
- "cmp x25, #0x10\n"
- "blt 101f\n"
- "ldr q0, [x24, #0x0]\n"
- "ldr q1, [x23, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q3, [x21, #0x0]\n"
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q8, [x28, #0x40]\n"
- "ldr q9, [x28, #0x50]\n"
- "ldr q10, [x28, #0x60]\n"
- "blt 99f\n"
- "97:" // Height 4: Multiply loop: Main loop head
- ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f02f098 // sudot v24.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f03f09c // sudot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n"
- "add x21, x21, #0x10\n"
- ".inst 0x4f02f0b9 // sudot v25.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4f03f0bd // sudot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
- ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f02f0da // sudot v26.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0de // sudot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
- ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0fb // sudot v27.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0ff // sudot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
- ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n"
- ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n"
- ".inst 0x4f22f118 // sudot v24.4s, v8.16b, v2.4b[1]\n"
- ".inst 0x4f23f11c // sudot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
- ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n"
- ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x4f22f139 // sudot v25.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x4f23f13d // sudot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
- ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n"
- ".inst 0x4f22f15a // sudot v26.4s, v10.16b, v2.4b[1]\n"
- ".inst 0x4f23f15e // sudot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4f20f093 // sudot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4f21f097 // sudot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4f22f09b // sudot v27.4s, v4.16b, v2.4b[1]\n"
- ".inst 0x4f23f09f // sudot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f00f8b0 // sudot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f01f8b4 // sudot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4f02f8b8 // sudot v24.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x4f03f8bc // sudot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4f00f8d1 // sudot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f01f8d5 // sudot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f02f8d9 // sudot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f03f8dd // sudot v29.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f00f8f2 // sudot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f01f8f6 // sudot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f02f8fa // sudot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f03f8fe // sudot v30.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f00f913 // sudot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f01f917 // sudot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f02f91b // sudot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f03f91f // sudot v31.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x4f20f930 // sudot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4f21f934 // sudot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4f22f938 // sudot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4f23f93c // sudot v28.4s, v9.16b, v3.4b[3]\n"
- ".inst 0x4f20f951 // sudot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4f21f955 // sudot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4f22f959 // sudot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x4f23f95d // sudot v29.4s, v10.16b, v3.4b[3]\n"
- ".inst 0x4f20f892 // sudot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4f21f896 // sudot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4f22f89a // sudot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4f23f89e // sudot v30.4s, v4.16b, v3.4b[3]\n"
- ".inst 0x4f20f8b3 // sudot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4f21f8b7 // sudot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4f22f8bb // sudot v27.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x4f23f8bf // sudot v31.4s, v5.16b, v3.4b[3]\n"
- "tbnz %x[flags], #31, 98f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
- "98:" // Height 4: Multiply loop: unique 13: skip row sum
- "ldr q0, [x24, #0x0]\n"
- "ldr q1, [x23, #0x0]\n"
- "sub x25, x25, #0x10\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q3, [x21, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q8, [x28, #0x40]\n"
- "ldr q9, [x28, #0x50]\n"
- "ldr q10, [x28, #0x60]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "bge 97b\n"
- "99:" // Height 4: Multiply loop: Single iteration only
- ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f02f098 // sudot v24.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f03f09c // sudot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- ".inst 0x4f02f0b9 // sudot v25.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4f03f0bd // sudot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
- ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f02f0da // sudot v26.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0de // sudot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
- ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0fb // sudot v27.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0ff // sudot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
- ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n"
- ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n"
- ".inst 0x4f22f118 // sudot v24.4s, v8.16b, v2.4b[1]\n"
- ".inst 0x4f23f11c // sudot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
- ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n"
- ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x4f22f139 // sudot v25.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x4f23f13d // sudot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
- ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n"
- ".inst 0x4f22f15a // sudot v26.4s, v10.16b, v2.4b[1]\n"
- ".inst 0x4f23f15e // sudot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4f20f093 // sudot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4f21f097 // sudot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4f22f09b // sudot v27.4s, v4.16b, v2.4b[1]\n"
- ".inst 0x4f23f09f // sudot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f00f8b0 // sudot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f01f8b4 // sudot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4f02f8b8 // sudot v24.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x4f03f8bc // sudot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4f00f8d1 // sudot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f01f8d5 // sudot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f02f8d9 // sudot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f03f8dd // sudot v29.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f00f8f2 // sudot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f01f8f6 // sudot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f02f8fa // sudot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f03f8fe // sudot v30.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f00f913 // sudot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f01f917 // sudot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f02f91b // sudot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f03f91f // sudot v31.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x4f20f930 // sudot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4f21f934 // sudot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4f22f938 // sudot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4f23f93c // sudot v28.4s, v9.16b, v3.4b[3]\n"
- ".inst 0x4f20f951 // sudot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4f21f955 // sudot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4f22f959 // sudot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x4f23f95d // sudot v29.4s, v10.16b, v3.4b[3]\n"
- ".inst 0x4f20f892 // sudot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4f21f896 // sudot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4f22f89a // sudot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4f23f89e // sudot v30.4s, v4.16b, v3.4b[3]\n"
- ".inst 0x4f20f8b3 // sudot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4f21f8b7 // sudot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4f22f8bb // sudot v27.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x4f23f8bf // sudot v31.4s, v5.16b, v3.4b[3]\n"
- "tbnz %x[flags], #31, 100f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
- "100:" // Height 4: Multiply loop: unique 14: skip row sum
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "101:" // Height 4: Multiply loop: Main loop skip
- "cbz x25, 108f\n"
- "cmp x25, #0x4\n"
- "blt 104f\n"
- "102:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x24], #0x4\n"
- "ldr s1, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s3, [x21], #0x4\n"
- "tbnz %x[flags], #31, 103f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
- "103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q7, [x28, #0x0]\n"
- "ldr q6, [x28, #0x10]\n"
- "sub x25, x25, #0x4\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
- "cmp x25, #0x4\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f00f0f0 // sudot v16.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0f4 // sudot v20.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0f8 // sudot v24.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0fc // sudot v28.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f00f0d1 // sudot v17.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0d5 // sudot v21.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f02f0d9 // sudot v25.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0dd // sudot v29.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f00f0b2 // sudot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f01f0b6 // sudot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f02f0ba // sudot v26.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4f03f0be // sudot v30.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x4f00f093 // sudot v19.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f01f097 // sudot v23.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f02f09b // sudot v27.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f03f09f // sudot v31.4s, v4.16b, v3.4b[0]\n"
- "bge 102b\n"
- "104:" // Height 4: Multiply loop: Skip odd blocks
- "cbz x25, 108f\n"
- "tbz x25, #1, 105f\n"
- "ldr h0, [x24], #0x2\n"
- "ldr h1, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h3, [x21], #0x2\n"
- "tbz x25, #0, 106f\n"
- "ld1 { v0.b }[2], [x24]\n"
- "ld1 { v1.b }[2], [x23]\n"
- "ld1 { v2.b }[2], [x22]\n"
- "ld1 { v3.b }[2], [x21]\n"
- "b 106f\n"
- "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x24, #0x0]\n"
- "ldr b1, [x23, #0x0]\n"
- "ldr b2, [x22, #0x0]\n"
- "ldr b3, [x21, #0x0]\n"
- "106:" // Height 4: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 107f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
- "107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q7, [x28, #0x0]\n"
- "ldr q6, [x28, #0x10]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f00f0f0 // sudot v16.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0f4 // sudot v20.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0f8 // sudot v24.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0fc // sudot v28.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f00f0d1 // sudot v17.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0d5 // sudot v21.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f02f0d9 // sudot v25.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0dd // sudot v29.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f00f0b2 // sudot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f01f0b6 // sudot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f02f0ba // sudot v26.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4f03f0be // sudot v30.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x4f00f093 // sudot v19.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f01f097 // sudot v23.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f02f09b // sudot v27.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f03f09f // sudot v31.4s, v4.16b, v3.4b[0]\n"
- "108:" // Height 4: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 94b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x24, x27, x20\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x20\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "tbnz %x[flags], #31, 109f\n"
- "add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "addp v12.4s, v12.4s, v12.4s\n"
- "ld1r { v0.4s }, [x20]\n"
- "addp v13.4s, v13.4s, v13.4s\n"
- "addp v14.4s, v14.4s, v14.4s\n"
- "neg v0.4s, v0.4s\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "addp v12.4s, v12.4s, v12.4s\n"
- "addp v13.4s, v13.4s, v13.4s\n"
- "addp v14.4s, v14.4s, v14.4s\n"
- "mul v11.4s, v11.4s, v0.4s\n"
- "mul v12.4s, v12.4s, v0.4s\n"
- "mul v13.4s, v13.4s, v0.4s\n"
- "mul v14.4s, v14.4s, v0.4s\n"
- "109:" // Height 4: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q4, [x10, #0x10]\n"
- "add v16.4s, v16.4s, v11.4s\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "ldr q3, [x10, #0x20]\n"
- "ldr q2, [x10, #0x30]\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "add v19.4s, v19.4s, v11.4s\n"
- "add v20.4s, v20.4s, v12.4s\n"
- "add v21.4s, v21.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "ld1r { v1.4s }, [x20]\n"
- "add v22.4s, v22.4s, v12.4s\n"
- "add v23.4s, v23.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add v24.4s, v24.4s, v13.4s\n"
- "add v25.4s, v25.4s, v13.4s\n"
- "add x10, x10, #0x40\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "add v27.4s, v27.4s, v13.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v3.4s\n"
- "add v19.4s, v19.4s, v2.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
- "add v23.4s, v23.4s, v2.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v3.4s\n"
- "add v27.4s, v27.4s, v2.4s\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "ld1r { v0.4s }, [x20]\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v3.4s\n"
- "add v31.4s, v31.4s, v2.4s\n"
- "sqrdmulh v16.4s, v16.4s, v1.4s\n"
- "sqrdmulh v17.4s, v17.4s, v1.4s\n"
- "sqrdmulh v18.4s, v18.4s, v1.4s\n"
- "sqrdmulh v19.4s, v19.4s, v1.4s\n"
- "sqrdmulh v20.4s, v20.4s, v1.4s\n"
- "sqrdmulh v21.4s, v21.4s, v1.4s\n"
- "sqrdmulh v22.4s, v22.4s, v1.4s\n"
- "sqrdmulh v23.4s, v23.4s, v1.4s\n"
- "sqrdmulh v24.4s, v24.4s, v1.4s\n"
- "sqrdmulh v25.4s, v25.4s, v1.4s\n"
- "sqrdmulh v26.4s, v26.4s, v1.4s\n"
- "sqrdmulh v27.4s, v27.4s, v1.4s\n"
- "sqrdmulh v28.4s, v28.4s, v1.4s\n"
- "sqrdmulh v29.4s, v29.4s, v1.4s\n"
- "sqrdmulh v30.4s, v30.4s, v1.4s\n"
- "sqrdmulh v31.4s, v31.4s, v1.4s\n"
- "tbz %x[flags], #5, 110f\n"
- "and v2.16b, v16.16b, v0.16b\n"
- "and v1.16b, v17.16b, v0.16b\n"
- "and v7.16b, v18.16b, v0.16b\n"
- "and v6.16b, v19.16b, v0.16b\n"
- "and v5.16b, v20.16b, v0.16b\n"
- "and v4.16b, v21.16b, v0.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v3.16b, v22.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v2.4s\n"
- "sqadd v17.4s, v17.4s, v1.4s\n"
- "and v2.16b, v23.16b, v0.16b\n"
- "and v1.16b, v24.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v7.4s\n"
- "sqadd v19.4s, v19.4s, v6.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v4.4s\n"
- "sqadd v22.4s, v22.4s, v3.4s\n"
- "and v7.16b, v25.16b, v0.16b\n"
- "sqadd v23.4s, v23.4s, v2.4s\n"
- "sqadd v24.4s, v24.4s, v1.4s\n"
- "and v6.16b, v26.16b, v0.16b\n"
- "and v5.16b, v27.16b, v0.16b\n"
- "and v4.16b, v28.16b, v0.16b\n"
- "and v3.16b, v29.16b, v0.16b\n"
- "and v2.16b, v30.16b, v0.16b\n"
- "and v1.16b, v31.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v7.4s\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "sqadd v27.4s, v27.4s, v5.4s\n"
- "sqadd v28.4s, v28.4s, v4.4s\n"
- "sqadd v29.4s, v29.4s, v3.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "110:" // Height 4: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
- "srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v3.4s }, [x21]\n"
- "ld1r { v2.4s }, [x20]\n"
- "srshl v18.4s, v18.4s, v0.4s\n"
- "srshl v19.4s, v19.4s, v0.4s\n"
- "srshl v20.4s, v20.4s, v0.4s\n"
- "srshl v21.4s, v21.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v1.4s }, [x20]\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
- "srshl v25.4s, v25.4s, v0.4s\n"
- "srshl v26.4s, v26.4s, v0.4s\n"
- "srshl v27.4s, v27.4s, v0.4s\n"
- "srshl v28.4s, v28.4s, v0.4s\n"
- "srshl v29.4s, v29.4s, v0.4s\n"
- "srshl v30.4s, v30.4s, v0.4s\n"
- "srshl v31.4s, v31.4s, v0.4s\n"
- "add v16.4s, v16.4s, v3.4s\n"
- "add v17.4s, v17.4s, v3.4s\n"
- "add v18.4s, v18.4s, v3.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v3.4s\n"
- "add v21.4s, v21.4s, v3.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v24.4s, v24.4s, v3.4s\n"
- "add v25.4s, v25.4s, v3.4s\n"
- "add v26.4s, v26.4s, v3.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add v28.4s, v28.4s, v3.4s\n"
- "add v29.4s, v29.4s, v3.4s\n"
- "add v30.4s, v30.4s, v3.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "smin v16.4s, v16.4s, v2.4s\n"
- "smin v17.4s, v17.4s, v2.4s\n"
- "smin v18.4s, v18.4s, v2.4s\n"
- "smin v19.4s, v19.4s, v2.4s\n"
- "smin v20.4s, v20.4s, v2.4s\n"
- "smin v21.4s, v21.4s, v2.4s\n"
- "smin v22.4s, v22.4s, v2.4s\n"
- "smin v23.4s, v23.4s, v2.4s\n"
- "smin v24.4s, v24.4s, v2.4s\n"
- "smin v25.4s, v25.4s, v2.4s\n"
- "smin v26.4s, v26.4s, v2.4s\n"
- "smin v27.4s, v27.4s, v2.4s\n"
- "smin v28.4s, v28.4s, v2.4s\n"
- "smin v29.4s, v29.4s, v2.4s\n"
- "smin v30.4s, v30.4s, v2.4s\n"
- "smin v31.4s, v31.4s, v2.4s\n"
- "smax v16.4s, v16.4s, v1.4s\n"
- "smax v17.4s, v17.4s, v1.4s\n"
- "smax v18.4s, v18.4s, v1.4s\n"
- "smax v19.4s, v19.4s, v1.4s\n"
- "smax v20.4s, v20.4s, v1.4s\n"
- "smax v21.4s, v21.4s, v1.4s\n"
- "smax v22.4s, v22.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v1.4s\n"
- "smax v24.4s, v24.4s, v1.4s\n"
- "smax v25.4s, v25.4s, v1.4s\n"
- "smax v26.4s, v26.4s, v1.4s\n"
- "smax v27.4s, v27.4s, v1.4s\n"
- "smax v28.4s, v28.4s, v1.4s\n"
- "smax v29.4s, v29.4s, v1.4s\n"
- "smax v30.4s, v30.4s, v1.4s\n"
- "smax v31.4s, v31.4s, v1.4s\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v0.8h, v18.8h, v19.8h\n"
- "uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v19.8h, v22.8h, v23.8h\n"
- "uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v18.8h, v26.8h, v27.8h\n"
- "uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v17.8h, v30.8h, v31.8h\n"
- "uzp1 v16.16b, v16.16b, v0.16b\n"
- "uzp1 v20.16b, v20.16b, v19.16b\n"
- "uzp1 v24.16b, v24.16b, v18.16b\n"
- "uzp1 v28.16b, v28.16b, v17.16b\n"
- "bge 119f\n"
- "tbz x9, #3, 114f\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x22], #0x8\n"
- "tbz x9, #2, 112f\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x22], #0x4\n"
- "tbz x9, #1, 111f\n"
- "st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "st1 { v28.h }[6], [x22], #0x2\n"
- "tbz x9, #0, 118f\n"
- "st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
- "st1 { v28.b }[14], [x22]\n"
- "b 118f\n"
- "111:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x9, #0, 118f\n"
- "st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
- "st1 { v28.b }[12], [x22]\n"
- "b 118f\n"
- "112:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x9, #1, 113f\n"
- "st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "st1 { v28.h }[4], [x22], #0x2\n"
- "tbz x9, #0, 118f\n"
- "st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
- "st1 { v28.b }[10], [x22]\n"
- "b 118f\n"
- "113:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x9, #0, 118f\n"
- "st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
- "st1 { v28.b }[8], [x22]\n"
- "b 118f\n"
- "114:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x9, #2, 116f\n"
- "str s16, [x27], #0x4\n"
- "str s20, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x22], #0x4\n"
- "tbz x9, #1, 115f\n"
- "st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "st1 { v28.h }[2], [x22], #0x2\n"
- "tbz x9, #0, 118f\n"
- "st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
- "st1 { v28.b }[6], [x22]\n"
- "b 118f\n"
- "115:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x9, #0, 118f\n"
- "st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
- "st1 { v28.b }[4], [x22]\n"
- "b 118f\n"
- "116:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x9, #1, 117f\n"
- "str h16, [x27], #0x2\n"
- "str h20, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "str h28, [x22], #0x2\n"
- "tbz x9, #0, 118f\n"
- "st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x22]\n"
- "b 118f\n"
- "117:" // Height 4: Partial direct writeback: partial_1_0
- "str b16, [x27, #0x0]\n"
- "str b20, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "str b28, [x22, #0x0]\n"
- "118:" // Height 4: Partial direct writeback: Done
- "b 120f\n"
- "119:" // Height 4: Full writeback
- "str q16, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "str q20, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "str q28, [x22, #0x0]\n"
- "120:" // Height 4: Writeback done
- "subs x9, x9, #0x10\n"
- "bgt 92b\n"
- "subs %x[M], %x[M], #0x4\n"
- "beq 122f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 121f\n"
- "add x21, x21, #0x4\n"
- "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "b 1b\n"
- "121:" // Update direct input
- "mov x20, #0x4\n"
- "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
- "b 1b\n"
- "122:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16.hpp
deleted file mode 100644
index ee1297fc7b..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#ifdef __aarch64__
-
-#include "../std_transforms_fixed.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- unsigned int, const unsigned int *, \
- IndirectInputArg<uint8_t>, \
- size_t, size_t, \
- const int8_t *, \
- IndirectOutputArg<uint8_t>, \
- const Requantize32 *, const int32_t *, unsigned int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void a64_hybrid_u8s8qa_mmla_4x16( ARGLIST );
-
-class cls_a64_hybrid_u8s8qa_mmla_4x16
-{
-public:
- typedef uint8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
- typedef uint8_t result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 4;
- }
-
- static unsigned int out_width()
- {
- return 16;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 8;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 16, 8> transforms = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
- if (std::is_same<T, uint8_t>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 47.74 };
- case CPUModel::A510:
- return { 27.99 };
- case CPUModel::V1:
- return { 62.26 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=a64_hybrid_u8s8qa_mmla_4x16;
- cls_a64_hybrid_u8s8qa_mmla_4x16(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16/generic.cpp
deleted file mode 100644
index 00b9db05c0..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16/generic.cpp
+++ /dev/null
@@ -1,2099 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void a64_hybrid_u8s8qa_mmla_4x16 (
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
- size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
- const Requantize32 *qp, const int32_t *col_bias, unsigned int
-)
-{
- struct KernelArgs {
- unsigned int num_strings = {};
- const unsigned int *string_lengths = {};
- size_t N = {};
- const int8_t *B_ptr = {};
- size_t output_offset = {};
- size_t input_initial_col = {};
- size_t input_offset = {};
- void *output_ptr = {};
- } ka;
-
- unsigned long flags=0;
- void *input_ptr;
-
- if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
- ka.output_offset=output_arg.indirect.offset;
- flags |= 0x4;
- } else {
- ka.output_ptr=(void *)(output_arg.direct.base);
- ka.output_offset=output_arg.direct.stride;
- }
-
- if (A_arg.is_indirect) {
- input_ptr=(void *)(A_arg.indirect.ptr);
- ka.input_offset=A_arg.indirect.start_row;
- ka.input_initial_col=A_arg.indirect.start_col;
- flags |= 0x8;
- } else {
- assert(num_strings==1);
- input_ptr=(void *)(A_arg.direct.base);
- ka.input_offset=A_arg.direct.stride;
- }
- ka.num_strings = num_strings;
- ka.string_lengths = string_lengths;
- ka.N = N;
- ka.B_ptr = B_ptr;
- if (qp->c_offset > qp->minval) {
- flags |= 0x20;
- }
- __asm__ __volatile__(
- "1:" // Row loop
- "cmp %x[M], #0x4\n"
- "bge 97f\n"
- "cmp %x[M], #0x2\n"
- "bgt 65f\n"
- "beq 33f\n"
- "mov x10, %x[col_bias]\n"
- "movi v11.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "2:" // Height 1: Column loop
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "3:" // Height 1: setup done
- "mov x26, #0x0\n"
- "4:" // Height 1: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "cbnz x26, 6f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "b 6f\n"
- "5:" // Height 1: setup direct input
- "mov x24, %x[input_ptr]\n"
- "6:" // Height 1: input setup done
- "cmp x25, #0x10\n"
- "blt 11f\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q5, [x28, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q6, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q8, [x28, #0x30]\n"
- "ldr q9, [x28, #0x40]\n"
- "ldr q10, [x28, #0x50]\n"
- "ldr q4, [x28, #0x60]\n"
- "blt 9f\n"
- "7:" // Height 1: Multiply loop: Main loop head
- "add x24, x24, #0x10\n"
- "trn1 v0.2d, v1.2d, v27.2d\n"
- ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n"
- "ldr q25, [x28, #0x70]\n"
- "trn2 v1.2d, v1.2d, v27.2d\n"
- ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n"
- "ldr q24, [x28, #0x80]\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- "ldr q30, [x28, #0x90]\n"
- ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n"
- "ldr q29, [x28, #0xa0]\n"
- ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n"
- "ldr q28, [x28, #0xb0]\n"
- ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n"
- "ldr q27, [x28, #0xc0]\n"
- ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n"
- "ldr q26, [x28, #0xd0]\n"
- ".inst 0x4e99ac17 // usmmla v23.4s, v0.16b, v25.16b\n"
- "ldr q25, [x28, #0xe0]\n"
- ".inst 0x4e98ac30 // usmmla v16.4s, v1.16b, v24.16b\n"
- "ldr q24, [x28, #0xf0]\n"
- ".inst 0x4e9eac34 // usmmla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4e9dac31 // usmmla v17.4s, v1.16b, v29.16b\n"
- ".inst 0x4e9cac35 // usmmla v21.4s, v1.16b, v28.16b\n"
- ".inst 0x4e9bac32 // usmmla v18.4s, v1.16b, v27.16b\n"
- ".inst 0x4e9aac36 // usmmla v22.4s, v1.16b, v26.16b\n"
- ".inst 0x4e99ac33 // usmmla v19.4s, v1.16b, v25.16b\n"
- ".inst 0x4e98ac37 // usmmla v23.4s, v1.16b, v24.16b\n"
- "tbnz %x[flags], #31, 8f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
- "8:" // Height 1: Multiply loop: unique 1: skip row sum
- "ldr q1, [x24, #0x0]\n"
- "ldr q5, [x28, #0x0]\n"
- "sub x25, x25, #0x10\n"
- "ldr q6, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
- "cmp x25, #0x20\n"
- "ldr q8, [x28, #0x30]\n"
- "ldr q9, [x28, #0x40]\n"
- "ldr q10, [x28, #0x50]\n"
- "ldr q4, [x28, #0x60]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "bge 7b\n"
- "9:" // Height 1: Multiply loop: Single iteration only
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "trn1 v0.2d, v1.2d, v24.2d\n"
- "trn2 v1.2d, v1.2d, v24.2d\n"
- ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n"
- "ldr q25, [x28, #0x70]\n"
- ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n"
- "ldr q24, [x28, #0x80]\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- "ldr q30, [x28, #0x90]\n"
- ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n"
- "ldr q29, [x28, #0xa0]\n"
- ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n"
- "ldr q28, [x28, #0xb0]\n"
- ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n"
- "ldr q27, [x28, #0xc0]\n"
- ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n"
- "ldr q26, [x28, #0xd0]\n"
- ".inst 0x4e99ac17 // usmmla v23.4s, v0.16b, v25.16b\n"
- "ldr q25, [x28, #0xe0]\n"
- ".inst 0x4e98ac30 // usmmla v16.4s, v1.16b, v24.16b\n"
- "ldr q24, [x28, #0xf0]\n"
- ".inst 0x4e9eac34 // usmmla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4e9dac31 // usmmla v17.4s, v1.16b, v29.16b\n"
- ".inst 0x4e9cac35 // usmmla v21.4s, v1.16b, v28.16b\n"
- ".inst 0x4e9bac32 // usmmla v18.4s, v1.16b, v27.16b\n"
- ".inst 0x4e9aac36 // usmmla v22.4s, v1.16b, v26.16b\n"
- ".inst 0x4e99ac33 // usmmla v19.4s, v1.16b, v25.16b\n"
- ".inst 0x4e98ac37 // usmmla v23.4s, v1.16b, v24.16b\n"
- "tbnz %x[flags], #31, 10f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
- "10:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x24, #0x80]\n"
- "11:" // Height 1: Multiply loop: Main loop skip
- "cbz x25, 20f\n"
- "cmp x25, #0x8\n"
- "blt 14f\n"
- "12:" // Height 1: Multiply loop: Odd block loop
- "ldr d25, [x24], #0x8\n"
- "trn1 v0.2d, v25.2d, v24.2d\n"
- "tbnz %x[flags], #31, 13f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
- "sub x25, x25, #0x8\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "cmp x25, #0x8\n"
- "ldr q27, [x28, #0x40]\n"
- "ldr q26, [x28, #0x50]\n"
- "ldr q25, [x28, #0x60]\n"
- ".inst 0x4e98ac10 // usmmla v16.4s, v0.16b, v24.16b\n"
- "ldr q24, [x28, #0x70]\n"
- ".inst 0x4e9eac14 // usmmla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x4e9dac11 // usmmla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x4e9cac15 // usmmla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e9bac12 // usmmla v18.4s, v0.16b, v27.16b\n"
- ".inst 0x4e9aac16 // usmmla v22.4s, v0.16b, v26.16b\n"
- ".inst 0x4e99ac13 // usmmla v19.4s, v0.16b, v25.16b\n"
- ".inst 0x4e98ac17 // usmmla v23.4s, v0.16b, v24.16b\n"
- "bge 12b\n"
- "14:" // Height 1: Multiply loop: Skip odd blocks
- "cbz x25, 20f\n"
- "tbz x25, #2, 16f\n"
- "ldr s1, [x24], #0x4\n"
- "tbz x25, #1, 15f\n"
- "ld1 { v1.h }[2], [x24], #0x2\n"
- "tbz x25, #0, 18f\n"
- "ld1 { v1.b }[6], [x24]\n"
- "b 18f\n"
- "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
- "tbz x25, #0, 18f\n"
- "ld1 { v1.b }[4], [x24]\n"
- "b 18f\n"
- "16:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
- "tbz x25, #1, 17f\n"
- "ldr h1, [x24], #0x2\n"
- "tbz x25, #0, 18f\n"
- "ld1 { v1.b }[2], [x24]\n"
- "b 18f\n"
- "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b1, [x24, #0x0]\n"
- "18:" // Height 1: Multiply loop: Ragged operand read: Done
- "trn1 v0.2d, v1.2d, v24.2d\n"
- "tbnz %x[flags], #31, 19f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "19:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "ldr q27, [x28, #0x40]\n"
- "ldr q26, [x28, #0x50]\n"
- "ldr q25, [x28, #0x60]\n"
- ".inst 0x4e98ac10 // usmmla v16.4s, v0.16b, v24.16b\n"
- "ldr q24, [x28, #0x70]\n"
- ".inst 0x4e9eac14 // usmmla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x4e9dac11 // usmmla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x4e9cac15 // usmmla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e9bac12 // usmmla v18.4s, v0.16b, v27.16b\n"
- ".inst 0x4e9aac16 // usmmla v22.4s, v0.16b, v26.16b\n"
- ".inst 0x4e99ac13 // usmmla v19.4s, v0.16b, v25.16b\n"
- ".inst 0x4e98ac17 // usmmla v23.4s, v0.16b, v24.16b\n"
- "20:" // Height 1: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 4b\n"
- "uzp1 v16.2d, v16.2d, v20.2d\n"
- "uzp1 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "uzp1 v18.2d, v18.2d, v22.2d\n"
- "uzp1 v19.2d, v19.2d, v23.2d\n"
- "mov v23.16b, v16.16b\n"
- "tbnz %x[flags], #31, 21f\n"
- "add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "ld1r { v16.4s }, [x20]\n"
- "neg v16.4s, v16.4s\n"
- "dup v11.4s, v11.s[0]\n"
- "mul v11.4s, v11.4s, v16.4s\n"
- "21:" // Height 1: skip row sum fixup
- "ldr q24, [x10, #0x0]\n"
- "ldr q22, [x10, #0x10]\n"
- "add v23.4s, v23.4s, v11.4s\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "ldr q21, [x10, #0x20]\n"
- "ldr q20, [x10, #0x30]\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "add v19.4s, v19.4s, v11.4s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "ld1r { v16.4s }, [x20]\n"
- "add v23.4s, v23.4s, v24.4s\n"
- "add v17.4s, v17.4s, v22.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add x10, x10, #0x40\n"
- "ld1r { v0.4s }, [x20]\n"
- "add v18.4s, v18.4s, v21.4s\n"
- "add v19.4s, v19.4s, v20.4s\n"
- "sqrdmulh v23.4s, v23.4s, v16.4s\n"
- "sqrdmulh v17.4s, v17.4s, v16.4s\n"
- "sqrdmulh v18.4s, v18.4s, v16.4s\n"
- "sqrdmulh v19.4s, v19.4s, v16.4s\n"
- "tbz %x[flags], #5, 22f\n"
- "and v22.16b, v23.16b, v0.16b\n"
- "and v21.16b, v17.16b, v0.16b\n"
- "and v20.16b, v18.16b, v0.16b\n"
- "and v16.16b, v19.16b, v0.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v22.4s\n"
- "sqadd v17.4s, v17.4s, v21.4s\n"
- "sqadd v18.4s, v18.4s, v20.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "22:" // Height 1: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v21.4s }, [x21]\n"
- "ld1r { v20.4s }, [x20]\n"
- "srshl v18.4s, v18.4s, v0.4s\n"
- "srshl v19.4s, v19.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v16.4s }, [x20]\n"
- "add v23.4s, v23.4s, v21.4s\n"
- "add v17.4s, v17.4s, v21.4s\n"
- "add v18.4s, v18.4s, v21.4s\n"
- "add v19.4s, v19.4s, v21.4s\n"
- "smin v23.4s, v23.4s, v20.4s\n"
- "smin v17.4s, v17.4s, v20.4s\n"
- "smin v18.4s, v18.4s, v20.4s\n"
- "smin v19.4s, v19.4s, v20.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v17.4s, v17.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "smax v19.4s, v19.4s, v16.4s\n"
- "uzp1 v23.8h, v23.8h, v17.8h\n"
- "uzp1 v16.8h, v18.8h, v19.8h\n"
- "uzp1 v23.16b, v23.16b, v16.16b\n"
- "bge 31f\n"
- "tbz x9, #3, 26f\n"
- "str d23, [x27], #0x8\n"
- "tbz x9, #2, 24f\n"
- "st1 { v23.s }[2], [x27], #0x4\n"
- "tbz x9, #1, 23f\n"
- "st1 { v23.h }[6], [x27], #0x2\n"
- "tbz x9, #0, 30f\n"
- "st1 { v23.b }[14], [x27]\n"
- "b 30f\n"
- "23:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x9, #0, 30f\n"
- "st1 { v23.b }[12], [x27]\n"
- "b 30f\n"
- "24:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x9, #1, 25f\n"
- "st1 { v23.h }[4], [x27], #0x2\n"
- "tbz x9, #0, 30f\n"
- "st1 { v23.b }[10], [x27]\n"
- "b 30f\n"
- "25:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x9, #0, 30f\n"
- "st1 { v23.b }[8], [x27]\n"
- "b 30f\n"
- "26:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x9, #2, 28f\n"
- "str s23, [x27], #0x4\n"
- "tbz x9, #1, 27f\n"
- "st1 { v23.h }[2], [x27], #0x2\n"
- "tbz x9, #0, 30f\n"
- "st1 { v23.b }[6], [x27]\n"
- "b 30f\n"
- "27:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x9, #0, 30f\n"
- "st1 { v23.b }[4], [x27]\n"
- "b 30f\n"
- "28:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x9, #1, 29f\n"
- "str h23, [x27], #0x2\n"
- "tbz x9, #0, 30f\n"
- "st1 { v23.b }[2], [x27]\n"
- "b 30f\n"
- "29:" // Height 1: Partial direct writeback: partial_1_0
- "str b23, [x27, #0x0]\n"
- "30:" // Height 1: Partial direct writeback: Done
- "b 32f\n"
- "31:" // Height 1: Full writeback
- "str q23, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "32:" // Height 1: Writeback done
- "subs x9, x9, #0x10\n"
- "bgt 2b\n"
- "b 130f\n"
- "33:" // Height 2
- "mov x10, %x[col_bias]\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "movi v15.16b, #0x1\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "34:" // Height 2: Column loop
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "35:" // Height 2: setup done
- "mov x26, #0x0\n"
- "36:" // Height 2: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 37f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "cbnz x26, 38f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "b 38f\n"
- "37:" // Height 2: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "38:" // Height 2: input setup done
- "cmp x25, #0x10\n"
- "blt 43f\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x23, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q5, [x28, #0x0]\n"
- "ldr q6, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q8, [x28, #0x30]\n"
- "ldr q9, [x28, #0x40]\n"
- "ldr q10, [x28, #0x50]\n"
- "ldr q4, [x28, #0x60]\n"
- "blt 41f\n"
- "39:" // Height 2: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n"
- "ldr q25, [x28, #0x70]\n"
- ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n"
- "ldr q24, [x28, #0x80]\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- "ldr q30, [x28, #0x90]\n"
- ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n"
- "ldr q29, [x28, #0xa0]\n"
- ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n"
- "ldr q28, [x28, #0xb0]\n"
- ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n"
- "ldr q27, [x28, #0xc0]\n"
- ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n"
- "ldr q26, [x28, #0xd0]\n"
- ".inst 0x4e99ac17 // usmmla v23.4s, v0.16b, v25.16b\n"
- "ldr q25, [x28, #0xe0]\n"
- ".inst 0x4e98ac30 // usmmla v16.4s, v1.16b, v24.16b\n"
- "ldr q24, [x28, #0xf0]\n"
- ".inst 0x4e9eac34 // usmmla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4e9dac31 // usmmla v17.4s, v1.16b, v29.16b\n"
- ".inst 0x4e9cac35 // usmmla v21.4s, v1.16b, v28.16b\n"
- ".inst 0x4e9bac32 // usmmla v18.4s, v1.16b, v27.16b\n"
- ".inst 0x4e9aac36 // usmmla v22.4s, v1.16b, v26.16b\n"
- ".inst 0x4e99ac33 // usmmla v19.4s, v1.16b, v25.16b\n"
- ".inst 0x4e98ac37 // usmmla v23.4s, v1.16b, v24.16b\n"
- "tbnz %x[flags], #31, 40f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
- "40:" // Height 2: Multiply loop: unique 5: skip row sum
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x23, #0x0]\n"
- "sub x25, x25, #0x10\n"
- "ldr q5, [x28, #0x0]\n"
- "ldr q6, [x28, #0x10]\n"
- "cmp x25, #0x20\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q8, [x28, #0x30]\n"
- "ldr q9, [x28, #0x40]\n"
- "ldr q10, [x28, #0x50]\n"
- "ldr q4, [x28, #0x60]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "bge 39b\n"
- "41:" // Height 2: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n"
- "ldr q25, [x28, #0x70]\n"
- ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n"
- "ldr q24, [x28, #0x80]\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- "ldr q30, [x28, #0x90]\n"
- ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n"
- "ldr q29, [x28, #0xa0]\n"
- ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n"
- "ldr q28, [x28, #0xb0]\n"
- ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n"
- "ldr q27, [x28, #0xc0]\n"
- ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n"
- "ldr q26, [x28, #0xd0]\n"
- ".inst 0x4e99ac17 // usmmla v23.4s, v0.16b, v25.16b\n"
- "ldr q25, [x28, #0xe0]\n"
- ".inst 0x4e98ac30 // usmmla v16.4s, v1.16b, v24.16b\n"
- "ldr q24, [x28, #0xf0]\n"
- ".inst 0x4e9eac34 // usmmla v20.4s, v1.16b, v30.16b\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4e9dac31 // usmmla v17.4s, v1.16b, v29.16b\n"
- ".inst 0x4e9cac35 // usmmla v21.4s, v1.16b, v28.16b\n"
- ".inst 0x4e9bac32 // usmmla v18.4s, v1.16b, v27.16b\n"
- ".inst 0x4e9aac36 // usmmla v22.4s, v1.16b, v26.16b\n"
- ".inst 0x4e99ac33 // usmmla v19.4s, v1.16b, v25.16b\n"
- ".inst 0x4e98ac37 // usmmla v23.4s, v1.16b, v24.16b\n"
- "tbnz %x[flags], #31, 42f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
- "42:" // Height 2: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "43:" // Height 2: Multiply loop: Main loop skip
- "cbz x25, 52f\n"
- "cmp x25, #0x8\n"
- "blt 46f\n"
- "44:" // Height 2: Multiply loop: Odd block loop
- "ldr d25, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "trn1 v0.2d, v25.2d, v24.2d\n"
- "tbnz %x[flags], #31, 45f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "45:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
- "sub x25, x25, #0x8\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "cmp x25, #0x8\n"
- "ldr q27, [x28, #0x40]\n"
- "ldr q26, [x28, #0x50]\n"
- "ldr q25, [x28, #0x60]\n"
- ".inst 0x4e98ac10 // usmmla v16.4s, v0.16b, v24.16b\n"
- "ldr q24, [x28, #0x70]\n"
- ".inst 0x4e9eac14 // usmmla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x4e9dac11 // usmmla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x4e9cac15 // usmmla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e9bac12 // usmmla v18.4s, v0.16b, v27.16b\n"
- ".inst 0x4e9aac16 // usmmla v22.4s, v0.16b, v26.16b\n"
- ".inst 0x4e99ac13 // usmmla v19.4s, v0.16b, v25.16b\n"
- ".inst 0x4e98ac17 // usmmla v23.4s, v0.16b, v24.16b\n"
- "bge 44b\n"
- "46:" // Height 2: Multiply loop: Skip odd blocks
- "cbz x25, 52f\n"
- "tbz x25, #2, 48f\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "tbz x25, #1, 47f\n"
- "ld1 { v1.h }[2], [x24], #0x2\n"
- "ld1 { v2.h }[2], [x23], #0x2\n"
- "tbz x25, #0, 50f\n"
- "ld1 { v1.b }[6], [x24]\n"
- "ld1 { v2.b }[6], [x23]\n"
- "b 50f\n"
- "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
- "tbz x25, #0, 50f\n"
- "ld1 { v1.b }[4], [x24]\n"
- "ld1 { v2.b }[4], [x23]\n"
- "b 50f\n"
- "48:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
- "tbz x25, #1, 49f\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "tbz x25, #0, 50f\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "b 50f\n"
- "49:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "50:" // Height 2: Multiply loop: Ragged operand read: Done
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "tbnz %x[flags], #31, 51f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "51:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q24, [x28, #0x0]\n"
- "ldr q30, [x28, #0x10]\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q28, [x28, #0x30]\n"
- "ldr q27, [x28, #0x40]\n"
- "ldr q26, [x28, #0x50]\n"
- "ldr q25, [x28, #0x60]\n"
- ".inst 0x4e98ac10 // usmmla v16.4s, v0.16b, v24.16b\n"
- "ldr q24, [x28, #0x70]\n"
- ".inst 0x4e9eac14 // usmmla v20.4s, v0.16b, v30.16b\n"
- ".inst 0x4e9dac11 // usmmla v17.4s, v0.16b, v29.16b\n"
- ".inst 0x4e9cac15 // usmmla v21.4s, v0.16b, v28.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e9bac12 // usmmla v18.4s, v0.16b, v27.16b\n"
- ".inst 0x4e9aac16 // usmmla v22.4s, v0.16b, v26.16b\n"
- ".inst 0x4e99ac13 // usmmla v19.4s, v0.16b, v25.16b\n"
- ".inst 0x4e98ac17 // usmmla v23.4s, v0.16b, v24.16b\n"
- "52:" // Height 2: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 36b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 v24.2d, v16.2d, v20.2d\n"
- "uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "uzp1 v20.2d, v17.2d, v21.2d\n"
- "uzp2 v17.2d, v17.2d, v21.2d\n"
- "uzp1 v21.2d, v18.2d, v22.2d\n"
- "uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x24, x27, x20\n"
- "uzp1 v22.2d, v19.2d, v23.2d\n"
- "uzp2 v19.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "mov v23.16b, v24.16b\n"
- "tbnz %x[flags], #31, 53f\n"
- "add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "ld1r { v24.4s }, [x20]\n"
- "neg v24.4s, v24.4s\n"
- "dup v12.4s, v11.s[3]\n"
- "dup v11.4s, v11.s[0]\n"
- "mul v11.4s, v11.4s, v24.4s\n"
- "mul v12.4s, v12.4s, v24.4s\n"
- "53:" // Height 2: skip row sum fixup
- "ldr q28, [x10, #0x0]\n"
- "ldr q27, [x10, #0x10]\n"
- "add v23.4s, v23.4s, v11.4s\n"
- "add v20.4s, v20.4s, v11.4s\n"
- "ldr q26, [x10, #0x20]\n"
- "ldr q25, [x10, #0x30]\n"
- "add v21.4s, v21.4s, v11.4s\n"
- "add v22.4s, v22.4s, v11.4s\n"
- "add v16.4s, v16.4s, v12.4s\n"
- "add v17.4s, v17.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "ld1r { v24.4s }, [x20]\n"
- "add v18.4s, v18.4s, v12.4s\n"
- "add v19.4s, v19.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add v23.4s, v23.4s, v28.4s\n"
- "add v20.4s, v20.4s, v27.4s\n"
- "add x10, x10, #0x40\n"
- "add v21.4s, v21.4s, v26.4s\n"
- "add v22.4s, v22.4s, v25.4s\n"
- "add v16.4s, v16.4s, v28.4s\n"
- "ld1r { v0.4s }, [x20]\n"
- "add v17.4s, v17.4s, v27.4s\n"
- "add v18.4s, v18.4s, v26.4s\n"
- "add v19.4s, v19.4s, v25.4s\n"
- "sqrdmulh v23.4s, v23.4s, v24.4s\n"
- "sqrdmulh v20.4s, v20.4s, v24.4s\n"
- "sqrdmulh v21.4s, v21.4s, v24.4s\n"
- "sqrdmulh v22.4s, v22.4s, v24.4s\n"
- "sqrdmulh v16.4s, v16.4s, v24.4s\n"
- "sqrdmulh v17.4s, v17.4s, v24.4s\n"
- "sqrdmulh v18.4s, v18.4s, v24.4s\n"
- "sqrdmulh v19.4s, v19.4s, v24.4s\n"
- "tbz %x[flags], #5, 54f\n"
- "and v24.16b, v23.16b, v0.16b\n"
- "and v30.16b, v20.16b, v0.16b\n"
- "and v29.16b, v21.16b, v0.16b\n"
- "and v28.16b, v22.16b, v0.16b\n"
- "and v27.16b, v16.16b, v0.16b\n"
- "and v26.16b, v17.16b, v0.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "and v25.16b, v18.16b, v0.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v24.4s\n"
- "and v24.16b, v19.16b, v0.16b\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v30.4s\n"
- "sqadd v21.4s, v21.4s, v29.4s\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v28.4s\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sqadd v17.4s, v17.4s, v26.4s\n"
- "sqadd v18.4s, v18.4s, v25.4s\n"
- "sqadd v19.4s, v19.4s, v24.4s\n"
- "54:" // Height 2: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "srshl v20.4s, v20.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v26.4s }, [x21]\n"
- "ld1r { v25.4s }, [x20]\n"
- "srshl v21.4s, v21.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
- "srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v24.4s }, [x20]\n"
- "srshl v18.4s, v18.4s, v0.4s\n"
- "srshl v19.4s, v19.4s, v0.4s\n"
- "add v23.4s, v23.4s, v26.4s\n"
- "add v20.4s, v20.4s, v26.4s\n"
- "add v21.4s, v21.4s, v26.4s\n"
- "add v22.4s, v22.4s, v26.4s\n"
- "add v16.4s, v16.4s, v26.4s\n"
- "add v17.4s, v17.4s, v26.4s\n"
- "add v18.4s, v18.4s, v26.4s\n"
- "add v19.4s, v19.4s, v26.4s\n"
- "smin v23.4s, v23.4s, v25.4s\n"
- "smin v20.4s, v20.4s, v25.4s\n"
- "smin v21.4s, v21.4s, v25.4s\n"
- "smin v22.4s, v22.4s, v25.4s\n"
- "smin v16.4s, v16.4s, v25.4s\n"
- "smin v17.4s, v17.4s, v25.4s\n"
- "smin v18.4s, v18.4s, v25.4s\n"
- "smin v19.4s, v19.4s, v25.4s\n"
- "smax v23.4s, v23.4s, v24.4s\n"
- "smax v20.4s, v20.4s, v24.4s\n"
- "smax v21.4s, v21.4s, v24.4s\n"
- "smax v22.4s, v22.4s, v24.4s\n"
- "smax v16.4s, v16.4s, v24.4s\n"
- "smax v17.4s, v17.4s, v24.4s\n"
- "smax v18.4s, v18.4s, v24.4s\n"
- "smax v19.4s, v19.4s, v24.4s\n"
- "uzp1 v23.8h, v23.8h, v20.8h\n"
- "uzp1 v20.8h, v21.8h, v22.8h\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "uzp1 v23.16b, v23.16b, v20.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "bge 63f\n"
- "tbz x9, #3, 58f\n"
- "str d23, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
- "tbz x9, #2, 56f\n"
- "st1 { v23.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
- "tbz x9, #1, 55f\n"
- "st1 { v23.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
- "tbz x9, #0, 62f\n"
- "st1 { v23.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x24]\n"
- "b 62f\n"
- "55:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x9, #0, 62f\n"
- "st1 { v23.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x24]\n"
- "b 62f\n"
- "56:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x9, #1, 57f\n"
- "st1 { v23.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
- "tbz x9, #0, 62f\n"
- "st1 { v23.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x24]\n"
- "b 62f\n"
- "57:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x9, #0, 62f\n"
- "st1 { v23.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x24]\n"
- "b 62f\n"
- "58:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x9, #2, 60f\n"
- "str s23, [x27], #0x4\n"
- "str s16, [x24], #0x4\n"
- "tbz x9, #1, 59f\n"
- "st1 { v23.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
- "tbz x9, #0, 62f\n"
- "st1 { v23.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x24]\n"
- "b 62f\n"
- "59:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x9, #0, 62f\n"
- "st1 { v23.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x24]\n"
- "b 62f\n"
- "60:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x9, #1, 61f\n"
- "str h23, [x27], #0x2\n"
- "str h16, [x24], #0x2\n"
- "tbz x9, #0, 62f\n"
- "st1 { v23.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x24]\n"
- "b 62f\n"
- "61:" // Height 2: Partial direct writeback: partial_1_0
- "str b23, [x27, #0x0]\n"
- "str b16, [x24, #0x0]\n"
- "62:" // Height 2: Partial direct writeback: Done
- "b 64f\n"
- "63:" // Height 2: Full writeback
- "str q23, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "str q16, [x24, #0x0]\n"
- "64:" // Height 2: Writeback done
- "subs x9, x9, #0x10\n"
- "bgt 34b\n"
- "b 130f\n"
- "65:" // Height 3
- "mov x10, %x[col_bias]\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "movi v13.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "66:" // Height 3: Column loop
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "67:" // Height 3: setup done
- "mov x26, #0x0\n"
- "68:" // Height 3: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 69f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "cbnz x26, 70f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "b 70f\n"
- "69:" // Height 3: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "70:" // Height 3: input setup done
- "cmp x25, #0x10\n"
- "blt 75f\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x23, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q5, [x28, #0x0]\n"
- "ldr q6, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q8, [x28, #0x30]\n"
- "ldr q9, [x28, #0x40]\n"
- "ldr q10, [x28, #0x50]\n"
- "blt 73f\n"
- "71:" // Height 3: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q14, [x28, #0x60]\n"
- ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n"
- ".inst 0x4e85ac58 // usmmla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
- ".inst 0x4e86ac5c // usmmla v28.4s, v2.16b, v6.16b\n"
- "ldr q4, [x28, #0x80]\n"
- ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n"
- ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4e88ac5d // usmmla v29.4s, v2.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
- ".inst 0x4e89ac5a // usmmla v26.4s, v2.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
- ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n"
- ".inst 0x4e8aac5e // usmmla v30.4s, v2.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
- ".inst 0x4e8eac13 // usmmla v19.4s, v0.16b, v14.16b\n"
- ".inst 0x4e8eac5b // usmmla v27.4s, v2.16b, v14.16b\n"
- "ldr q6, [x28, #0xd0]\n"
- ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85ac5f // usmmla v31.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x4e84ac30 // usmmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84ac78 // usmmla v24.4s, v3.16b, v4.16b\n"
- "ldr q4, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87ac7c // usmmla v28.4s, v3.16b, v7.16b\n"
- ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n"
- ".inst 0x4e88ac79 // usmmla v25.4s, v3.16b, v8.16b\n"
- ".inst 0x4e89ac35 // usmmla v21.4s, v1.16b, v9.16b\n"
- ".inst 0x4e89ac7d // usmmla v29.4s, v3.16b, v9.16b\n"
- ".inst 0x4e8aac32 // usmmla v18.4s, v1.16b, v10.16b\n"
- ".inst 0x4e8aac7a // usmmla v26.4s, v3.16b, v10.16b\n"
- ".inst 0x4e86ac36 // usmmla v22.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac7e // usmmla v30.4s, v3.16b, v6.16b\n"
- ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85ac7b // usmmla v27.4s, v3.16b, v5.16b\n"
- ".inst 0x4e84ac37 // usmmla v23.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84ac7f // usmmla v31.4s, v3.16b, v4.16b\n"
- "tbnz %x[flags], #31, 72f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
- "72:" // Height 3: Multiply loop: unique 9: skip row sum
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x23, #0x0]\n"
- "sub x25, x25, #0x10\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q5, [x28, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q6, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q8, [x28, #0x30]\n"
- "ldr q9, [x28, #0x40]\n"
- "ldr q10, [x28, #0x50]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "bge 71b\n"
- "73:" // Height 3: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q14, [x28, #0x60]\n"
- ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n"
- ".inst 0x4e85ac58 // usmmla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
- ".inst 0x4e86ac5c // usmmla v28.4s, v2.16b, v6.16b\n"
- "ldr q4, [x28, #0x80]\n"
- ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n"
- ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4e88ac5d // usmmla v29.4s, v2.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
- ".inst 0x4e89ac5a // usmmla v26.4s, v2.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
- ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n"
- ".inst 0x4e8aac5e // usmmla v30.4s, v2.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
- ".inst 0x4e8eac13 // usmmla v19.4s, v0.16b, v14.16b\n"
- ".inst 0x4e8eac5b // usmmla v27.4s, v2.16b, v14.16b\n"
- "ldr q6, [x28, #0xd0]\n"
- ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85ac5f // usmmla v31.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x4e84ac30 // usmmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84ac78 // usmmla v24.4s, v3.16b, v4.16b\n"
- "ldr q4, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87ac7c // usmmla v28.4s, v3.16b, v7.16b\n"
- ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n"
- ".inst 0x4e88ac79 // usmmla v25.4s, v3.16b, v8.16b\n"
- ".inst 0x4e89ac35 // usmmla v21.4s, v1.16b, v9.16b\n"
- ".inst 0x4e89ac7d // usmmla v29.4s, v3.16b, v9.16b\n"
- ".inst 0x4e8aac32 // usmmla v18.4s, v1.16b, v10.16b\n"
- ".inst 0x4e8aac7a // usmmla v26.4s, v3.16b, v10.16b\n"
- ".inst 0x4e86ac36 // usmmla v22.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac7e // usmmla v30.4s, v3.16b, v6.16b\n"
- ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85ac7b // usmmla v27.4s, v3.16b, v5.16b\n"
- ".inst 0x4e84ac37 // usmmla v23.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84ac7f // usmmla v31.4s, v3.16b, v4.16b\n"
- "tbnz %x[flags], #31, 74f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
- "74:" // Height 3: Multiply loop: unique 10: skip row sum
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "75:" // Height 3: Multiply loop: Main loop skip
- "cbz x25, 84f\n"
- "cmp x25, #0x8\n"
- "blt 78f\n"
- "76:" // Height 3: Multiply loop: Odd block loop
- "ldr d3, [x24], #0x8\n"
- "ldr d0, [x23], #0x8\n"
- "ldr d1, [x22], #0x8\n"
- "trn1 v0.2d, v3.2d, v0.2d\n"
- "trn1 v2.2d, v1.2d, v2.2d\n"
- "tbnz %x[flags], #31, 77f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "77:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "sub x25, x25, #0x8\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- "cmp x25, #0x8\n"
- "ldr q5, [x28, #0x40]\n"
- "ldr q4, [x28, #0x50]\n"
- "ldr q3, [x28, #0x60]\n"
- ".inst 0x4e81ac10 // usmmla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81ac58 // usmmla v24.4s, v2.16b, v1.16b\n"
- "ldr q1, [x28, #0x70]\n"
- ".inst 0x4e88ac14 // usmmla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88ac5c // usmmla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86ac15 // usmmla v21.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86ac5d // usmmla v29.4s, v2.16b, v6.16b\n"
- ".inst 0x4e85ac12 // usmmla v18.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85ac5a // usmmla v26.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84ac16 // usmmla v22.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84ac5e // usmmla v30.4s, v2.16b, v4.16b\n"
- ".inst 0x4e83ac13 // usmmla v19.4s, v0.16b, v3.16b\n"
- ".inst 0x4e83ac5b // usmmla v27.4s, v2.16b, v3.16b\n"
- ".inst 0x4e81ac17 // usmmla v23.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81ac5f // usmmla v31.4s, v2.16b, v1.16b\n"
- "bge 76b\n"
- "78:" // Height 3: Multiply loop: Skip odd blocks
- "cbz x25, 84f\n"
- "tbz x25, #2, 80f\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "tbz x25, #1, 79f\n"
- "ld1 { v1.h }[2], [x24], #0x2\n"
- "ld1 { v2.h }[2], [x23], #0x2\n"
- "ld1 { v3.h }[2], [x22], #0x2\n"
- "tbz x25, #0, 82f\n"
- "ld1 { v1.b }[6], [x24]\n"
- "ld1 { v2.b }[6], [x23]\n"
- "ld1 { v3.b }[6], [x22]\n"
- "b 82f\n"
- "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
- "tbz x25, #0, 82f\n"
- "ld1 { v1.b }[4], [x24]\n"
- "ld1 { v2.b }[4], [x23]\n"
- "ld1 { v3.b }[4], [x22]\n"
- "b 82f\n"
- "80:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
- "tbz x25, #1, 81f\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "tbz x25, #0, 82f\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
- "b 82f\n"
- "81:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
- "82:" // Height 3: Multiply loop: Ragged operand read: Done
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "tbnz %x[flags], #31, 83f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "83:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- "ldr q5, [x28, #0x40]\n"
- "ldr q4, [x28, #0x50]\n"
- "ldr q3, [x28, #0x60]\n"
- ".inst 0x4e81ac10 // usmmla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81ac58 // usmmla v24.4s, v2.16b, v1.16b\n"
- "ldr q1, [x28, #0x70]\n"
- ".inst 0x4e88ac14 // usmmla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88ac5c // usmmla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86ac15 // usmmla v21.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86ac5d // usmmla v29.4s, v2.16b, v6.16b\n"
- ".inst 0x4e85ac12 // usmmla v18.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85ac5a // usmmla v26.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84ac16 // usmmla v22.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84ac5e // usmmla v30.4s, v2.16b, v4.16b\n"
- ".inst 0x4e83ac13 // usmmla v19.4s, v0.16b, v3.16b\n"
- ".inst 0x4e83ac5b // usmmla v27.4s, v2.16b, v3.16b\n"
- ".inst 0x4e81ac17 // usmmla v23.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81ac5f // usmmla v31.4s, v2.16b, v1.16b\n"
- "84:" // Height 3: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 68b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 v0.2d, v16.2d, v20.2d\n"
- "uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "uzp1 v20.2d, v17.2d, v21.2d\n"
- "uzp2 v17.2d, v17.2d, v21.2d\n"
- "uzp1 v21.2d, v18.2d, v22.2d\n"
- "uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
- "uzp1 v22.2d, v19.2d, v23.2d\n"
- "uzp2 v19.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "uzp1 v24.2d, v24.2d, v28.2d\n"
- "uzp1 v25.2d, v25.2d, v29.2d\n"
- "uzp1 v26.2d, v26.2d, v30.2d\n"
- "uzp1 v27.2d, v27.2d, v31.2d\n"
- "mov v31.16b, v0.16b\n"
- "tbnz %x[flags], #31, 85f\n"
- "add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "addp v13.4s, v13.4s, v13.4s\n"
- "ld1r { v23.4s }, [x20]\n"
- "neg v23.4s, v23.4s\n"
- "dup v12.4s, v11.s[3]\n"
- "dup v11.4s, v11.s[0]\n"
- "dup v13.4s, v13.s[0]\n"
- "mul v11.4s, v11.4s, v23.4s\n"
- "mul v12.4s, v12.4s, v23.4s\n"
- "mul v13.4s, v13.4s, v23.4s\n"
- "85:" // Height 3: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q30, [x10, #0x10]\n"
- "add v31.4s, v31.4s, v11.4s\n"
- "add v20.4s, v20.4s, v11.4s\n"
- "ldr q29, [x10, #0x20]\n"
- "ldr q28, [x10, #0x30]\n"
- "add v21.4s, v21.4s, v11.4s\n"
- "add v22.4s, v22.4s, v11.4s\n"
- "add v16.4s, v16.4s, v12.4s\n"
- "add v17.4s, v17.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "ld1r { v23.4s }, [x20]\n"
- "add v18.4s, v18.4s, v12.4s\n"
- "add v19.4s, v19.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add v24.4s, v24.4s, v13.4s\n"
- "add v25.4s, v25.4s, v13.4s\n"
- "add x10, x10, #0x40\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "add v27.4s, v27.4s, v13.4s\n"
- "add v31.4s, v31.4s, v0.4s\n"
- "add v20.4s, v20.4s, v30.4s\n"
- "add v21.4s, v21.4s, v29.4s\n"
- "add v22.4s, v22.4s, v28.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v30.4s\n"
- "add v18.4s, v18.4s, v29.4s\n"
- "add v19.4s, v19.4s, v28.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x20]\n"
- "add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v26.4s, v29.4s\n"
- "add v27.4s, v27.4s, v28.4s\n"
- "sqrdmulh v31.4s, v31.4s, v23.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sqrdmulh v21.4s, v21.4s, v23.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "sqrdmulh v16.4s, v16.4s, v23.4s\n"
- "sqrdmulh v17.4s, v17.4s, v23.4s\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sqrdmulh v24.4s, v24.4s, v23.4s\n"
- "sqrdmulh v25.4s, v25.4s, v23.4s\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "sqrdmulh v27.4s, v27.4s, v23.4s\n"
- "tbz %x[flags], #5, 86f\n"
- "and v1.16b, v31.16b, v0.16b\n"
- "and v30.16b, v20.16b, v0.16b\n"
- "and v29.16b, v21.16b, v0.16b\n"
- "and v28.16b, v22.16b, v0.16b\n"
- "and v23.16b, v16.16b, v0.16b\n"
- "and v3.16b, v17.16b, v0.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v2.16b, v18.16b, v0.16b\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "sqadd v20.4s, v20.4s, v30.4s\n"
- "sqadd v21.4s, v21.4s, v29.4s\n"
- "sqadd v22.4s, v22.4s, v28.4s\n"
- "sqadd v16.4s, v16.4s, v23.4s\n"
- "and v1.16b, v19.16b, v0.16b\n"
- "and v30.16b, v24.16b, v0.16b\n"
- "and v29.16b, v25.16b, v0.16b\n"
- "and v28.16b, v26.16b, v0.16b\n"
- "and v23.16b, v27.16b, v0.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v3.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "sqadd v24.4s, v24.4s, v30.4s\n"
- "sqadd v25.4s, v25.4s, v29.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v27.4s, v27.4s, v23.4s\n"
- "86:" // Height 3: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
- "srshl v31.4s, v31.4s, v0.4s\n"
- "srshl v20.4s, v20.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v29.4s }, [x21]\n"
- "ld1r { v28.4s }, [x20]\n"
- "srshl v21.4s, v21.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
- "srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v23.4s }, [x20]\n"
- "srshl v18.4s, v18.4s, v0.4s\n"
- "srshl v19.4s, v19.4s, v0.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
- "srshl v25.4s, v25.4s, v0.4s\n"
- "srshl v26.4s, v26.4s, v0.4s\n"
- "srshl v27.4s, v27.4s, v0.4s\n"
- "add v31.4s, v31.4s, v29.4s\n"
- "add v20.4s, v20.4s, v29.4s\n"
- "add v21.4s, v21.4s, v29.4s\n"
- "add v22.4s, v22.4s, v29.4s\n"
- "add v16.4s, v16.4s, v29.4s\n"
- "add v17.4s, v17.4s, v29.4s\n"
- "add v18.4s, v18.4s, v29.4s\n"
- "add v19.4s, v19.4s, v29.4s\n"
- "add v24.4s, v24.4s, v29.4s\n"
- "add v25.4s, v25.4s, v29.4s\n"
- "add v26.4s, v26.4s, v29.4s\n"
- "add v27.4s, v27.4s, v29.4s\n"
- "smin v31.4s, v31.4s, v28.4s\n"
- "smin v20.4s, v20.4s, v28.4s\n"
- "smin v21.4s, v21.4s, v28.4s\n"
- "smin v22.4s, v22.4s, v28.4s\n"
- "smin v16.4s, v16.4s, v28.4s\n"
- "smin v17.4s, v17.4s, v28.4s\n"
- "smin v18.4s, v18.4s, v28.4s\n"
- "smin v19.4s, v19.4s, v28.4s\n"
- "smin v24.4s, v24.4s, v28.4s\n"
- "smin v25.4s, v25.4s, v28.4s\n"
- "smin v26.4s, v26.4s, v28.4s\n"
- "smin v27.4s, v27.4s, v28.4s\n"
- "smax v31.4s, v31.4s, v23.4s\n"
- "smax v20.4s, v20.4s, v23.4s\n"
- "smax v21.4s, v21.4s, v23.4s\n"
- "smax v22.4s, v22.4s, v23.4s\n"
- "smax v16.4s, v16.4s, v23.4s\n"
- "smax v17.4s, v17.4s, v23.4s\n"
- "smax v18.4s, v18.4s, v23.4s\n"
- "smax v19.4s, v19.4s, v23.4s\n"
- "smax v24.4s, v24.4s, v23.4s\n"
- "smax v25.4s, v25.4s, v23.4s\n"
- "smax v26.4s, v26.4s, v23.4s\n"
- "smax v27.4s, v27.4s, v23.4s\n"
- "uzp1 v31.8h, v31.8h, v20.8h\n"
- "uzp1 v20.8h, v21.8h, v22.8h\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v18.8h, v18.8h, v19.8h\n"
- "uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v17.8h, v26.8h, v27.8h\n"
- "uzp1 v31.16b, v31.16b, v20.16b\n"
- "uzp1 v16.16b, v16.16b, v18.16b\n"
- "uzp1 v24.16b, v24.16b, v17.16b\n"
- "bge 95f\n"
- "tbz x9, #3, 90f\n"
- "str d31, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x9, #2, 88f\n"
- "st1 { v31.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x9, #1, 87f\n"
- "st1 { v31.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "tbz x9, #0, 94f\n"
- "st1 { v31.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x24]\n"
- "st1 { v24.b }[14], [x23]\n"
- "b 94f\n"
- "87:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x9, #0, 94f\n"
- "st1 { v31.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x24]\n"
- "st1 { v24.b }[12], [x23]\n"
- "b 94f\n"
- "88:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x9, #1, 89f\n"
- "st1 { v31.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "tbz x9, #0, 94f\n"
- "st1 { v31.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x24]\n"
- "st1 { v24.b }[10], [x23]\n"
- "b 94f\n"
- "89:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x9, #0, 94f\n"
- "st1 { v31.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x24]\n"
- "st1 { v24.b }[8], [x23]\n"
- "b 94f\n"
- "90:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x9, #2, 92f\n"
- "str s31, [x27], #0x4\n"
- "str s16, [x24], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x9, #1, 91f\n"
- "st1 { v31.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "tbz x9, #0, 94f\n"
- "st1 { v31.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x24]\n"
- "st1 { v24.b }[6], [x23]\n"
- "b 94f\n"
- "91:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x9, #0, 94f\n"
- "st1 { v31.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x24]\n"
- "st1 { v24.b }[4], [x23]\n"
- "b 94f\n"
- "92:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x9, #1, 93f\n"
- "str h31, [x27], #0x2\n"
- "str h16, [x24], #0x2\n"
- "str h24, [x23], #0x2\n"
- "tbz x9, #0, 94f\n"
- "st1 { v31.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x24]\n"
- "st1 { v24.b }[2], [x23]\n"
- "b 94f\n"
- "93:" // Height 3: Partial direct writeback: partial_1_0
- "str b31, [x27, #0x0]\n"
- "str b16, [x24, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "94:" // Height 3: Partial direct writeback: Done
- "b 96f\n"
- "95:" // Height 3: Full writeback
- "str q31, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "str q16, [x24, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "96:" // Height 3: Writeback done
- "subs x9, x9, #0x10\n"
- "bgt 66b\n"
- "b 130f\n"
- "97:" // Height 4
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "mov x20, #0x4\n"
- "mov x10, %x[col_bias]\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
- "movi v15.16b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "98:" // Height 4: Column loop
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "99:" // Height 4: setup done
- "mov x26, #0x0\n"
- "100:" // Height 4: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 101f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "ldr x21, [x20, #0x18]\n"
- "cbnz x26, 102f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "add x21, x21, x20\n"
- "b 102f\n"
- "101:" // Height 4: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "add x21, x22, x21\n"
- "102:" // Height 4: input setup done
- "cmp x25, #0x10\n"
- "blt 107f\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x23, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "ldr q5, [x28, #0x0]\n"
- "ldr q6, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q8, [x28, #0x30]\n"
- "ldr q9, [x28, #0x40]\n"
- "ldr q10, [x28, #0x50]\n"
- "blt 105f\n"
- "103:" // Height 4: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q4, [x28, #0x60]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n"
- ".inst 0x4e85ac58 // usmmla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
- ".inst 0x4e86ac5c // usmmla v28.4s, v2.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4e88ac5d // usmmla v29.4s, v2.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
- ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n"
- ".inst 0x4e89ac5a // usmmla v26.4s, v2.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
- ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n"
- ".inst 0x4e8aac5e // usmmla v30.4s, v2.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
- ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84ac5b // usmmla v27.4s, v2.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85ac5f // usmmla v31.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x4e86ac30 // usmmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac78 // usmmla v24.4s, v3.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87ac7c // usmmla v28.4s, v3.16b, v7.16b\n"
- ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n"
- ".inst 0x4e88ac79 // usmmla v25.4s, v3.16b, v8.16b\n"
- ".inst 0x4e89ac35 // usmmla v21.4s, v1.16b, v9.16b\n"
- ".inst 0x4e89ac7d // usmmla v29.4s, v3.16b, v9.16b\n"
- ".inst 0x4e8aac32 // usmmla v18.4s, v1.16b, v10.16b\n"
- ".inst 0x4e8aac7a // usmmla v26.4s, v3.16b, v10.16b\n"
- ".inst 0x4e84ac36 // usmmla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84ac7e // usmmla v30.4s, v3.16b, v4.16b\n"
- ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85ac7b // usmmla v27.4s, v3.16b, v5.16b\n"
- ".inst 0x4e86ac37 // usmmla v23.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac7f // usmmla v31.4s, v3.16b, v6.16b\n"
- "tbnz %x[flags], #31, 104f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
- "104:" // Height 4: Multiply loop: unique 13: skip row sum
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x23, #0x0]\n"
- "sub x25, x25, #0x10\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "cmp x25, #0x20\n"
- "ldr q5, [x28, #0x0]\n"
- "ldr q6, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q8, [x28, #0x30]\n"
- "ldr q9, [x28, #0x40]\n"
- "ldr q10, [x28, #0x50]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "bge 103b\n"
- "105:" // Height 4: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q4, [x28, #0x60]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n"
- ".inst 0x4e85ac58 // usmmla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
- ".inst 0x4e86ac5c // usmmla v28.4s, v2.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4e88ac5d // usmmla v29.4s, v2.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
- ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n"
- ".inst 0x4e89ac5a // usmmla v26.4s, v2.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
- ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n"
- ".inst 0x4e8aac5e // usmmla v30.4s, v2.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
- ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84ac5b // usmmla v27.4s, v2.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85ac5f // usmmla v31.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x4e86ac30 // usmmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac78 // usmmla v24.4s, v3.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87ac7c // usmmla v28.4s, v3.16b, v7.16b\n"
- ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n"
- ".inst 0x4e88ac79 // usmmla v25.4s, v3.16b, v8.16b\n"
- ".inst 0x4e89ac35 // usmmla v21.4s, v1.16b, v9.16b\n"
- ".inst 0x4e89ac7d // usmmla v29.4s, v3.16b, v9.16b\n"
- ".inst 0x4e8aac32 // usmmla v18.4s, v1.16b, v10.16b\n"
- ".inst 0x4e8aac7a // usmmla v26.4s, v3.16b, v10.16b\n"
- ".inst 0x4e84ac36 // usmmla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84ac7e // usmmla v30.4s, v3.16b, v4.16b\n"
- ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85ac7b // usmmla v27.4s, v3.16b, v5.16b\n"
- ".inst 0x4e86ac37 // usmmla v23.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac7f // usmmla v31.4s, v3.16b, v6.16b\n"
- "tbnz %x[flags], #31, 106f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
- "106:" // Height 4: Multiply loop: unique 14: skip row sum
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "107:" // Height 4: Multiply loop: Main loop skip
- "cbz x25, 116f\n"
- "cmp x25, #0x8\n"
- "blt 110f\n"
- "108:" // Height 4: Multiply loop: Odd block loop
- "ldr d3, [x24], #0x8\n"
- "ldr d0, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "trn1 v0.2d, v3.2d, v0.2d\n"
- "trn1 v2.2d, v2.2d, v1.2d\n"
- "tbnz %x[flags], #31, 109f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "109:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "sub x25, x25, #0x8\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- "cmp x25, #0x8\n"
- "ldr q5, [x28, #0x40]\n"
- "ldr q4, [x28, #0x50]\n"
- "ldr q3, [x28, #0x60]\n"
- ".inst 0x4e81ac10 // usmmla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81ac58 // usmmla v24.4s, v2.16b, v1.16b\n"
- "ldr q1, [x28, #0x70]\n"
- ".inst 0x4e88ac14 // usmmla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88ac5c // usmmla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86ac15 // usmmla v21.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86ac5d // usmmla v29.4s, v2.16b, v6.16b\n"
- ".inst 0x4e85ac12 // usmmla v18.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85ac5a // usmmla v26.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84ac16 // usmmla v22.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84ac5e // usmmla v30.4s, v2.16b, v4.16b\n"
- ".inst 0x4e83ac13 // usmmla v19.4s, v0.16b, v3.16b\n"
- ".inst 0x4e83ac5b // usmmla v27.4s, v2.16b, v3.16b\n"
- ".inst 0x4e81ac17 // usmmla v23.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81ac5f // usmmla v31.4s, v2.16b, v1.16b\n"
- "bge 108b\n"
- "110:" // Height 4: Multiply loop: Skip odd blocks
- "cbz x25, 116f\n"
- "tbz x25, #2, 112f\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s9, [x21], #0x4\n"
- "tbz x25, #1, 111f\n"
- "ld1 { v1.h }[2], [x24], #0x2\n"
- "ld1 { v2.h }[2], [x23], #0x2\n"
- "ld1 { v3.h }[2], [x22], #0x2\n"
- "ld1 { v9.h }[2], [x21], #0x2\n"
- "tbz x25, #0, 114f\n"
- "ld1 { v1.b }[6], [x24]\n"
- "ld1 { v2.b }[6], [x23]\n"
- "ld1 { v3.b }[6], [x22]\n"
- "ld1 { v9.b }[6], [x21]\n"
- "b 114f\n"
- "111:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
- "tbz x25, #0, 114f\n"
- "ld1 { v1.b }[4], [x24]\n"
- "ld1 { v2.b }[4], [x23]\n"
- "ld1 { v3.b }[4], [x22]\n"
- "ld1 { v9.b }[4], [x21]\n"
- "b 114f\n"
- "112:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
- "tbz x25, #1, 113f\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h9, [x21], #0x2\n"
- "tbz x25, #0, 114f\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
- "ld1 { v9.b }[2], [x21]\n"
- "b 114f\n"
- "113:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
- "ldr b9, [x21, #0x0]\n"
- "114:" // Height 4: Multiply loop: Ragged operand read: Done
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v9.2d\n"
- "tbnz %x[flags], #31, 115f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "115:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q1, [x28, #0x0]\n"
- "ldr q8, [x28, #0x10]\n"
- "ldr q7, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- "ldr q5, [x28, #0x40]\n"
- "ldr q4, [x28, #0x50]\n"
- "ldr q3, [x28, #0x60]\n"
- ".inst 0x4e81ac10 // usmmla v16.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81ac58 // usmmla v24.4s, v2.16b, v1.16b\n"
- "ldr q1, [x28, #0x70]\n"
- ".inst 0x4e88ac14 // usmmla v20.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88ac5c // usmmla v28.4s, v2.16b, v8.16b\n"
- "add x28, x28, #0x80\n"
- ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86ac15 // usmmla v21.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86ac5d // usmmla v29.4s, v2.16b, v6.16b\n"
- ".inst 0x4e85ac12 // usmmla v18.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85ac5a // usmmla v26.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84ac16 // usmmla v22.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84ac5e // usmmla v30.4s, v2.16b, v4.16b\n"
- ".inst 0x4e83ac13 // usmmla v19.4s, v0.16b, v3.16b\n"
- ".inst 0x4e83ac5b // usmmla v27.4s, v2.16b, v3.16b\n"
- ".inst 0x4e81ac17 // usmmla v23.4s, v0.16b, v1.16b\n"
- ".inst 0x4e81ac5f // usmmla v31.4s, v2.16b, v1.16b\n"
- "116:" // Height 4: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 100b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 v0.2d, v16.2d, v20.2d\n"
- "uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "uzp1 v20.2d, v17.2d, v21.2d\n"
- "uzp2 v17.2d, v17.2d, v21.2d\n"
- "uzp1 v21.2d, v18.2d, v22.2d\n"
- "uzp2 v18.2d, v18.2d, v22.2d\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "uzp1 v22.2d, v19.2d, v23.2d\n"
- "uzp2 v19.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "uzp1 v23.2d, v24.2d, v28.2d\n"
- "uzp2 v24.2d, v24.2d, v28.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "uzp1 v28.2d, v25.2d, v29.2d\n"
- "uzp2 v25.2d, v25.2d, v29.2d\n"
- "uzp1 v29.2d, v26.2d, v30.2d\n"
- "uzp2 v26.2d, v26.2d, v30.2d\n"
- "uzp1 v30.2d, v27.2d, v31.2d\n"
- "uzp2 v27.2d, v27.2d, v31.2d\n"
- "mov v31.16b, v0.16b\n"
- "tbnz %x[flags], #31, 117f\n"
- "add x20, %x[qp], %[b_offset]\n"
- "addp v11.4s, v11.4s, v11.4s\n"
- "addp v13.4s, v13.4s, v13.4s\n"
- "ld1r { v0.4s }, [x20]\n"
- "neg v0.4s, v0.4s\n"
- "dup v12.4s, v11.s[3]\n"
- "dup v11.4s, v11.s[0]\n"
- "dup v14.4s, v13.s[3]\n"
- "dup v13.4s, v13.s[0]\n"
- "mul v11.4s, v11.4s, v0.4s\n"
- "mul v12.4s, v12.4s, v0.4s\n"
- "mul v14.4s, v14.4s, v0.4s\n"
- "mul v13.4s, v13.4s, v0.4s\n"
- "117:" // Height 4: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q4, [x10, #0x10]\n"
- "add v31.4s, v31.4s, v11.4s\n"
- "add v20.4s, v20.4s, v11.4s\n"
- "ldr q3, [x10, #0x20]\n"
- "ldr q2, [x10, #0x30]\n"
- "add v21.4s, v21.4s, v11.4s\n"
- "add v22.4s, v22.4s, v11.4s\n"
- "add v16.4s, v16.4s, v12.4s\n"
- "add v17.4s, v17.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "ld1r { v1.4s }, [x20]\n"
- "add v18.4s, v18.4s, v12.4s\n"
- "add v19.4s, v19.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add v23.4s, v23.4s, v13.4s\n"
- "add v28.4s, v28.4s, v13.4s\n"
- "add x10, x10, #0x40\n"
- "add v29.4s, v29.4s, v13.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v31.4s, v31.4s, v0.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v3.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v3.4s\n"
- "add v19.4s, v19.4s, v2.4s\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v3.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x20]\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v3.4s\n"
- "add v27.4s, v27.4s, v2.4s\n"
- "sqrdmulh v31.4s, v31.4s, v1.4s\n"
- "sqrdmulh v20.4s, v20.4s, v1.4s\n"
- "sqrdmulh v21.4s, v21.4s, v1.4s\n"
- "sqrdmulh v22.4s, v22.4s, v1.4s\n"
- "sqrdmulh v16.4s, v16.4s, v1.4s\n"
- "sqrdmulh v17.4s, v17.4s, v1.4s\n"
- "sqrdmulh v18.4s, v18.4s, v1.4s\n"
- "sqrdmulh v19.4s, v19.4s, v1.4s\n"
- "sqrdmulh v23.4s, v23.4s, v1.4s\n"
- "sqrdmulh v28.4s, v28.4s, v1.4s\n"
- "sqrdmulh v29.4s, v29.4s, v1.4s\n"
- "sqrdmulh v30.4s, v30.4s, v1.4s\n"
- "sqrdmulh v24.4s, v24.4s, v1.4s\n"
- "sqrdmulh v25.4s, v25.4s, v1.4s\n"
- "sqrdmulh v26.4s, v26.4s, v1.4s\n"
- "sqrdmulh v27.4s, v27.4s, v1.4s\n"
- "tbz %x[flags], #5, 118f\n"
- "and v2.16b, v31.16b, v0.16b\n"
- "and v1.16b, v20.16b, v0.16b\n"
- "and v7.16b, v21.16b, v0.16b\n"
- "and v6.16b, v22.16b, v0.16b\n"
- "and v5.16b, v16.16b, v0.16b\n"
- "and v4.16b, v17.16b, v0.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v3.16b, v18.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v31.4s, v31.4s, v2.4s\n"
- "sqadd v20.4s, v20.4s, v1.4s\n"
- "and v2.16b, v19.16b, v0.16b\n"
- "and v1.16b, v23.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v7.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v3.4s\n"
- "and v7.16b, v28.16b, v0.16b\n"
- "sqadd v19.4s, v19.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v1.4s\n"
- "and v6.16b, v29.16b, v0.16b\n"
- "and v5.16b, v30.16b, v0.16b\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "and v3.16b, v25.16b, v0.16b\n"
- "and v2.16b, v26.16b, v0.16b\n"
- "and v1.16b, v27.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v7.4s\n"
- "sqadd v29.4s, v29.4s, v6.4s\n"
- "sqadd v30.4s, v30.4s, v5.4s\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "sqadd v25.4s, v25.4s, v3.4s\n"
- "sqadd v26.4s, v26.4s, v2.4s\n"
- "sqadd v27.4s, v27.4s, v1.4s\n"
- "118:" // Height 4: no shift correction
- "add x21, %x[qp], %[c_offset]\n"
- "srshl v31.4s, v31.4s, v0.4s\n"
- "srshl v20.4s, v20.4s, v0.4s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1r { v3.4s }, [x21]\n"
- "ld1r { v2.4s }, [x20]\n"
- "srshl v21.4s, v21.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
- "srshl v17.4s, v17.4s, v0.4s\n"
- "add x20, %x[qp], %[minval]\n"
- "cmp x9, #0x10\n"
- "ld1r { v1.4s }, [x20]\n"
- "srshl v18.4s, v18.4s, v0.4s\n"
- "srshl v19.4s, v19.4s, v0.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "srshl v28.4s, v28.4s, v0.4s\n"
- "srshl v29.4s, v29.4s, v0.4s\n"
- "srshl v30.4s, v30.4s, v0.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
- "srshl v25.4s, v25.4s, v0.4s\n"
- "srshl v26.4s, v26.4s, v0.4s\n"
- "srshl v27.4s, v27.4s, v0.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "add v20.4s, v20.4s, v3.4s\n"
- "add v21.4s, v21.4s, v3.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
- "add v16.4s, v16.4s, v3.4s\n"
- "add v17.4s, v17.4s, v3.4s\n"
- "add v18.4s, v18.4s, v3.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v28.4s, v28.4s, v3.4s\n"
- "add v29.4s, v29.4s, v3.4s\n"
- "add v30.4s, v30.4s, v3.4s\n"
- "add v24.4s, v24.4s, v3.4s\n"
- "add v25.4s, v25.4s, v3.4s\n"
- "add v26.4s, v26.4s, v3.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "smin v31.4s, v31.4s, v2.4s\n"
- "smin v20.4s, v20.4s, v2.4s\n"
- "smin v21.4s, v21.4s, v2.4s\n"
- "smin v22.4s, v22.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v2.4s\n"
- "smin v17.4s, v17.4s, v2.4s\n"
- "smin v18.4s, v18.4s, v2.4s\n"
- "smin v19.4s, v19.4s, v2.4s\n"
- "smin v23.4s, v23.4s, v2.4s\n"
- "smin v28.4s, v28.4s, v2.4s\n"
- "smin v29.4s, v29.4s, v2.4s\n"
- "smin v30.4s, v30.4s, v2.4s\n"
- "smin v24.4s, v24.4s, v2.4s\n"
- "smin v25.4s, v25.4s, v2.4s\n"
- "smin v26.4s, v26.4s, v2.4s\n"
- "smin v27.4s, v27.4s, v2.4s\n"
- "smax v31.4s, v31.4s, v1.4s\n"
- "smax v20.4s, v20.4s, v1.4s\n"
- "smax v21.4s, v21.4s, v1.4s\n"
- "smax v22.4s, v22.4s, v1.4s\n"
- "smax v16.4s, v16.4s, v1.4s\n"
- "smax v17.4s, v17.4s, v1.4s\n"
- "smax v18.4s, v18.4s, v1.4s\n"
- "smax v19.4s, v19.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v1.4s\n"
- "smax v28.4s, v28.4s, v1.4s\n"
- "smax v29.4s, v29.4s, v1.4s\n"
- "smax v30.4s, v30.4s, v1.4s\n"
- "smax v24.4s, v24.4s, v1.4s\n"
- "smax v25.4s, v25.4s, v1.4s\n"
- "smax v26.4s, v26.4s, v1.4s\n"
- "smax v27.4s, v27.4s, v1.4s\n"
- "uzp1 v31.8h, v31.8h, v20.8h\n"
- "uzp1 v20.8h, v21.8h, v22.8h\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v19.8h, v18.8h, v19.8h\n"
- "uzp1 v23.8h, v23.8h, v28.8h\n"
- "uzp1 v18.8h, v29.8h, v30.8h\n"
- "uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v17.8h, v26.8h, v27.8h\n"
- "uzp1 v31.16b, v31.16b, v20.16b\n"
- "uzp1 v16.16b, v16.16b, v19.16b\n"
- "uzp1 v23.16b, v23.16b, v18.16b\n"
- "uzp1 v24.16b, v24.16b, v17.16b\n"
- "bge 127f\n"
- "tbz x9, #3, 122f\n"
- "str d31, [x27], #0x8\n"
- "str d16, [x24], #0x8\n"
- "str d23, [x23], #0x8\n"
- "str d24, [x22], #0x8\n"
- "tbz x9, #2, 120f\n"
- "st1 { v31.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x24], #0x4\n"
- "st1 { v23.s }[2], [x23], #0x4\n"
- "st1 { v24.s }[2], [x22], #0x4\n"
- "tbz x9, #1, 119f\n"
- "st1 { v31.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x24], #0x2\n"
- "st1 { v23.h }[6], [x23], #0x2\n"
- "st1 { v24.h }[6], [x22], #0x2\n"
- "tbz x9, #0, 126f\n"
- "st1 { v31.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x24]\n"
- "st1 { v23.b }[14], [x23]\n"
- "st1 { v24.b }[14], [x22]\n"
- "b 126f\n"
- "119:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x9, #0, 126f\n"
- "st1 { v31.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x24]\n"
- "st1 { v23.b }[12], [x23]\n"
- "st1 { v24.b }[12], [x22]\n"
- "b 126f\n"
- "120:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x9, #1, 121f\n"
- "st1 { v31.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x24], #0x2\n"
- "st1 { v23.h }[4], [x23], #0x2\n"
- "st1 { v24.h }[4], [x22], #0x2\n"
- "tbz x9, #0, 126f\n"
- "st1 { v31.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x24]\n"
- "st1 { v23.b }[10], [x23]\n"
- "st1 { v24.b }[10], [x22]\n"
- "b 126f\n"
- "121:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x9, #0, 126f\n"
- "st1 { v31.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x24]\n"
- "st1 { v23.b }[8], [x23]\n"
- "st1 { v24.b }[8], [x22]\n"
- "b 126f\n"
- "122:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x9, #2, 124f\n"
- "str s31, [x27], #0x4\n"
- "str s16, [x24], #0x4\n"
- "str s23, [x23], #0x4\n"
- "str s24, [x22], #0x4\n"
- "tbz x9, #1, 123f\n"
- "st1 { v31.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x24], #0x2\n"
- "st1 { v23.h }[2], [x23], #0x2\n"
- "st1 { v24.h }[2], [x22], #0x2\n"
- "tbz x9, #0, 126f\n"
- "st1 { v31.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x24]\n"
- "st1 { v23.b }[6], [x23]\n"
- "st1 { v24.b }[6], [x22]\n"
- "b 126f\n"
- "123:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x9, #0, 126f\n"
- "st1 { v31.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x24]\n"
- "st1 { v23.b }[4], [x23]\n"
- "st1 { v24.b }[4], [x22]\n"
- "b 126f\n"
- "124:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x9, #1, 125f\n"
- "str h31, [x27], #0x2\n"
- "str h16, [x24], #0x2\n"
- "str h23, [x23], #0x2\n"
- "str h24, [x22], #0x2\n"
- "tbz x9, #0, 126f\n"
- "st1 { v31.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x24]\n"
- "st1 { v23.b }[2], [x23]\n"
- "st1 { v24.b }[2], [x22]\n"
- "b 126f\n"
- "125:" // Height 4: Partial direct writeback: partial_1_0
- "str b31, [x27, #0x0]\n"
- "str b16, [x24, #0x0]\n"
- "str b23, [x23, #0x0]\n"
- "str b24, [x22, #0x0]\n"
- "126:" // Height 4: Partial direct writeback: Done
- "b 128f\n"
- "127:" // Height 4: Full writeback
- "str q31, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "str q16, [x24, #0x0]\n"
- "str q23, [x23, #0x0]\n"
- "str q24, [x22, #0x0]\n"
- "128:" // Height 4: Writeback done
- "subs x9, x9, #0x10\n"
- "bgt 98b\n"
- "subs %x[M], %x[M], #0x4\n"
- "beq 130f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 129f\n"
- "add x21, x21, #0x4\n"
- "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "b 1b\n"
- "129:" // Update direct input
- "mov x20, #0x4\n"
- "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
- "b 1b\n"
- "130:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_dot_6x16.hpp
deleted file mode 100644
index 4f963124cb..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_dot_6x16.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#ifdef __aarch64__
-
-#include "../std_transforms_fixed.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- unsigned int, const unsigned int *, \
- IndirectInputArg<uint8_t>, \
- size_t, size_t, \
- const int8_t *, \
- IndirectOutputArg<int32_t>, \
- const int32_t *, Activation, bool
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void a64_hybrid_u8s8s32_dot_6x16( ARGLIST );
-
-class cls_a64_hybrid_u8s8s32_dot_6x16
-{
-public:
- typedef uint8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
- typedef int32_t result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 6;
- }
-
- static unsigned int out_width()
- {
- return 16;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate()
- {
- return true;
- }
-
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 4> transforms = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
- if (std::is_same<T, uint32_t>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 31.63 };
- case CPUModel::A510:
- return { 15.89 };
- case CPUModel::V1:
- return { 53.87 };
- case CPUModel::A55r1:
- return { 9.217 };
- }
- }
-
- if (std::is_same<T, uint8_t>::value) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 9.5238, 2.0799, 0.2279 };
- default:
- return { 29.6736, 11.4025, 0.5591 };
- case CPUModel::A510:
- return { 16.65, 3.92, 0.48 };
- case CPUModel::V1:
- return { 42.62, 16.32, 0.83 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=a64_hybrid_u8s8s32_dot_6x16;
- cls_a64_hybrid_u8s8s32_dot_6x16(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_dot_6x16/generic.cpp
deleted file mode 100644
index 074a9585d8..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_dot_6x16/generic.cpp
+++ /dev/null
@@ -1,3264 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-
-namespace arm_gemm {
-
-void a64_hybrid_u8s8s32_dot_6x16 (
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
- size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
- const int32_t *, Activation, bool accumulate
-)
-{
- struct KernelArgs {
- unsigned int num_strings = {};
- const unsigned int *string_lengths = {};
- size_t N = {};
- const int8_t *B_ptr = {};
- size_t output_offset = {};
- size_t input_initial_col = {};
- size_t input_offset = {};
- void *output_ptr = {};
- } ka;
-
- unsigned long flags=0;
- void *input_ptr;
-
- if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
- ka.output_offset=output_arg.indirect.offset;
- flags |= 0x4;
- } else {
- ka.output_ptr=(void *)(output_arg.direct.base);
- ka.output_offset=output_arg.direct.stride;
- }
-
- if (A_arg.is_indirect) {
- input_ptr=(void *)(A_arg.indirect.ptr);
- ka.input_offset=A_arg.indirect.start_row;
- ka.input_initial_col=A_arg.indirect.start_col;
- flags |= 0x8;
- } else {
- assert(num_strings==1);
- input_ptr=(void *)(A_arg.direct.base);
- ka.input_offset=A_arg.direct.stride;
- }
- if (accumulate) {
- flags |= 0x1;
- }
- ka.num_strings = num_strings;
- ka.string_lengths = string_lengths;
- ka.N = N;
- ka.B_ptr = B_ptr;
- __asm__ __volatile__(
- "1:" // Row loop
- "cmp %x[M], #0x6\n"
- "bge 171f\n"
- "cmp %x[M], #0x4\n"
- "bgt 137f\n"
- "beq 103f\n"
- "cmp %x[M], #0x2\n"
- "bgt 69f\n"
- "beq 35f\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "2:" // Height 1: Column loop
- "tbz %x[flags], #0, 12f\n"
- "cmp x11, #0x10\n"
- "bge 11f\n"
- "tbz x11, #3, 6f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "tbz x11, #2, 4f\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "tbz x11, #1, 3f\n"
- "ldr d11, [x9], #0x8\n"
- "mov x25, #0x38\n"
- "tbz x11, #0, 10f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "b 10f\n"
- "3:" // Height 1: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 10f\n"
- "ldr s11, [x9, #0x0]\n"
- "b 10f\n"
- "4:" // Height 1: Partial accumulate: partial_2_8
- "tbz x11, #1, 5f\n"
- "ldr d10, [x9], #0x8\n"
- "mov x25, #0x28\n"
- "tbz x11, #0, 10f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "b 10f\n"
- "5:" // Height 1: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 10f\n"
- "ldr s10, [x9, #0x0]\n"
- "b 10f\n"
- "6:" // Height 1: Partial accumulate: partial_4_0
- "tbz x11, #2, 8f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "tbz x11, #1, 7f\n"
- "ldr d9, [x9], #0x8\n"
- "mov x25, #0x18\n"
- "tbz x11, #0, 10f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "b 10f\n"
- "7:" // Height 1: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 10f\n"
- "ldr s9, [x9, #0x0]\n"
- "b 10f\n"
- "8:" // Height 1: Partial accumulate: partial_2_0
- "tbz x11, #1, 9f\n"
- "ldr d8, [x9], #0x8\n"
- "mov x25, #0x8\n"
- "tbz x11, #0, 10f\n"
- "ld1 { v8.s }[2], [x9]\n"
- "b 10f\n"
- "9:" // Height 1: Partial accumulate: partial_1_0
- "ldr s8, [x9, #0x0]\n"
- "mov x25, #0x0\n"
- "10:" // Height 1: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 13f\n"
- "11:" // Height 1: full accumulate
- "ldr q8, [x9, #0x0]\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
- "b 13f\n"
- "12:" // Height 1: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "13:" // Height 1: setup done
- "mov x28, #0x0\n"
- "14:" // Height 1: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 15f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "cbnz x28, 16f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "b 16f\n"
- "15:" // Height 1: setup direct input
- "mov x26, %x[input_ptr]\n"
- "16:" // Height 1: input setup done
- "cmp x27, #0x10\n"
- "blt 19f\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q7, [x10, #0x10]\n"
- "blt 18f\n"
- "17:" // Height 1: Multiply loop: Main loop head
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f00f22a // sudot v10.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x40]\n"
- ".inst 0x4f00f20b // sudot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x10, #0x50]\n"
- ".inst 0x4f20f228 // sudot v8.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x10, #0x60]\n"
- ".inst 0x4f20f209 // sudot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x10, #0x70]\n"
- ".inst 0x4f20f22a // sudot v10.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x10, #0x80]\n"
- ".inst 0x4f20f20b // sudot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x10, #0x90]\n"
- ".inst 0x4f00fa28 // sudot v8.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x10, #0xa0]\n"
- ".inst 0x4f00fa09 // sudot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x10, #0xb0]\n"
- ".inst 0x4f00fa2a // sudot v10.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x10, #0xc0]\n"
- ".inst 0x4f00fa0b // sudot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x10, #0xd0]\n"
- ".inst 0x4f20fa28 // sudot v8.4s, v17.16b, v0.4b[3]\n"
- "ldr q17, [x10, #0xe0]\n"
- ".inst 0x4f20fa09 // sudot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20fa2a // sudot v10.4s, v17.16b, v0.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f20fa0b // sudot v11.4s, v16.16b, v0.4b[3]\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "bge 17b\n"
- "18:" // Height 1: Multiply loop: Single iteration only
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f00f22a // sudot v10.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x40]\n"
- ".inst 0x4f00f20b // sudot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x10, #0x50]\n"
- ".inst 0x4f20f228 // sudot v8.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x10, #0x60]\n"
- ".inst 0x4f20f209 // sudot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x10, #0x70]\n"
- ".inst 0x4f20f22a // sudot v10.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x10, #0x80]\n"
- ".inst 0x4f20f20b // sudot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x10, #0x90]\n"
- ".inst 0x4f00fa28 // sudot v8.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x10, #0xa0]\n"
- ".inst 0x4f00fa09 // sudot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x10, #0xb0]\n"
- ".inst 0x4f00fa2a // sudot v10.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x10, #0xc0]\n"
- ".inst 0x4f00fa0b // sudot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x10, #0xd0]\n"
- ".inst 0x4f20fa28 // sudot v8.4s, v17.16b, v0.4b[3]\n"
- "ldr q17, [x10, #0xe0]\n"
- ".inst 0x4f20fa09 // sudot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20fa2a // sudot v10.4s, v17.16b, v0.4b[3]\n"
- ".inst 0x4f20fa0b // sudot v11.4s, v16.16b, v0.4b[3]\n"
- "19:" // Height 1: Multiply loop: Main loop skip
- "cbz x27, 24f\n"
- "cmp x27, #0x4\n"
- "blt 21f\n"
- "20:" // Height 1: Multiply loop: Odd block loop
- "ldr s18, [x26], #0x4\n"
- "ldr q17, [x10, #0x0]\n"
- "sub x27, x27, #0x4\n"
- "ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x4\n"
- ".inst 0x4f12f228 // sudot v8.4s, v17.16b, v18.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
- ".inst 0x4f12f209 // sudot v9.4s, v16.16b, v18.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f12f22a // sudot v10.4s, v17.16b, v18.4b[0]\n"
- ".inst 0x4f12f20b // sudot v11.4s, v16.16b, v18.4b[0]\n"
- "bge 20b\n"
- "21:" // Height 1: Multiply loop: Skip odd blocks
- "cbz x27, 24f\n"
- "tbz x27, #1, 22f\n"
- "ldr h0, [x26], #0x2\n"
- "tbz x27, #0, 23f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "b 23f\n"
- "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q17, [x10, #0x0]\n"
- "ldr q16, [x10, #0x10]\n"
- ".inst 0x4f00f228 // sudot v8.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
- ".inst 0x4f00f209 // sudot v9.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f00f22a // sudot v10.4s, v17.16b, v0.4b[0]\n"
- ".inst 0x4f00f20b // sudot v11.4s, v16.16b, v0.4b[0]\n"
- "24:" // Height 1: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 14b\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "bge 33f\n"
- "tbz x11, #3, 28f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v9.4s }, [x9], #0x10\n"
- "tbz x11, #2, 26f\n"
- "st1 { v10.4s }, [x9], #0x10\n"
- "tbz x11, #1, 25f\n"
- "str d11, [x9], #0x8\n"
- "tbz x11, #0, 32f\n"
- "st1 { v11.s }[2], [x9]\n"
- "b 32f\n"
- "25:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x11, #0, 32f\n"
- "str s11, [x9, #0x0]\n"
- "b 32f\n"
- "26:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x11, #1, 27f\n"
- "str d10, [x9], #0x8\n"
- "tbz x11, #0, 32f\n"
- "st1 { v10.s }[2], [x9]\n"
- "b 32f\n"
- "27:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x11, #0, 32f\n"
- "str s10, [x9, #0x0]\n"
- "b 32f\n"
- "28:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x11, #2, 30f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "tbz x11, #1, 29f\n"
- "str d9, [x9], #0x8\n"
- "tbz x11, #0, 32f\n"
- "st1 { v9.s }[2], [x9]\n"
- "b 32f\n"
- "29:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x11, #0, 32f\n"
- "str s9, [x9, #0x0]\n"
- "b 32f\n"
- "30:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x11, #1, 31f\n"
- "str d8, [x9], #0x8\n"
- "tbz x11, #0, 32f\n"
- "st1 { v8.s }[2], [x9]\n"
- "b 32f\n"
- "31:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x9, #0x0]\n"
- "32:" // Height 1: Partial direct writeback: Done
- "b 34f\n"
- "33:" // Height 1: Full writeback
- "str q8, [x9, #0x0]\n"
- "str q9, [x9, #0x10]\n"
- "str q10, [x9, #0x20]\n"
- "str q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "34:" // Height 1: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 2b\n"
- "b 206f\n"
- "35:" // Height 2
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "36:" // Height 2: Column loop
- "tbz %x[flags], #0, 46f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x24, x9, x20, LSL #2\n"
- "bge 45f\n"
- "tbz x11, #3, 40f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "tbz x11, #2, 38f\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "tbz x11, #1, 37f\n"
- "ldr d11, [x9], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "mov x25, #0x38\n"
- "tbz x11, #0, 44f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "b 44f\n"
- "37:" // Height 2: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 44f\n"
- "ldr s11, [x9, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "b 44f\n"
- "38:" // Height 2: Partial accumulate: partial_2_8
- "tbz x11, #1, 39f\n"
- "ldr d10, [x9], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x25, #0x28\n"
- "tbz x11, #0, 44f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "b 44f\n"
- "39:" // Height 2: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 44f\n"
- "ldr s10, [x9, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "b 44f\n"
- "40:" // Height 2: Partial accumulate: partial_4_0
- "tbz x11, #2, 42f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "tbz x11, #1, 41f\n"
- "ldr d9, [x9], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "mov x25, #0x18\n"
- "tbz x11, #0, 44f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "b 44f\n"
- "41:" // Height 2: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 44f\n"
- "ldr s9, [x9, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "b 44f\n"
- "42:" // Height 2: Partial accumulate: partial_2_0
- "tbz x11, #1, 43f\n"
- "ldr d8, [x9], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x25, #0x8\n"
- "tbz x11, #0, 44f\n"
- "ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "b 44f\n"
- "43:" // Height 2: Partial accumulate: partial_1_0
- "ldr s8, [x9, #0x0]\n"
- "ldr s12, [x24, #0x0]\n"
- "mov x25, #0x0\n"
- "44:" // Height 2: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 47f\n"
- "45:" // Height 2: full accumulate
- "ldr q8, [x9, #0x0]\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "b 47f\n"
- "46:" // Height 2: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "47:" // Height 2: setup done
- "mov x28, #0x0\n"
- "48:" // Height 2: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 49f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "cbnz x28, 50f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "b 50f\n"
- "49:" // Height 2: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "50:" // Height 2: input setup done
- "cmp x27, #0x10\n"
- "blt 53f\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x25, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "blt 52f\n"
- "51:" // Height 2: Multiply loop: Main loop head
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cc // sudot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
- "sub x27, x27, #0x10\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ed // sudot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f00f22a // sudot v10.4s, v17.16b, v0.4b[0]\n"
- ".inst 0x4f01f22e // sudot v14.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x10, #0x40]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f00f20b // sudot v11.4s, v16.16b, v0.4b[0]\n"
- ".inst 0x4f01f20f // sudot v15.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x10, #0x50]\n"
- ".inst 0x4f20f228 // sudot v8.4s, v17.16b, v0.4b[1]\n"
- ".inst 0x4f21f22c // sudot v12.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x10, #0x60]\n"
- ".inst 0x4f20f209 // sudot v9.4s, v16.16b, v0.4b[1]\n"
- ".inst 0x4f21f20d // sudot v13.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x10, #0x70]\n"
- ".inst 0x4f20f22a // sudot v10.4s, v17.16b, v0.4b[1]\n"
- ".inst 0x4f21f22e // sudot v14.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x10, #0x80]\n"
- ".inst 0x4f20f20b // sudot v11.4s, v16.16b, v0.4b[1]\n"
- ".inst 0x4f21f20f // sudot v15.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x10, #0x90]\n"
- ".inst 0x4f00fa28 // sudot v8.4s, v17.16b, v0.4b[2]\n"
- ".inst 0x4f01fa2c // sudot v12.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x10, #0xa0]\n"
- ".inst 0x4f00fa09 // sudot v9.4s, v16.16b, v0.4b[2]\n"
- ".inst 0x4f01fa0d // sudot v13.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x10, #0xb0]\n"
- ".inst 0x4f00fa2a // sudot v10.4s, v17.16b, v0.4b[2]\n"
- ".inst 0x4f01fa2e // sudot v14.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x10, #0xc0]\n"
- ".inst 0x4f00fa0b // sudot v11.4s, v16.16b, v0.4b[2]\n"
- ".inst 0x4f01fa0f // sudot v15.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x10, #0xd0]\n"
- ".inst 0x4f20fa28 // sudot v8.4s, v17.16b, v0.4b[3]\n"
- ".inst 0x4f21fa2c // sudot v12.4s, v17.16b, v1.4b[3]\n"
- "ldr q17, [x10, #0xe0]\n"
- ".inst 0x4f20fa09 // sudot v9.4s, v16.16b, v0.4b[3]\n"
- ".inst 0x4f21fa0d // sudot v13.4s, v16.16b, v1.4b[3]\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20fa2a // sudot v10.4s, v17.16b, v0.4b[3]\n"
- ".inst 0x4f21fa2e // sudot v14.4s, v17.16b, v1.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f20fa0b // sudot v11.4s, v16.16b, v0.4b[3]\n"
- "ldr q0, [x26, #0x0]\n"
- ".inst 0x4f21fa0f // sudot v15.4s, v16.16b, v1.4b[3]\n"
- "ldr q1, [x25, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "bge 51b\n"
- "52:" // Height 2: Multiply loop: Single iteration only
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cc // sudot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ed // sudot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x25, x25, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f00f22a // sudot v10.4s, v17.16b, v0.4b[0]\n"
- ".inst 0x4f01f22e // sudot v14.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x10, #0x40]\n"
- ".inst 0x4f00f20b // sudot v11.4s, v16.16b, v0.4b[0]\n"
- ".inst 0x4f01f20f // sudot v15.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x10, #0x50]\n"
- ".inst 0x4f20f228 // sudot v8.4s, v17.16b, v0.4b[1]\n"
- ".inst 0x4f21f22c // sudot v12.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x10, #0x60]\n"
- ".inst 0x4f20f209 // sudot v9.4s, v16.16b, v0.4b[1]\n"
- ".inst 0x4f21f20d // sudot v13.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x10, #0x70]\n"
- ".inst 0x4f20f22a // sudot v10.4s, v17.16b, v0.4b[1]\n"
- ".inst 0x4f21f22e // sudot v14.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x10, #0x80]\n"
- ".inst 0x4f20f20b // sudot v11.4s, v16.16b, v0.4b[1]\n"
- ".inst 0x4f21f20f // sudot v15.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x10, #0x90]\n"
- ".inst 0x4f00fa28 // sudot v8.4s, v17.16b, v0.4b[2]\n"
- ".inst 0x4f01fa2c // sudot v12.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x10, #0xa0]\n"
- ".inst 0x4f00fa09 // sudot v9.4s, v16.16b, v0.4b[2]\n"
- ".inst 0x4f01fa0d // sudot v13.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x10, #0xb0]\n"
- ".inst 0x4f00fa2a // sudot v10.4s, v17.16b, v0.4b[2]\n"
- ".inst 0x4f01fa2e // sudot v14.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x10, #0xc0]\n"
- ".inst 0x4f00fa0b // sudot v11.4s, v16.16b, v0.4b[2]\n"
- ".inst 0x4f01fa0f // sudot v15.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x10, #0xd0]\n"
- ".inst 0x4f20fa28 // sudot v8.4s, v17.16b, v0.4b[3]\n"
- ".inst 0x4f21fa2c // sudot v12.4s, v17.16b, v1.4b[3]\n"
- "ldr q17, [x10, #0xe0]\n"
- ".inst 0x4f20fa09 // sudot v9.4s, v16.16b, v0.4b[3]\n"
- ".inst 0x4f21fa0d // sudot v13.4s, v16.16b, v1.4b[3]\n"
- "ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20fa2a // sudot v10.4s, v17.16b, v0.4b[3]\n"
- ".inst 0x4f21fa2e // sudot v14.4s, v17.16b, v1.4b[3]\n"
- ".inst 0x4f20fa0b // sudot v11.4s, v16.16b, v0.4b[3]\n"
- ".inst 0x4f21fa0f // sudot v15.4s, v16.16b, v1.4b[3]\n"
- "53:" // Height 2: Multiply loop: Main loop skip
- "cbz x27, 58f\n"
- "cmp x27, #0x4\n"
- "blt 55f\n"
- "54:" // Height 2: Multiply loop: Odd block loop
- "ldr s19, [x26], #0x4\n"
- "ldr s18, [x25], #0x4\n"
- "sub x27, x27, #0x4\n"
- "ldr q17, [x10, #0x0]\n"
- "ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x4\n"
- ".inst 0x4f13f228 // sudot v8.4s, v17.16b, v19.4b[0]\n"
- ".inst 0x4f12f22c // sudot v12.4s, v17.16b, v18.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
- ".inst 0x4f13f209 // sudot v9.4s, v16.16b, v19.4b[0]\n"
- ".inst 0x4f12f20d // sudot v13.4s, v16.16b, v18.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f13f22a // sudot v10.4s, v17.16b, v19.4b[0]\n"
- ".inst 0x4f12f22e // sudot v14.4s, v17.16b, v18.4b[0]\n"
- ".inst 0x4f13f20b // sudot v11.4s, v16.16b, v19.4b[0]\n"
- ".inst 0x4f12f20f // sudot v15.4s, v16.16b, v18.4b[0]\n"
- "bge 54b\n"
- "55:" // Height 2: Multiply loop: Skip odd blocks
- "cbz x27, 58f\n"
- "tbz x27, #1, 56f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
- "tbz x27, #0, 57f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x25]\n"
- "b 57f\n"
- "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x25, #0x0]\n"
- "57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q17, [x10, #0x0]\n"
- "ldr q16, [x10, #0x10]\n"
- ".inst 0x4f00f228 // sudot v8.4s, v17.16b, v0.4b[0]\n"
- ".inst 0x4f01f22c // sudot v12.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
- ".inst 0x4f00f209 // sudot v9.4s, v16.16b, v0.4b[0]\n"
- ".inst 0x4f01f20d // sudot v13.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f00f22a // sudot v10.4s, v17.16b, v0.4b[0]\n"
- ".inst 0x4f01f22e // sudot v14.4s, v17.16b, v1.4b[0]\n"
- ".inst 0x4f00f20b // sudot v11.4s, v16.16b, v0.4b[0]\n"
- ".inst 0x4f01f20f // sudot v15.4s, v16.16b, v1.4b[0]\n"
- "58:" // Height 2: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 48b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "bge 67f\n"
- "tbz x11, #3, 62f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "tbz x11, #2, 60f\n"
- "st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "tbz x11, #1, 59f\n"
- "str d11, [x9], #0x8\n"
- "str d15, [x24], #0x8\n"
- "tbz x11, #0, 66f\n"
- "st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x24]\n"
- "b 66f\n"
- "59:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x11, #0, 66f\n"
- "str s11, [x9, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "b 66f\n"
- "60:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x11, #1, 61f\n"
- "str d10, [x9], #0x8\n"
- "str d14, [x24], #0x8\n"
- "tbz x11, #0, 66f\n"
- "st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x24]\n"
- "b 66f\n"
- "61:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x11, #0, 66f\n"
- "str s10, [x9, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "b 66f\n"
- "62:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x11, #2, 64f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "tbz x11, #1, 63f\n"
- "str d9, [x9], #0x8\n"
- "str d13, [x24], #0x8\n"
- "tbz x11, #0, 66f\n"
- "st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x24]\n"
- "b 66f\n"
- "63:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x11, #0, 66f\n"
- "str s9, [x9, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "b 66f\n"
- "64:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x11, #1, 65f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x24], #0x8\n"
- "tbz x11, #0, 66f\n"
- "st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x24]\n"
- "b 66f\n"
- "65:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x9, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "66:" // Height 2: Partial direct writeback: Done
- "b 68f\n"
- "67:" // Height 2: Full writeback
- "str q8, [x9, #0x0]\n"
- "str q9, [x9, #0x10]\n"
- "str q10, [x9, #0x20]\n"
- "str q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "68:" // Height 2: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 36b\n"
- "b 206f\n"
- "69:" // Height 3
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "70:" // Height 3: Column loop
- "tbz %x[flags], #0, 80f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "bge 79f\n"
- "tbz x11, #3, 74f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "tbz x11, #2, 72f\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "tbz x11, #1, 71f\n"
- "ldr d11, [x9], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "mov x25, #0x38\n"
- "ldr d19, [x23], #0x8\n"
- "tbz x11, #0, 78f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "b 78f\n"
- "71:" // Height 3: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 78f\n"
- "ldr s11, [x9, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "b 78f\n"
- "72:" // Height 3: Partial accumulate: partial_2_8
- "tbz x11, #1, 73f\n"
- "ldr d10, [x9], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x25, #0x28\n"
- "ldr d18, [x23], #0x8\n"
- "tbz x11, #0, 78f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "b 78f\n"
- "73:" // Height 3: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 78f\n"
- "ldr s10, [x9, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "b 78f\n"
- "74:" // Height 3: Partial accumulate: partial_4_0
- "tbz x11, #2, 76f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "tbz x11, #1, 75f\n"
- "ldr d9, [x9], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "mov x25, #0x18\n"
- "ldr d17, [x23], #0x8\n"
- "tbz x11, #0, 78f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "b 78f\n"
- "75:" // Height 3: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 78f\n"
- "ldr s9, [x9, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "b 78f\n"
- "76:" // Height 3: Partial accumulate: partial_2_0
- "tbz x11, #1, 77f\n"
- "ldr d8, [x9], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x25, #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "tbz x11, #0, 78f\n"
- "ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
- "b 78f\n"
- "77:" // Height 3: Partial accumulate: partial_1_0
- "ldr s8, [x9, #0x0]\n"
- "ldr s12, [x24, #0x0]\n"
- "mov x25, #0x0\n"
- "ldr s16, [x23, #0x0]\n"
- "78:" // Height 3: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 81f\n"
- "79:" // Height 3: full accumulate
- "ldr q8, [x9, #0x0]\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "b 81f\n"
- "80:" // Height 3: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "81:" // Height 3: setup done
- "mov x28, #0x0\n"
- "82:" // Height 3: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 83f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "cbnz x28, 84f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "b 84f\n"
- "83:" // Height 3: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "84:" // Height 3: input setup done
- "cmp x27, #0x10\n"
- "blt 87f\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x25, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q2, [x24, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "blt 86f\n"
- "85:" // Height 3: Multiply loop: Main loop head
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cc // sudot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f02f0d0 // sudot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q21, [x10, #0x20]\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f01f0ed // sudot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0f1 // sudot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q20, [x10, #0x30]\n"
- "add x24, x24, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f00f2aa // sudot v10.4s, v21.16b, v0.4b[0]\n"
- ".inst 0x4f01f2ae // sudot v14.4s, v21.16b, v1.4b[0]\n"
- ".inst 0x4f02f2b2 // sudot v18.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f00f28b // sudot v11.4s, v20.16b, v0.4b[0]\n"
- ".inst 0x4f01f28f // sudot v15.4s, v20.16b, v1.4b[0]\n"
- ".inst 0x4f02f293 // sudot v19.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x10, #0x50]\n"
- ".inst 0x4f20f2a8 // sudot v8.4s, v21.16b, v0.4b[1]\n"
- ".inst 0x4f21f2ac // sudot v12.4s, v21.16b, v1.4b[1]\n"
- ".inst 0x4f22f2b0 // sudot v16.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x10, #0x60]\n"
- ".inst 0x4f20f289 // sudot v9.4s, v20.16b, v0.4b[1]\n"
- ".inst 0x4f21f28d // sudot v13.4s, v20.16b, v1.4b[1]\n"
- ".inst 0x4f22f291 // sudot v17.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x10, #0x70]\n"
- ".inst 0x4f20f2aa // sudot v10.4s, v21.16b, v0.4b[1]\n"
- ".inst 0x4f21f2ae // sudot v14.4s, v21.16b, v1.4b[1]\n"
- ".inst 0x4f22f2b2 // sudot v18.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x10, #0x80]\n"
- ".inst 0x4f20f28b // sudot v11.4s, v20.16b, v0.4b[1]\n"
- ".inst 0x4f21f28f // sudot v15.4s, v20.16b, v1.4b[1]\n"
- ".inst 0x4f22f293 // sudot v19.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x10, #0x90]\n"
- ".inst 0x4f00faa8 // sudot v8.4s, v21.16b, v0.4b[2]\n"
- ".inst 0x4f01faac // sudot v12.4s, v21.16b, v1.4b[2]\n"
- ".inst 0x4f02fab0 // sudot v16.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x10, #0xa0]\n"
- ".inst 0x4f00fa89 // sudot v9.4s, v20.16b, v0.4b[2]\n"
- ".inst 0x4f01fa8d // sudot v13.4s, v20.16b, v1.4b[2]\n"
- ".inst 0x4f02fa91 // sudot v17.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x10, #0xb0]\n"
- ".inst 0x4f00faaa // sudot v10.4s, v21.16b, v0.4b[2]\n"
- ".inst 0x4f01faae // sudot v14.4s, v21.16b, v1.4b[2]\n"
- ".inst 0x4f02fab2 // sudot v18.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x10, #0xc0]\n"
- ".inst 0x4f00fa8b // sudot v11.4s, v20.16b, v0.4b[2]\n"
- ".inst 0x4f01fa8f // sudot v15.4s, v20.16b, v1.4b[2]\n"
- ".inst 0x4f02fa93 // sudot v19.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x10, #0xd0]\n"
- ".inst 0x4f20faa8 // sudot v8.4s, v21.16b, v0.4b[3]\n"
- ".inst 0x4f21faac // sudot v12.4s, v21.16b, v1.4b[3]\n"
- ".inst 0x4f22fab0 // sudot v16.4s, v21.16b, v2.4b[3]\n"
- "ldr q21, [x10, #0xe0]\n"
- ".inst 0x4f20fa89 // sudot v9.4s, v20.16b, v0.4b[3]\n"
- ".inst 0x4f21fa8d // sudot v13.4s, v20.16b, v1.4b[3]\n"
- ".inst 0x4f22fa91 // sudot v17.4s, v20.16b, v2.4b[3]\n"
- "ldr q20, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20faaa // sudot v10.4s, v21.16b, v0.4b[3]\n"
- ".inst 0x4f21faae // sudot v14.4s, v21.16b, v1.4b[3]\n"
- ".inst 0x4f22fab2 // sudot v18.4s, v21.16b, v2.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f20fa8b // sudot v11.4s, v20.16b, v0.4b[3]\n"
- "ldr q0, [x26, #0x0]\n"
- ".inst 0x4f21fa8f // sudot v15.4s, v20.16b, v1.4b[3]\n"
- "ldr q1, [x25, #0x0]\n"
- ".inst 0x4f22fa93 // sudot v19.4s, v20.16b, v2.4b[3]\n"
- "ldr q2, [x24, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "bge 85b\n"
- "86:" // Height 3: Multiply loop: Single iteration only
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cc // sudot v12.4s, v6.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f02f0d0 // sudot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q21, [x10, #0x20]\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f01f0ed // sudot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0f1 // sudot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q20, [x10, #0x30]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f00f2aa // sudot v10.4s, v21.16b, v0.4b[0]\n"
- ".inst 0x4f01f2ae // sudot v14.4s, v21.16b, v1.4b[0]\n"
- ".inst 0x4f02f2b2 // sudot v18.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f00f28b // sudot v11.4s, v20.16b, v0.4b[0]\n"
- ".inst 0x4f01f28f // sudot v15.4s, v20.16b, v1.4b[0]\n"
- ".inst 0x4f02f293 // sudot v19.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x10, #0x50]\n"
- ".inst 0x4f20f2a8 // sudot v8.4s, v21.16b, v0.4b[1]\n"
- ".inst 0x4f21f2ac // sudot v12.4s, v21.16b, v1.4b[1]\n"
- ".inst 0x4f22f2b0 // sudot v16.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x10, #0x60]\n"
- ".inst 0x4f20f289 // sudot v9.4s, v20.16b, v0.4b[1]\n"
- ".inst 0x4f21f28d // sudot v13.4s, v20.16b, v1.4b[1]\n"
- ".inst 0x4f22f291 // sudot v17.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x10, #0x70]\n"
- ".inst 0x4f20f2aa // sudot v10.4s, v21.16b, v0.4b[1]\n"
- ".inst 0x4f21f2ae // sudot v14.4s, v21.16b, v1.4b[1]\n"
- ".inst 0x4f22f2b2 // sudot v18.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x10, #0x80]\n"
- ".inst 0x4f20f28b // sudot v11.4s, v20.16b, v0.4b[1]\n"
- ".inst 0x4f21f28f // sudot v15.4s, v20.16b, v1.4b[1]\n"
- ".inst 0x4f22f293 // sudot v19.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x10, #0x90]\n"
- ".inst 0x4f00faa8 // sudot v8.4s, v21.16b, v0.4b[2]\n"
- ".inst 0x4f01faac // sudot v12.4s, v21.16b, v1.4b[2]\n"
- ".inst 0x4f02fab0 // sudot v16.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x10, #0xa0]\n"
- ".inst 0x4f00fa89 // sudot v9.4s, v20.16b, v0.4b[2]\n"
- ".inst 0x4f01fa8d // sudot v13.4s, v20.16b, v1.4b[2]\n"
- ".inst 0x4f02fa91 // sudot v17.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x10, #0xb0]\n"
- ".inst 0x4f00faaa // sudot v10.4s, v21.16b, v0.4b[2]\n"
- ".inst 0x4f01faae // sudot v14.4s, v21.16b, v1.4b[2]\n"
- ".inst 0x4f02fab2 // sudot v18.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x10, #0xc0]\n"
- ".inst 0x4f00fa8b // sudot v11.4s, v20.16b, v0.4b[2]\n"
- ".inst 0x4f01fa8f // sudot v15.4s, v20.16b, v1.4b[2]\n"
- ".inst 0x4f02fa93 // sudot v19.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x10, #0xd0]\n"
- ".inst 0x4f20faa8 // sudot v8.4s, v21.16b, v0.4b[3]\n"
- ".inst 0x4f21faac // sudot v12.4s, v21.16b, v1.4b[3]\n"
- ".inst 0x4f22fab0 // sudot v16.4s, v21.16b, v2.4b[3]\n"
- "ldr q21, [x10, #0xe0]\n"
- ".inst 0x4f20fa89 // sudot v9.4s, v20.16b, v0.4b[3]\n"
- ".inst 0x4f21fa8d // sudot v13.4s, v20.16b, v1.4b[3]\n"
- ".inst 0x4f22fa91 // sudot v17.4s, v20.16b, v2.4b[3]\n"
- "ldr q20, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20faaa // sudot v10.4s, v21.16b, v0.4b[3]\n"
- ".inst 0x4f21faae // sudot v14.4s, v21.16b, v1.4b[3]\n"
- ".inst 0x4f22fab2 // sudot v18.4s, v21.16b, v2.4b[3]\n"
- ".inst 0x4f20fa8b // sudot v11.4s, v20.16b, v0.4b[3]\n"
- ".inst 0x4f21fa8f // sudot v15.4s, v20.16b, v1.4b[3]\n"
- ".inst 0x4f22fa93 // sudot v19.4s, v20.16b, v2.4b[3]\n"
- "87:" // Height 3: Multiply loop: Main loop skip
- "cbz x27, 92f\n"
- "cmp x27, #0x4\n"
- "blt 89f\n"
- "88:" // Height 3: Multiply loop: Odd block loop
- "ldr s24, [x26], #0x4\n"
- "ldr s23, [x25], #0x4\n"
- "sub x27, x27, #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr q21, [x10, #0x0]\n"
- "cmp x27, #0x4\n"
- "ldr q20, [x10, #0x10]\n"
- ".inst 0x4f18f2a8 // sudot v8.4s, v21.16b, v24.4b[0]\n"
- ".inst 0x4f17f2ac // sudot v12.4s, v21.16b, v23.4b[0]\n"
- ".inst 0x4f16f2b0 // sudot v16.4s, v21.16b, v22.4b[0]\n"
- "ldr q21, [x10, #0x20]\n"
- ".inst 0x4f18f289 // sudot v9.4s, v20.16b, v24.4b[0]\n"
- ".inst 0x4f17f28d // sudot v13.4s, v20.16b, v23.4b[0]\n"
- ".inst 0x4f16f291 // sudot v17.4s, v20.16b, v22.4b[0]\n"
- "ldr q20, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f18f2aa // sudot v10.4s, v21.16b, v24.4b[0]\n"
- ".inst 0x4f17f2ae // sudot v14.4s, v21.16b, v23.4b[0]\n"
- ".inst 0x4f16f2b2 // sudot v18.4s, v21.16b, v22.4b[0]\n"
- ".inst 0x4f18f28b // sudot v11.4s, v20.16b, v24.4b[0]\n"
- ".inst 0x4f17f28f // sudot v15.4s, v20.16b, v23.4b[0]\n"
- ".inst 0x4f16f293 // sudot v19.4s, v20.16b, v22.4b[0]\n"
- "bge 88b\n"
- "89:" // Height 3: Multiply loop: Skip odd blocks
- "cbz x27, 92f\n"
- "tbz x27, #1, 90f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
- "ldr h2, [x24], #0x2\n"
- "tbz x27, #0, 91f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x25]\n"
- "ld1 { v2.b }[2], [x24]\n"
- "b 91f\n"
- "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x25, #0x0]\n"
- "ldr b2, [x24, #0x0]\n"
- "91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q21, [x10, #0x0]\n"
- "ldr q20, [x10, #0x10]\n"
- ".inst 0x4f00f2a8 // sudot v8.4s, v21.16b, v0.4b[0]\n"
- ".inst 0x4f01f2ac // sudot v12.4s, v21.16b, v1.4b[0]\n"
- ".inst 0x4f02f2b0 // sudot v16.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x10, #0x20]\n"
- ".inst 0x4f00f289 // sudot v9.4s, v20.16b, v0.4b[0]\n"
- ".inst 0x4f01f28d // sudot v13.4s, v20.16b, v1.4b[0]\n"
- ".inst 0x4f02f291 // sudot v17.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f00f2aa // sudot v10.4s, v21.16b, v0.4b[0]\n"
- ".inst 0x4f01f2ae // sudot v14.4s, v21.16b, v1.4b[0]\n"
- ".inst 0x4f02f2b2 // sudot v18.4s, v21.16b, v2.4b[0]\n"
- ".inst 0x4f00f28b // sudot v11.4s, v20.16b, v0.4b[0]\n"
- ".inst 0x4f01f28f // sudot v15.4s, v20.16b, v1.4b[0]\n"
- ".inst 0x4f02f293 // sudot v19.4s, v20.16b, v2.4b[0]\n"
- "92:" // Height 3: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 82b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "bge 101f\n"
- "tbz x11, #3, 96f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "tbz x11, #2, 94f\n"
- "st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "tbz x11, #1, 93f\n"
- "str d11, [x9], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "tbz x11, #0, 100f\n"
- "st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "b 100f\n"
- "93:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x11, #0, 100f\n"
- "str s11, [x9, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "b 100f\n"
- "94:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x11, #1, 95f\n"
- "str d10, [x9], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "tbz x11, #0, 100f\n"
- "st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "b 100f\n"
- "95:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x11, #0, 100f\n"
- "str s10, [x9, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "b 100f\n"
- "96:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x11, #2, 98f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "tbz x11, #1, 97f\n"
- "str d9, [x9], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "tbz x11, #0, 100f\n"
- "st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "b 100f\n"
- "97:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x11, #0, 100f\n"
- "str s9, [x9, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "b 100f\n"
- "98:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x11, #1, 99f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "tbz x11, #0, 100f\n"
- "st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "b 100f\n"
- "99:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x9, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "100:" // Height 3: Partial direct writeback: Done
- "b 102f\n"
- "101:" // Height 3: Full writeback
- "str q8, [x9, #0x0]\n"
- "str q9, [x9, #0x10]\n"
- "str q10, [x9, #0x20]\n"
- "str q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "102:" // Height 3: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 70b\n"
- "b 206f\n"
- "103:" // Height 4
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "104:" // Height 4: Column loop
- "tbz %x[flags], #0, 114f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "bge 113f\n"
- "tbz x11, #3, 108f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
- "tbz x11, #2, 106f\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
- "tbz x11, #1, 105f\n"
- "ldr d11, [x9], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "mov x25, #0x38\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "tbz x11, #0, 112f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "b 112f\n"
- "105:" // Height 4: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 112f\n"
- "ldr s11, [x9, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "b 112f\n"
- "106:" // Height 4: Partial accumulate: partial_2_8
- "tbz x11, #1, 107f\n"
- "ldr d10, [x9], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x25, #0x28\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "tbz x11, #0, 112f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
- "b 112f\n"
- "107:" // Height 4: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 112f\n"
- "ldr s10, [x9, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
- "b 112f\n"
- "108:" // Height 4: Partial accumulate: partial_4_0
- "tbz x11, #2, 110f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "tbz x11, #1, 109f\n"
- "ldr d9, [x9], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "mov x25, #0x18\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "tbz x11, #0, 112f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "b 112f\n"
- "109:" // Height 4: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 112f\n"
- "ldr s9, [x9, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "b 112f\n"
- "110:" // Height 4: Partial accumulate: partial_2_0
- "tbz x11, #1, 111f\n"
- "ldr d8, [x9], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x25, #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "tbz x11, #0, 112f\n"
- "ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "b 112f\n"
- "111:" // Height 4: Partial accumulate: partial_1_0
- "ldr s8, [x9, #0x0]\n"
- "ldr s12, [x24, #0x0]\n"
- "mov x25, #0x0\n"
- "ldr s16, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "112:" // Height 4: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 115f\n"
- "113:" // Height 4: full accumulate
- "ldr q8, [x9, #0x0]\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "b 115f\n"
- "114:" // Height 4: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "115:" // Height 4: setup done
- "mov x28, #0x0\n"
- "116:" // Height 4: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 117f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "ldr x23, [x20, #0x18]\n"
- "cbnz x28, 118f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "b 118f\n"
- "117:" // Height 4: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "add x23, x24, x21\n"
- "118:" // Height 4: input setup done
- "cmp x27, #0x10\n"
- "blt 121f\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x25, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q2, [x24, #0x0]\n"
- "ldr q3, [x23, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "blt 120f\n"
- "119:" // Height 4: Multiply loop: Main loop head
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cc // sudot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f02f0d0 // sudot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0d4 // sudot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q25, [x10, #0x20]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ed // sudot v13.4s, v7.16b, v1.4b[0]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f02f0f1 // sudot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0f5 // sudot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q24, [x10, #0x30]\n"
- "cmp x27, #0x20\n"
- ".inst 0x4f00f32a // sudot v10.4s, v25.16b, v0.4b[0]\n"
- ".inst 0x4f01f32e // sudot v14.4s, v25.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f02f332 // sudot v18.4s, v25.16b, v2.4b[0]\n"
- ".inst 0x4f03f336 // sudot v22.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f00f30b // sudot v11.4s, v24.16b, v0.4b[0]\n"
- ".inst 0x4f01f30f // sudot v15.4s, v24.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f02f313 // sudot v19.4s, v24.16b, v2.4b[0]\n"
- ".inst 0x4f03f317 // sudot v23.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x10, #0x50]\n"
- ".inst 0x4f20f328 // sudot v8.4s, v25.16b, v0.4b[1]\n"
- ".inst 0x4f21f32c // sudot v12.4s, v25.16b, v1.4b[1]\n"
- ".inst 0x4f22f330 // sudot v16.4s, v25.16b, v2.4b[1]\n"
- ".inst 0x4f23f334 // sudot v20.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x10, #0x60]\n"
- ".inst 0x4f20f309 // sudot v9.4s, v24.16b, v0.4b[1]\n"
- ".inst 0x4f21f30d // sudot v13.4s, v24.16b, v1.4b[1]\n"
- ".inst 0x4f22f311 // sudot v17.4s, v24.16b, v2.4b[1]\n"
- ".inst 0x4f23f315 // sudot v21.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x10, #0x70]\n"
- ".inst 0x4f20f32a // sudot v10.4s, v25.16b, v0.4b[1]\n"
- ".inst 0x4f21f32e // sudot v14.4s, v25.16b, v1.4b[1]\n"
- ".inst 0x4f22f332 // sudot v18.4s, v25.16b, v2.4b[1]\n"
- ".inst 0x4f23f336 // sudot v22.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x10, #0x80]\n"
- ".inst 0x4f20f30b // sudot v11.4s, v24.16b, v0.4b[1]\n"
- ".inst 0x4f21f30f // sudot v15.4s, v24.16b, v1.4b[1]\n"
- ".inst 0x4f22f313 // sudot v19.4s, v24.16b, v2.4b[1]\n"
- ".inst 0x4f23f317 // sudot v23.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x10, #0x90]\n"
- ".inst 0x4f00fb28 // sudot v8.4s, v25.16b, v0.4b[2]\n"
- ".inst 0x4f01fb2c // sudot v12.4s, v25.16b, v1.4b[2]\n"
- ".inst 0x4f02fb30 // sudot v16.4s, v25.16b, v2.4b[2]\n"
- ".inst 0x4f03fb34 // sudot v20.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x10, #0xa0]\n"
- ".inst 0x4f00fb09 // sudot v9.4s, v24.16b, v0.4b[2]\n"
- ".inst 0x4f01fb0d // sudot v13.4s, v24.16b, v1.4b[2]\n"
- ".inst 0x4f02fb11 // sudot v17.4s, v24.16b, v2.4b[2]\n"
- ".inst 0x4f03fb15 // sudot v21.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x10, #0xb0]\n"
- ".inst 0x4f00fb2a // sudot v10.4s, v25.16b, v0.4b[2]\n"
- ".inst 0x4f01fb2e // sudot v14.4s, v25.16b, v1.4b[2]\n"
- ".inst 0x4f02fb32 // sudot v18.4s, v25.16b, v2.4b[2]\n"
- ".inst 0x4f03fb36 // sudot v22.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x10, #0xc0]\n"
- ".inst 0x4f00fb0b // sudot v11.4s, v24.16b, v0.4b[2]\n"
- ".inst 0x4f01fb0f // sudot v15.4s, v24.16b, v1.4b[2]\n"
- ".inst 0x4f02fb13 // sudot v19.4s, v24.16b, v2.4b[2]\n"
- ".inst 0x4f03fb17 // sudot v23.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x10, #0xd0]\n"
- ".inst 0x4f20fb28 // sudot v8.4s, v25.16b, v0.4b[3]\n"
- ".inst 0x4f21fb2c // sudot v12.4s, v25.16b, v1.4b[3]\n"
- ".inst 0x4f22fb30 // sudot v16.4s, v25.16b, v2.4b[3]\n"
- ".inst 0x4f23fb34 // sudot v20.4s, v25.16b, v3.4b[3]\n"
- "ldr q25, [x10, #0xe0]\n"
- ".inst 0x4f20fb09 // sudot v9.4s, v24.16b, v0.4b[3]\n"
- ".inst 0x4f21fb0d // sudot v13.4s, v24.16b, v1.4b[3]\n"
- ".inst 0x4f22fb11 // sudot v17.4s, v24.16b, v2.4b[3]\n"
- ".inst 0x4f23fb15 // sudot v21.4s, v24.16b, v3.4b[3]\n"
- "ldr q24, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20fb2a // sudot v10.4s, v25.16b, v0.4b[3]\n"
- ".inst 0x4f21fb2e // sudot v14.4s, v25.16b, v1.4b[3]\n"
- ".inst 0x4f22fb32 // sudot v18.4s, v25.16b, v2.4b[3]\n"
- ".inst 0x4f23fb36 // sudot v22.4s, v25.16b, v3.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f20fb0b // sudot v11.4s, v24.16b, v0.4b[3]\n"
- "ldr q0, [x26, #0x0]\n"
- ".inst 0x4f21fb0f // sudot v15.4s, v24.16b, v1.4b[3]\n"
- "ldr q1, [x25, #0x0]\n"
- ".inst 0x4f22fb13 // sudot v19.4s, v24.16b, v2.4b[3]\n"
- "ldr q2, [x24, #0x0]\n"
- ".inst 0x4f23fb17 // sudot v23.4s, v24.16b, v3.4b[3]\n"
- "ldr q3, [x23, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "bge 119b\n"
- "120:" // Height 4: Multiply loop: Single iteration only
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cc // sudot v12.4s, v6.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f02f0d0 // sudot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0d4 // sudot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q25, [x10, #0x20]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ed // sudot v13.4s, v7.16b, v1.4b[0]\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f02f0f1 // sudot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0f5 // sudot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q24, [x10, #0x30]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f00f32a // sudot v10.4s, v25.16b, v0.4b[0]\n"
- ".inst 0x4f01f32e // sudot v14.4s, v25.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f02f332 // sudot v18.4s, v25.16b, v2.4b[0]\n"
- ".inst 0x4f03f336 // sudot v22.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x10, #0x40]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f00f30b // sudot v11.4s, v24.16b, v0.4b[0]\n"
- ".inst 0x4f01f30f // sudot v15.4s, v24.16b, v1.4b[0]\n"
- ".inst 0x4f02f313 // sudot v19.4s, v24.16b, v2.4b[0]\n"
- ".inst 0x4f03f317 // sudot v23.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x10, #0x50]\n"
- ".inst 0x4f20f328 // sudot v8.4s, v25.16b, v0.4b[1]\n"
- ".inst 0x4f21f32c // sudot v12.4s, v25.16b, v1.4b[1]\n"
- ".inst 0x4f22f330 // sudot v16.4s, v25.16b, v2.4b[1]\n"
- ".inst 0x4f23f334 // sudot v20.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x10, #0x60]\n"
- ".inst 0x4f20f309 // sudot v9.4s, v24.16b, v0.4b[1]\n"
- ".inst 0x4f21f30d // sudot v13.4s, v24.16b, v1.4b[1]\n"
- ".inst 0x4f22f311 // sudot v17.4s, v24.16b, v2.4b[1]\n"
- ".inst 0x4f23f315 // sudot v21.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x10, #0x70]\n"
- ".inst 0x4f20f32a // sudot v10.4s, v25.16b, v0.4b[1]\n"
- ".inst 0x4f21f32e // sudot v14.4s, v25.16b, v1.4b[1]\n"
- ".inst 0x4f22f332 // sudot v18.4s, v25.16b, v2.4b[1]\n"
- ".inst 0x4f23f336 // sudot v22.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x10, #0x80]\n"
- ".inst 0x4f20f30b // sudot v11.4s, v24.16b, v0.4b[1]\n"
- ".inst 0x4f21f30f // sudot v15.4s, v24.16b, v1.4b[1]\n"
- ".inst 0x4f22f313 // sudot v19.4s, v24.16b, v2.4b[1]\n"
- ".inst 0x4f23f317 // sudot v23.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x10, #0x90]\n"
- ".inst 0x4f00fb28 // sudot v8.4s, v25.16b, v0.4b[2]\n"
- ".inst 0x4f01fb2c // sudot v12.4s, v25.16b, v1.4b[2]\n"
- ".inst 0x4f02fb30 // sudot v16.4s, v25.16b, v2.4b[2]\n"
- ".inst 0x4f03fb34 // sudot v20.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x10, #0xa0]\n"
- ".inst 0x4f00fb09 // sudot v9.4s, v24.16b, v0.4b[2]\n"
- ".inst 0x4f01fb0d // sudot v13.4s, v24.16b, v1.4b[2]\n"
- ".inst 0x4f02fb11 // sudot v17.4s, v24.16b, v2.4b[2]\n"
- ".inst 0x4f03fb15 // sudot v21.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x10, #0xb0]\n"
- ".inst 0x4f00fb2a // sudot v10.4s, v25.16b, v0.4b[2]\n"
- ".inst 0x4f01fb2e // sudot v14.4s, v25.16b, v1.4b[2]\n"
- ".inst 0x4f02fb32 // sudot v18.4s, v25.16b, v2.4b[2]\n"
- ".inst 0x4f03fb36 // sudot v22.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x10, #0xc0]\n"
- ".inst 0x4f00fb0b // sudot v11.4s, v24.16b, v0.4b[2]\n"
- ".inst 0x4f01fb0f // sudot v15.4s, v24.16b, v1.4b[2]\n"
- ".inst 0x4f02fb13 // sudot v19.4s, v24.16b, v2.4b[2]\n"
- ".inst 0x4f03fb17 // sudot v23.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x10, #0xd0]\n"
- ".inst 0x4f20fb28 // sudot v8.4s, v25.16b, v0.4b[3]\n"
- ".inst 0x4f21fb2c // sudot v12.4s, v25.16b, v1.4b[3]\n"
- ".inst 0x4f22fb30 // sudot v16.4s, v25.16b, v2.4b[3]\n"
- ".inst 0x4f23fb34 // sudot v20.4s, v25.16b, v3.4b[3]\n"
- "ldr q25, [x10, #0xe0]\n"
- ".inst 0x4f20fb09 // sudot v9.4s, v24.16b, v0.4b[3]\n"
- ".inst 0x4f21fb0d // sudot v13.4s, v24.16b, v1.4b[3]\n"
- ".inst 0x4f22fb11 // sudot v17.4s, v24.16b, v2.4b[3]\n"
- ".inst 0x4f23fb15 // sudot v21.4s, v24.16b, v3.4b[3]\n"
- "ldr q24, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20fb2a // sudot v10.4s, v25.16b, v0.4b[3]\n"
- ".inst 0x4f21fb2e // sudot v14.4s, v25.16b, v1.4b[3]\n"
- ".inst 0x4f22fb32 // sudot v18.4s, v25.16b, v2.4b[3]\n"
- ".inst 0x4f23fb36 // sudot v22.4s, v25.16b, v3.4b[3]\n"
- ".inst 0x4f20fb0b // sudot v11.4s, v24.16b, v0.4b[3]\n"
- ".inst 0x4f21fb0f // sudot v15.4s, v24.16b, v1.4b[3]\n"
- ".inst 0x4f22fb13 // sudot v19.4s, v24.16b, v2.4b[3]\n"
- ".inst 0x4f23fb17 // sudot v23.4s, v24.16b, v3.4b[3]\n"
- "121:" // Height 4: Multiply loop: Main loop skip
- "cbz x27, 126f\n"
- "cmp x27, #0x4\n"
- "blt 123f\n"
- "122:" // Height 4: Multiply loop: Odd block loop
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "sub x27, x27, #0x4\n"
- "ldr s27, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "cmp x27, #0x4\n"
- "ldr q25, [x10, #0x0]\n"
- "ldr q24, [x10, #0x10]\n"
- ".inst 0x4f1df328 // sudot v8.4s, v25.16b, v29.4b[0]\n"
- ".inst 0x4f1cf32c // sudot v12.4s, v25.16b, v28.4b[0]\n"
- ".inst 0x4f1bf330 // sudot v16.4s, v25.16b, v27.4b[0]\n"
- ".inst 0x4f1af334 // sudot v20.4s, v25.16b, v26.4b[0]\n"
- "ldr q25, [x10, #0x20]\n"
- ".inst 0x4f1df309 // sudot v9.4s, v24.16b, v29.4b[0]\n"
- ".inst 0x4f1cf30d // sudot v13.4s, v24.16b, v28.4b[0]\n"
- ".inst 0x4f1bf311 // sudot v17.4s, v24.16b, v27.4b[0]\n"
- ".inst 0x4f1af315 // sudot v21.4s, v24.16b, v26.4b[0]\n"
- "ldr q24, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f1df32a // sudot v10.4s, v25.16b, v29.4b[0]\n"
- ".inst 0x4f1cf32e // sudot v14.4s, v25.16b, v28.4b[0]\n"
- ".inst 0x4f1bf332 // sudot v18.4s, v25.16b, v27.4b[0]\n"
- ".inst 0x4f1af336 // sudot v22.4s, v25.16b, v26.4b[0]\n"
- ".inst 0x4f1df30b // sudot v11.4s, v24.16b, v29.4b[0]\n"
- ".inst 0x4f1cf30f // sudot v15.4s, v24.16b, v28.4b[0]\n"
- ".inst 0x4f1bf313 // sudot v19.4s, v24.16b, v27.4b[0]\n"
- ".inst 0x4f1af317 // sudot v23.4s, v24.16b, v26.4b[0]\n"
- "bge 122b\n"
- "123:" // Height 4: Multiply loop: Skip odd blocks
- "cbz x27, 126f\n"
- "tbz x27, #1, 124f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
- "ldr h2, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "tbz x27, #0, 125f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x25]\n"
- "ld1 { v2.b }[2], [x24]\n"
- "ld1 { v3.b }[2], [x23]\n"
- "b 125f\n"
- "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x25, #0x0]\n"
- "ldr b2, [x24, #0x0]\n"
- "ldr b3, [x23, #0x0]\n"
- "125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q25, [x10, #0x0]\n"
- "ldr q24, [x10, #0x10]\n"
- ".inst 0x4f00f328 // sudot v8.4s, v25.16b, v0.4b[0]\n"
- ".inst 0x4f01f32c // sudot v12.4s, v25.16b, v1.4b[0]\n"
- ".inst 0x4f02f330 // sudot v16.4s, v25.16b, v2.4b[0]\n"
- ".inst 0x4f03f334 // sudot v20.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x10, #0x20]\n"
- ".inst 0x4f00f309 // sudot v9.4s, v24.16b, v0.4b[0]\n"
- ".inst 0x4f01f30d // sudot v13.4s, v24.16b, v1.4b[0]\n"
- ".inst 0x4f02f311 // sudot v17.4s, v24.16b, v2.4b[0]\n"
- ".inst 0x4f03f315 // sudot v21.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f00f32a // sudot v10.4s, v25.16b, v0.4b[0]\n"
- ".inst 0x4f01f32e // sudot v14.4s, v25.16b, v1.4b[0]\n"
- ".inst 0x4f02f332 // sudot v18.4s, v25.16b, v2.4b[0]\n"
- ".inst 0x4f03f336 // sudot v22.4s, v25.16b, v3.4b[0]\n"
- ".inst 0x4f00f30b // sudot v11.4s, v24.16b, v0.4b[0]\n"
- ".inst 0x4f01f30f // sudot v15.4s, v24.16b, v1.4b[0]\n"
- ".inst 0x4f02f313 // sudot v19.4s, v24.16b, v2.4b[0]\n"
- ".inst 0x4f03f317 // sudot v23.4s, v24.16b, v3.4b[0]\n"
- "126:" // Height 4: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 116b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "bge 135f\n"
- "tbz x11, #3, 130f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
- "tbz x11, #2, 128f\n"
- "st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
- "tbz x11, #1, 127f\n"
- "str d11, [x9], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
- "tbz x11, #0, 134f\n"
- "st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
- "b 134f\n"
- "127:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x11, #0, 134f\n"
- "str s11, [x9, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
- "b 134f\n"
- "128:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x11, #1, 129f\n"
- "str d10, [x9], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
- "tbz x11, #0, 134f\n"
- "st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
- "b 134f\n"
- "129:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x11, #0, 134f\n"
- "str s10, [x9, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
- "b 134f\n"
- "130:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x11, #2, 132f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "tbz x11, #1, 131f\n"
- "str d9, [x9], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
- "tbz x11, #0, 134f\n"
- "st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
- "b 134f\n"
- "131:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x11, #0, 134f\n"
- "str s9, [x9, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
- "b 134f\n"
- "132:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x11, #1, 133f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "tbz x11, #0, 134f\n"
- "st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
- "b 134f\n"
- "133:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x9, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
- "134:" // Height 4: Partial direct writeback: Done
- "b 136f\n"
- "135:" // Height 4: Full writeback
- "str q8, [x9, #0x0]\n"
- "str q9, [x9, #0x10]\n"
- "str q10, [x9, #0x20]\n"
- "str q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "136:" // Height 4: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 104b\n"
- "b 206f\n"
- "137:" // Height 5
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "138:" // Height 5: Column loop
- "tbz %x[flags], #0, 148f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "bge 147f\n"
- "tbz x11, #3, 142f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
- "ld1 { v25.4s }, [x21], #0x10\n"
- "tbz x11, #2, 140f\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
- "ld1 { v26.4s }, [x21], #0x10\n"
- "tbz x11, #1, 139f\n"
- "ldr d11, [x9], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "mov x25, #0x38\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d27, [x21], #0x8\n"
- "tbz x11, #0, 146f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "ld1 { v27.s }[2], [x21]\n"
- "b 146f\n"
- "139:" // Height 5: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 146f\n"
- "ldr s11, [x9, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "ldr s27, [x21, #0x0]\n"
- "b 146f\n"
- "140:" // Height 5: Partial accumulate: partial_2_8
- "tbz x11, #1, 141f\n"
- "ldr d10, [x9], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x25, #0x28\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
- "tbz x11, #0, 146f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
- "ld1 { v26.s }[2], [x21]\n"
- "b 146f\n"
- "141:" // Height 5: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 146f\n"
- "ldr s10, [x9, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
- "ldr s26, [x21, #0x0]\n"
- "b 146f\n"
- "142:" // Height 5: Partial accumulate: partial_4_0
- "tbz x11, #2, 144f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
- "tbz x11, #1, 143f\n"
- "ldr d9, [x9], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "mov x25, #0x18\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d25, [x21], #0x8\n"
- "tbz x11, #0, 146f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "ld1 { v25.s }[2], [x21]\n"
- "b 146f\n"
- "143:" // Height 5: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 146f\n"
- "ldr s9, [x9, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "ldr s25, [x21, #0x0]\n"
- "b 146f\n"
- "144:" // Height 5: Partial accumulate: partial_2_0
- "tbz x11, #1, 145f\n"
- "ldr d8, [x9], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x25, #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
- "tbz x11, #0, 146f\n"
- "ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "ld1 { v24.s }[2], [x21]\n"
- "b 146f\n"
- "145:" // Height 5: Partial accumulate: partial_1_0
- "ldr s8, [x9, #0x0]\n"
- "ldr s12, [x24, #0x0]\n"
- "mov x25, #0x0\n"
- "ldr s16, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "ldr s24, [x21, #0x0]\n"
- "146:" // Height 5: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 149f\n"
- "147:" // Height 5: full accumulate
- "ldr q8, [x9, #0x0]\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "b 149f\n"
- "148:" // Height 5: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "149:" // Height 5: setup done
- "mov x28, #0x0\n"
- "150:" // Height 5: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 151f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "ldr x23, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x28, 152f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "b 152f\n"
- "151:" // Height 5: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "152:" // Height 5: input setup done
- "cmp x27, #0x10\n"
- "blt 155f\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x25, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q2, [x24, #0x0]\n"
- "ldr q3, [x23, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "blt 154f\n"
- "153:" // Height 5: Multiply loop: Main loop head
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cc // sudot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f02f0d0 // sudot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0d4 // sudot v20.4s, v6.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f04f0d8 // sudot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q29, [x10, #0x20]\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f01f0ed // sudot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0f1 // sudot v17.4s, v7.16b, v2.4b[0]\n"
- "add x22, x22, #0x10\n"
- "cmp x27, #0x20\n"
- ".inst 0x4f03f0f5 // sudot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f04f0f9 // sudot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q28, [x10, #0x30]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f00f3aa // sudot v10.4s, v29.16b, v0.4b[0]\n"
- ".inst 0x4f01f3ae // sudot v14.4s, v29.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f02f3b2 // sudot v18.4s, v29.16b, v2.4b[0]\n"
- ".inst 0x4f03f3b6 // sudot v22.4s, v29.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f04f3ba // sudot v26.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x10, #0x40]\n"
- ".inst 0x4f00f38b // sudot v11.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x4f01f38f // sudot v15.4s, v28.16b, v1.4b[0]\n"
- ".inst 0x4f02f393 // sudot v19.4s, v28.16b, v2.4b[0]\n"
- ".inst 0x4f03f397 // sudot v23.4s, v28.16b, v3.4b[0]\n"
- ".inst 0x4f04f39b // sudot v27.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x10, #0x50]\n"
- ".inst 0x4f20f3a8 // sudot v8.4s, v29.16b, v0.4b[1]\n"
- ".inst 0x4f21f3ac // sudot v12.4s, v29.16b, v1.4b[1]\n"
- ".inst 0x4f22f3b0 // sudot v16.4s, v29.16b, v2.4b[1]\n"
- ".inst 0x4f23f3b4 // sudot v20.4s, v29.16b, v3.4b[1]\n"
- ".inst 0x4f24f3b8 // sudot v24.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x10, #0x60]\n"
- ".inst 0x4f20f389 // sudot v9.4s, v28.16b, v0.4b[1]\n"
- ".inst 0x4f21f38d // sudot v13.4s, v28.16b, v1.4b[1]\n"
- ".inst 0x4f22f391 // sudot v17.4s, v28.16b, v2.4b[1]\n"
- ".inst 0x4f23f395 // sudot v21.4s, v28.16b, v3.4b[1]\n"
- ".inst 0x4f24f399 // sudot v25.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x10, #0x70]\n"
- ".inst 0x4f20f3aa // sudot v10.4s, v29.16b, v0.4b[1]\n"
- ".inst 0x4f21f3ae // sudot v14.4s, v29.16b, v1.4b[1]\n"
- ".inst 0x4f22f3b2 // sudot v18.4s, v29.16b, v2.4b[1]\n"
- ".inst 0x4f23f3b6 // sudot v22.4s, v29.16b, v3.4b[1]\n"
- ".inst 0x4f24f3ba // sudot v26.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x10, #0x80]\n"
- ".inst 0x4f20f38b // sudot v11.4s, v28.16b, v0.4b[1]\n"
- ".inst 0x4f21f38f // sudot v15.4s, v28.16b, v1.4b[1]\n"
- ".inst 0x4f22f393 // sudot v19.4s, v28.16b, v2.4b[1]\n"
- ".inst 0x4f23f397 // sudot v23.4s, v28.16b, v3.4b[1]\n"
- ".inst 0x4f24f39b // sudot v27.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x10, #0x90]\n"
- ".inst 0x4f00fba8 // sudot v8.4s, v29.16b, v0.4b[2]\n"
- ".inst 0x4f01fbac // sudot v12.4s, v29.16b, v1.4b[2]\n"
- ".inst 0x4f02fbb0 // sudot v16.4s, v29.16b, v2.4b[2]\n"
- ".inst 0x4f03fbb4 // sudot v20.4s, v29.16b, v3.4b[2]\n"
- ".inst 0x4f04fbb8 // sudot v24.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x10, #0xa0]\n"
- ".inst 0x4f00fb89 // sudot v9.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x4f01fb8d // sudot v13.4s, v28.16b, v1.4b[2]\n"
- ".inst 0x4f02fb91 // sudot v17.4s, v28.16b, v2.4b[2]\n"
- ".inst 0x4f03fb95 // sudot v21.4s, v28.16b, v3.4b[2]\n"
- ".inst 0x4f04fb99 // sudot v25.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x10, #0xb0]\n"
- ".inst 0x4f00fbaa // sudot v10.4s, v29.16b, v0.4b[2]\n"
- ".inst 0x4f01fbae // sudot v14.4s, v29.16b, v1.4b[2]\n"
- ".inst 0x4f02fbb2 // sudot v18.4s, v29.16b, v2.4b[2]\n"
- ".inst 0x4f03fbb6 // sudot v22.4s, v29.16b, v3.4b[2]\n"
- ".inst 0x4f04fbba // sudot v26.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x10, #0xc0]\n"
- ".inst 0x4f00fb8b // sudot v11.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x4f01fb8f // sudot v15.4s, v28.16b, v1.4b[2]\n"
- ".inst 0x4f02fb93 // sudot v19.4s, v28.16b, v2.4b[2]\n"
- ".inst 0x4f03fb97 // sudot v23.4s, v28.16b, v3.4b[2]\n"
- ".inst 0x4f04fb9b // sudot v27.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x10, #0xd0]\n"
- ".inst 0x4f20fba8 // sudot v8.4s, v29.16b, v0.4b[3]\n"
- ".inst 0x4f21fbac // sudot v12.4s, v29.16b, v1.4b[3]\n"
- ".inst 0x4f22fbb0 // sudot v16.4s, v29.16b, v2.4b[3]\n"
- ".inst 0x4f23fbb4 // sudot v20.4s, v29.16b, v3.4b[3]\n"
- ".inst 0x4f24fbb8 // sudot v24.4s, v29.16b, v4.4b[3]\n"
- "ldr q29, [x10, #0xe0]\n"
- ".inst 0x4f20fb89 // sudot v9.4s, v28.16b, v0.4b[3]\n"
- ".inst 0x4f21fb8d // sudot v13.4s, v28.16b, v1.4b[3]\n"
- ".inst 0x4f22fb91 // sudot v17.4s, v28.16b, v2.4b[3]\n"
- ".inst 0x4f23fb95 // sudot v21.4s, v28.16b, v3.4b[3]\n"
- ".inst 0x4f24fb99 // sudot v25.4s, v28.16b, v4.4b[3]\n"
- "ldr q28, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20fbaa // sudot v10.4s, v29.16b, v0.4b[3]\n"
- ".inst 0x4f21fbae // sudot v14.4s, v29.16b, v1.4b[3]\n"
- ".inst 0x4f22fbb2 // sudot v18.4s, v29.16b, v2.4b[3]\n"
- ".inst 0x4f23fbb6 // sudot v22.4s, v29.16b, v3.4b[3]\n"
- ".inst 0x4f24fbba // sudot v26.4s, v29.16b, v4.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f20fb8b // sudot v11.4s, v28.16b, v0.4b[3]\n"
- "ldr q0, [x26, #0x0]\n"
- ".inst 0x4f21fb8f // sudot v15.4s, v28.16b, v1.4b[3]\n"
- "ldr q1, [x25, #0x0]\n"
- ".inst 0x4f22fb93 // sudot v19.4s, v28.16b, v2.4b[3]\n"
- "ldr q2, [x24, #0x0]\n"
- ".inst 0x4f23fb97 // sudot v23.4s, v28.16b, v3.4b[3]\n"
- "ldr q3, [x23, #0x0]\n"
- ".inst 0x4f24fb9b // sudot v27.4s, v28.16b, v4.4b[3]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "bge 153b\n"
- "154:" // Height 5: Multiply loop: Single iteration only
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cc // sudot v12.4s, v6.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f02f0d0 // sudot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0d4 // sudot v20.4s, v6.16b, v3.4b[0]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f04f0d8 // sudot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q29, [x10, #0x20]\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f01f0ed // sudot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0f1 // sudot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f03f0f5 // sudot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f04f0f9 // sudot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q28, [x10, #0x30]\n"
- "sub x27, x27, #0x10\n"
- ".inst 0x4f00f3aa // sudot v10.4s, v29.16b, v0.4b[0]\n"
- ".inst 0x4f01f3ae // sudot v14.4s, v29.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f02f3b2 // sudot v18.4s, v29.16b, v2.4b[0]\n"
- ".inst 0x4f03f3b6 // sudot v22.4s, v29.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f04f3ba // sudot v26.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x10, #0x40]\n"
- ".inst 0x4f00f38b // sudot v11.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x4f01f38f // sudot v15.4s, v28.16b, v1.4b[0]\n"
- ".inst 0x4f02f393 // sudot v19.4s, v28.16b, v2.4b[0]\n"
- ".inst 0x4f03f397 // sudot v23.4s, v28.16b, v3.4b[0]\n"
- ".inst 0x4f04f39b // sudot v27.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x10, #0x50]\n"
- ".inst 0x4f20f3a8 // sudot v8.4s, v29.16b, v0.4b[1]\n"
- ".inst 0x4f21f3ac // sudot v12.4s, v29.16b, v1.4b[1]\n"
- ".inst 0x4f22f3b0 // sudot v16.4s, v29.16b, v2.4b[1]\n"
- ".inst 0x4f23f3b4 // sudot v20.4s, v29.16b, v3.4b[1]\n"
- ".inst 0x4f24f3b8 // sudot v24.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x10, #0x60]\n"
- ".inst 0x4f20f389 // sudot v9.4s, v28.16b, v0.4b[1]\n"
- ".inst 0x4f21f38d // sudot v13.4s, v28.16b, v1.4b[1]\n"
- ".inst 0x4f22f391 // sudot v17.4s, v28.16b, v2.4b[1]\n"
- ".inst 0x4f23f395 // sudot v21.4s, v28.16b, v3.4b[1]\n"
- ".inst 0x4f24f399 // sudot v25.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x10, #0x70]\n"
- ".inst 0x4f20f3aa // sudot v10.4s, v29.16b, v0.4b[1]\n"
- ".inst 0x4f21f3ae // sudot v14.4s, v29.16b, v1.4b[1]\n"
- ".inst 0x4f22f3b2 // sudot v18.4s, v29.16b, v2.4b[1]\n"
- ".inst 0x4f23f3b6 // sudot v22.4s, v29.16b, v3.4b[1]\n"
- ".inst 0x4f24f3ba // sudot v26.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x10, #0x80]\n"
- ".inst 0x4f20f38b // sudot v11.4s, v28.16b, v0.4b[1]\n"
- ".inst 0x4f21f38f // sudot v15.4s, v28.16b, v1.4b[1]\n"
- ".inst 0x4f22f393 // sudot v19.4s, v28.16b, v2.4b[1]\n"
- ".inst 0x4f23f397 // sudot v23.4s, v28.16b, v3.4b[1]\n"
- ".inst 0x4f24f39b // sudot v27.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x10, #0x90]\n"
- ".inst 0x4f00fba8 // sudot v8.4s, v29.16b, v0.4b[2]\n"
- ".inst 0x4f01fbac // sudot v12.4s, v29.16b, v1.4b[2]\n"
- ".inst 0x4f02fbb0 // sudot v16.4s, v29.16b, v2.4b[2]\n"
- ".inst 0x4f03fbb4 // sudot v20.4s, v29.16b, v3.4b[2]\n"
- ".inst 0x4f04fbb8 // sudot v24.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x10, #0xa0]\n"
- ".inst 0x4f00fb89 // sudot v9.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x4f01fb8d // sudot v13.4s, v28.16b, v1.4b[2]\n"
- ".inst 0x4f02fb91 // sudot v17.4s, v28.16b, v2.4b[2]\n"
- ".inst 0x4f03fb95 // sudot v21.4s, v28.16b, v3.4b[2]\n"
- ".inst 0x4f04fb99 // sudot v25.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x10, #0xb0]\n"
- ".inst 0x4f00fbaa // sudot v10.4s, v29.16b, v0.4b[2]\n"
- ".inst 0x4f01fbae // sudot v14.4s, v29.16b, v1.4b[2]\n"
- ".inst 0x4f02fbb2 // sudot v18.4s, v29.16b, v2.4b[2]\n"
- ".inst 0x4f03fbb6 // sudot v22.4s, v29.16b, v3.4b[2]\n"
- ".inst 0x4f04fbba // sudot v26.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x10, #0xc0]\n"
- ".inst 0x4f00fb8b // sudot v11.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x4f01fb8f // sudot v15.4s, v28.16b, v1.4b[2]\n"
- ".inst 0x4f02fb93 // sudot v19.4s, v28.16b, v2.4b[2]\n"
- ".inst 0x4f03fb97 // sudot v23.4s, v28.16b, v3.4b[2]\n"
- ".inst 0x4f04fb9b // sudot v27.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x10, #0xd0]\n"
- ".inst 0x4f20fba8 // sudot v8.4s, v29.16b, v0.4b[3]\n"
- ".inst 0x4f21fbac // sudot v12.4s, v29.16b, v1.4b[3]\n"
- ".inst 0x4f22fbb0 // sudot v16.4s, v29.16b, v2.4b[3]\n"
- ".inst 0x4f23fbb4 // sudot v20.4s, v29.16b, v3.4b[3]\n"
- ".inst 0x4f24fbb8 // sudot v24.4s, v29.16b, v4.4b[3]\n"
- "ldr q29, [x10, #0xe0]\n"
- ".inst 0x4f20fb89 // sudot v9.4s, v28.16b, v0.4b[3]\n"
- ".inst 0x4f21fb8d // sudot v13.4s, v28.16b, v1.4b[3]\n"
- ".inst 0x4f22fb91 // sudot v17.4s, v28.16b, v2.4b[3]\n"
- ".inst 0x4f23fb95 // sudot v21.4s, v28.16b, v3.4b[3]\n"
- ".inst 0x4f24fb99 // sudot v25.4s, v28.16b, v4.4b[3]\n"
- "ldr q28, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20fbaa // sudot v10.4s, v29.16b, v0.4b[3]\n"
- ".inst 0x4f21fbae // sudot v14.4s, v29.16b, v1.4b[3]\n"
- ".inst 0x4f22fbb2 // sudot v18.4s, v29.16b, v2.4b[3]\n"
- ".inst 0x4f23fbb6 // sudot v22.4s, v29.16b, v3.4b[3]\n"
- ".inst 0x4f24fbba // sudot v26.4s, v29.16b, v4.4b[3]\n"
- ".inst 0x4f20fb8b // sudot v11.4s, v28.16b, v0.4b[3]\n"
- ".inst 0x4f21fb8f // sudot v15.4s, v28.16b, v1.4b[3]\n"
- ".inst 0x4f22fb93 // sudot v19.4s, v28.16b, v2.4b[3]\n"
- ".inst 0x4f23fb97 // sudot v23.4s, v28.16b, v3.4b[3]\n"
- ".inst 0x4f24fb9b // sudot v27.4s, v28.16b, v4.4b[3]\n"
- "155:" // Height 5: Multiply loop: Main loop skip
- "cbz x27, 160f\n"
- "cmp x27, #0x4\n"
- "blt 157f\n"
- "156:" // Height 5: Multiply loop: Odd block loop
- "ldr s2, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
- "sub x27, x27, #0x4\n"
- "ldr s0, [x24], #0x4\n"
- "ldr s31, [x23], #0x4\n"
- "cmp x27, #0x4\n"
- "ldr s30, [x22], #0x4\n"
- "ldr q29, [x10, #0x0]\n"
- "ldr q28, [x10, #0x10]\n"
- ".inst 0x4f02f3a8 // sudot v8.4s, v29.16b, v2.4b[0]\n"
- ".inst 0x4f01f3ac // sudot v12.4s, v29.16b, v1.4b[0]\n"
- ".inst 0x4f00f3b0 // sudot v16.4s, v29.16b, v0.4b[0]\n"
- ".inst 0x4f1ff3b4 // sudot v20.4s, v29.16b, v31.4b[0]\n"
- ".inst 0x4f1ef3b8 // sudot v24.4s, v29.16b, v30.4b[0]\n"
- "ldr q29, [x10, #0x20]\n"
- ".inst 0x4f02f389 // sudot v9.4s, v28.16b, v2.4b[0]\n"
- ".inst 0x4f01f38d // sudot v13.4s, v28.16b, v1.4b[0]\n"
- ".inst 0x4f00f391 // sudot v17.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x4f1ff395 // sudot v21.4s, v28.16b, v31.4b[0]\n"
- ".inst 0x4f1ef399 // sudot v25.4s, v28.16b, v30.4b[0]\n"
- "ldr q28, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f02f3aa // sudot v10.4s, v29.16b, v2.4b[0]\n"
- ".inst 0x4f01f3ae // sudot v14.4s, v29.16b, v1.4b[0]\n"
- ".inst 0x4f00f3b2 // sudot v18.4s, v29.16b, v0.4b[0]\n"
- ".inst 0x4f1ff3b6 // sudot v22.4s, v29.16b, v31.4b[0]\n"
- ".inst 0x4f1ef3ba // sudot v26.4s, v29.16b, v30.4b[0]\n"
- ".inst 0x4f02f38b // sudot v11.4s, v28.16b, v2.4b[0]\n"
- ".inst 0x4f01f38f // sudot v15.4s, v28.16b, v1.4b[0]\n"
- ".inst 0x4f00f393 // sudot v19.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x4f1ff397 // sudot v23.4s, v28.16b, v31.4b[0]\n"
- ".inst 0x4f1ef39b // sudot v27.4s, v28.16b, v30.4b[0]\n"
- "bge 156b\n"
- "157:" // Height 5: Multiply loop: Skip odd blocks
- "cbz x27, 160f\n"
- "tbz x27, #1, 158f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
- "ldr h2, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "tbz x27, #0, 159f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x25]\n"
- "ld1 { v2.b }[2], [x24]\n"
- "ld1 { v3.b }[2], [x23]\n"
- "ld1 { v4.b }[2], [x22]\n"
- "b 159f\n"
- "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x25, #0x0]\n"
- "ldr b2, [x24, #0x0]\n"
- "ldr b3, [x23, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
- "159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q29, [x10, #0x0]\n"
- "ldr q28, [x10, #0x10]\n"
- ".inst 0x4f00f3a8 // sudot v8.4s, v29.16b, v0.4b[0]\n"
- ".inst 0x4f01f3ac // sudot v12.4s, v29.16b, v1.4b[0]\n"
- ".inst 0x4f02f3b0 // sudot v16.4s, v29.16b, v2.4b[0]\n"
- ".inst 0x4f03f3b4 // sudot v20.4s, v29.16b, v3.4b[0]\n"
- ".inst 0x4f04f3b8 // sudot v24.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x10, #0x20]\n"
- ".inst 0x4f00f389 // sudot v9.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x4f01f38d // sudot v13.4s, v28.16b, v1.4b[0]\n"
- ".inst 0x4f02f391 // sudot v17.4s, v28.16b, v2.4b[0]\n"
- ".inst 0x4f03f395 // sudot v21.4s, v28.16b, v3.4b[0]\n"
- ".inst 0x4f04f399 // sudot v25.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f00f3aa // sudot v10.4s, v29.16b, v0.4b[0]\n"
- ".inst 0x4f01f3ae // sudot v14.4s, v29.16b, v1.4b[0]\n"
- ".inst 0x4f02f3b2 // sudot v18.4s, v29.16b, v2.4b[0]\n"
- ".inst 0x4f03f3b6 // sudot v22.4s, v29.16b, v3.4b[0]\n"
- ".inst 0x4f04f3ba // sudot v26.4s, v29.16b, v4.4b[0]\n"
- ".inst 0x4f00f38b // sudot v11.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x4f01f38f // sudot v15.4s, v28.16b, v1.4b[0]\n"
- ".inst 0x4f02f393 // sudot v19.4s, v28.16b, v2.4b[0]\n"
- ".inst 0x4f03f397 // sudot v23.4s, v28.16b, v3.4b[0]\n"
- ".inst 0x4f04f39b // sudot v27.4s, v28.16b, v4.4b[0]\n"
- "160:" // Height 5: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 150b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x20, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "bge 169f\n"
- "tbz x11, #3, 164f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "st1 { v25.4s }, [x21], #0x10\n"
- "tbz x11, #2, 162f\n"
- "st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
- "st1 { v26.4s }, [x21], #0x10\n"
- "tbz x11, #1, 161f\n"
- "str d11, [x9], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
- "str d27, [x21], #0x8\n"
- "tbz x11, #0, 168f\n"
- "st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x21]\n"
- "b 168f\n"
- "161:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x11, #0, 168f\n"
- "str s11, [x9, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
- "str s27, [x21, #0x0]\n"
- "b 168f\n"
- "162:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x11, #1, 163f\n"
- "str d10, [x9], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
- "str d26, [x21], #0x8\n"
- "tbz x11, #0, 168f\n"
- "st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
- "st1 { v26.s }[2], [x21]\n"
- "b 168f\n"
- "163:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x11, #0, 168f\n"
- "str s10, [x9, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
- "str s26, [x21, #0x0]\n"
- "b 168f\n"
- "164:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x11, #2, 166f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "tbz x11, #1, 165f\n"
- "str d9, [x9], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
- "str d25, [x21], #0x8\n"
- "tbz x11, #0, 168f\n"
- "st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
- "b 168f\n"
- "165:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x11, #0, 168f\n"
- "str s9, [x9, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
- "str s25, [x21, #0x0]\n"
- "b 168f\n"
- "166:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x11, #1, 167f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "tbz x11, #0, 168f\n"
- "st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
- "st1 { v24.s }[2], [x21]\n"
- "b 168f\n"
- "167:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x9, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
- "str s24, [x21, #0x0]\n"
- "168:" // Height 5: Partial direct writeback: Done
- "b 170f\n"
- "169:" // Height 5: Full writeback
- "str q8, [x9, #0x0]\n"
- "str q9, [x9, #0x10]\n"
- "str q10, [x9, #0x20]\n"
- "str q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "str q24, [x21, #0x0]\n"
- "str q25, [x21, #0x10]\n"
- "str q26, [x21, #0x20]\n"
- "str q27, [x21, #0x30]\n"
- "170:" // Height 5: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 138b\n"
- "b 206f\n"
- "171:" // Height 6
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "mov x20, #0x18\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "172:" // Height 6: Column loop
- "tbz %x[flags], #0, 182f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
- "bge 181f\n"
- "tbz x11, #3, 176f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
- "ld1 { v25.4s }, [x21], #0x10\n"
- "ld1 { v29.4s }, [x20], #0x10\n"
- "tbz x11, #2, 174f\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
- "ld1 { v26.4s }, [x21], #0x10\n"
- "ld1 { v30.4s }, [x20], #0x10\n"
- "tbz x11, #1, 173f\n"
- "ldr d11, [x9], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "mov x25, #0x38\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d27, [x21], #0x8\n"
- "ldr d31, [x20], #0x8\n"
- "tbz x11, #0, 180f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "ld1 { v27.s }[2], [x21]\n"
- "ld1 { v31.s }[2], [x20]\n"
- "b 180f\n"
- "173:" // Height 6: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 180f\n"
- "ldr s11, [x9, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "ldr s27, [x21, #0x0]\n"
- "ldr s31, [x20, #0x0]\n"
- "b 180f\n"
- "174:" // Height 6: Partial accumulate: partial_2_8
- "tbz x11, #1, 175f\n"
- "ldr d10, [x9], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x25, #0x28\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
- "ldr d30, [x20], #0x8\n"
- "tbz x11, #0, 180f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
- "ld1 { v26.s }[2], [x21]\n"
- "ld1 { v30.s }[2], [x20]\n"
- "b 180f\n"
- "175:" // Height 6: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 180f\n"
- "ldr s10, [x9, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
- "ldr s26, [x21, #0x0]\n"
- "ldr s30, [x20, #0x0]\n"
- "b 180f\n"
- "176:" // Height 6: Partial accumulate: partial_4_0
- "tbz x11, #2, 178f\n"
- "ld1 { v8.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
- "tbz x11, #1, 177f\n"
- "ldr d9, [x9], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "mov x25, #0x18\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d25, [x21], #0x8\n"
- "ldr d29, [x20], #0x8\n"
- "tbz x11, #0, 180f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "ld1 { v25.s }[2], [x21]\n"
- "ld1 { v29.s }[2], [x20]\n"
- "b 180f\n"
- "177:" // Height 6: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 180f\n"
- "ldr s9, [x9, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "ldr s25, [x21, #0x0]\n"
- "ldr s29, [x20, #0x0]\n"
- "b 180f\n"
- "178:" // Height 6: Partial accumulate: partial_2_0
- "tbz x11, #1, 179f\n"
- "ldr d8, [x9], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x25, #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
- "ldr d28, [x20], #0x8\n"
- "tbz x11, #0, 180f\n"
- "ld1 { v8.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "ld1 { v24.s }[2], [x21]\n"
- "ld1 { v28.s }[2], [x20]\n"
- "b 180f\n"
- "179:" // Height 6: Partial accumulate: partial_1_0
- "ldr s8, [x9, #0x0]\n"
- "ldr s12, [x24, #0x0]\n"
- "mov x25, #0x0\n"
- "ldr s16, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "ldr s24, [x21, #0x0]\n"
- "ldr s28, [x20, #0x0]\n"
- "180:" // Height 6: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 183f\n"
- "181:" // Height 6: full accumulate
- "ldr q8, [x9, #0x0]\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x20, #0x0]\n"
- "ldr q29, [x20, #0x10]\n"
- "ldr q30, [x20, #0x20]\n"
- "ldr q31, [x20, #0x30]\n"
- "b 183f\n"
- "182:" // Height 6: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "183:" // Height 6: setup done
- "mov x28, #0x0\n"
- "184:" // Height 6: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 185f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "ldr x23, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "ldr x21, [x20, #0x28]\n"
- "cbnz x28, 186f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "add x21, x21, x20\n"
- "b 186f\n"
- "185:" // Height 6: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "add x21, x22, x21\n"
- "186:" // Height 6: input setup done
- "cmp x27, #0x10\n"
- "blt 189f\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x25, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q2, [x24, #0x0]\n"
- "ldr q3, [x23, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q5, [x21, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "blt 188f\n"
- "187:" // Height 6: Multiply loop: Main loop head
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cc // sudot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f02f0d0 // sudot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0d4 // sudot v20.4s, v6.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f04f0d8 // sudot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f05f0dc // sudot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ed // sudot v13.4s, v7.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- ".inst 0x4f02f0f1 // sudot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0f5 // sudot v21.4s, v7.16b, v3.4b[0]\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f04f0f9 // sudot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f05f0fd // sudot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f00f0ca // sudot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ce // sudot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f02f0d2 // sudot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0d6 // sudot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- ".inst 0x4f04f0da // sudot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f05f0de // sudot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f00f0eb // sudot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ef // sudot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0f3 // sudot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0f7 // sudot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f04f0fb // sudot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f05f0ff // sudot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f20f0c8 // sudot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4f21f0cc // sudot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4f22f0d0 // sudot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4f23f0d4 // sudot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4f24f0d8 // sudot v24.4s, v6.16b, v4.4b[1]\n"
- ".inst 0x4f25f0dc // sudot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f20f0e9 // sudot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4f21f0ed // sudot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4f22f0f1 // sudot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4f23f0f5 // sudot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4f24f0f9 // sudot v25.4s, v7.16b, v4.4b[1]\n"
- ".inst 0x4f25f0fd // sudot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f20f0ca // sudot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4f21f0ce // sudot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4f22f0d2 // sudot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4f23f0d6 // sudot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4f24f0da // sudot v26.4s, v6.16b, v4.4b[1]\n"
- ".inst 0x4f25f0de // sudot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f20f0eb // sudot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4f21f0ef // sudot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4f22f0f3 // sudot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4f23f0f7 // sudot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4f24f0fb // sudot v27.4s, v7.16b, v4.4b[1]\n"
- ".inst 0x4f25f0ff // sudot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f00f8c8 // sudot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f01f8cc // sudot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f02f8d0 // sudot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f03f8d4 // sudot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f04f8d8 // sudot v24.4s, v6.16b, v4.4b[2]\n"
- ".inst 0x4f05f8dc // sudot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f00f8e9 // sudot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f01f8ed // sudot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f02f8f1 // sudot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f03f8f5 // sudot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f04f8f9 // sudot v25.4s, v7.16b, v4.4b[2]\n"
- ".inst 0x4f05f8fd // sudot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f00f8ca // sudot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f01f8ce // sudot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f02f8d2 // sudot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f03f8d6 // sudot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f04f8da // sudot v26.4s, v6.16b, v4.4b[2]\n"
- ".inst 0x4f05f8de // sudot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f00f8eb // sudot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f01f8ef // sudot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f02f8f3 // sudot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f03f8f7 // sudot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f04f8fb // sudot v27.4s, v7.16b, v4.4b[2]\n"
- ".inst 0x4f05f8ff // sudot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f20f8c8 // sudot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4f21f8cc // sudot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4f22f8d0 // sudot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4f23f8d4 // sudot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4f24f8d8 // sudot v24.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4f25f8dc // sudot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f20f8e9 // sudot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4f21f8ed // sudot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4f22f8f1 // sudot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4f23f8f5 // sudot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4f24f8f9 // sudot v25.4s, v7.16b, v4.4b[3]\n"
- ".inst 0x4f25f8fd // sudot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20f8ca // sudot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4f21f8ce // sudot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4f22f8d2 // sudot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4f23f8d6 // sudot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4f24f8da // sudot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4f25f8de // sudot v30.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f20f8eb // sudot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x26, #0x0]\n"
- ".inst 0x4f21f8ef // sudot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x25, #0x0]\n"
- ".inst 0x4f22f8f3 // sudot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x24, #0x0]\n"
- ".inst 0x4f23f8f7 // sudot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr q3, [x23, #0x0]\n"
- ".inst 0x4f24f8fb // sudot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr q4, [x22, #0x0]\n"
- ".inst 0x4f25f8ff // sudot v31.4s, v7.16b, v5.4b[3]\n"
- "ldr q5, [x21, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "bge 187b\n"
- "188:" // Height 6: Multiply loop: Single iteration only
- ".inst 0x4f00f0c8 // sudot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cc // sudot v12.4s, v6.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f02f0d0 // sudot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0d4 // sudot v20.4s, v6.16b, v3.4b[0]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f04f0d8 // sudot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f05f0dc // sudot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f00f0e9 // sudot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ed // sudot v13.4s, v7.16b, v1.4b[0]\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f02f0f1 // sudot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0f5 // sudot v21.4s, v7.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f04f0f9 // sudot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f05f0fd // sudot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f00f0ca // sudot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ce // sudot v14.4s, v6.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f02f0d2 // sudot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0d6 // sudot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- ".inst 0x4f04f0da // sudot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f05f0de // sudot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f00f0eb // sudot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ef // sudot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0f3 // sudot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0f7 // sudot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f04f0fb // sudot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f05f0ff // sudot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f20f0c8 // sudot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4f21f0cc // sudot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4f22f0d0 // sudot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4f23f0d4 // sudot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4f24f0d8 // sudot v24.4s, v6.16b, v4.4b[1]\n"
- ".inst 0x4f25f0dc // sudot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f20f0e9 // sudot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4f21f0ed // sudot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4f22f0f1 // sudot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4f23f0f5 // sudot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4f24f0f9 // sudot v25.4s, v7.16b, v4.4b[1]\n"
- ".inst 0x4f25f0fd // sudot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f20f0ca // sudot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4f21f0ce // sudot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4f22f0d2 // sudot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4f23f0d6 // sudot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4f24f0da // sudot v26.4s, v6.16b, v4.4b[1]\n"
- ".inst 0x4f25f0de // sudot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f20f0eb // sudot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4f21f0ef // sudot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4f22f0f3 // sudot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4f23f0f7 // sudot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4f24f0fb // sudot v27.4s, v7.16b, v4.4b[1]\n"
- ".inst 0x4f25f0ff // sudot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f00f8c8 // sudot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f01f8cc // sudot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f02f8d0 // sudot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f03f8d4 // sudot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f04f8d8 // sudot v24.4s, v6.16b, v4.4b[2]\n"
- ".inst 0x4f05f8dc // sudot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f00f8e9 // sudot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f01f8ed // sudot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f02f8f1 // sudot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f03f8f5 // sudot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f04f8f9 // sudot v25.4s, v7.16b, v4.4b[2]\n"
- ".inst 0x4f05f8fd // sudot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f00f8ca // sudot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f01f8ce // sudot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f02f8d2 // sudot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f03f8d6 // sudot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f04f8da // sudot v26.4s, v6.16b, v4.4b[2]\n"
- ".inst 0x4f05f8de // sudot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f00f8eb // sudot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f01f8ef // sudot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f02f8f3 // sudot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f03f8f7 // sudot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f04f8fb // sudot v27.4s, v7.16b, v4.4b[2]\n"
- ".inst 0x4f05f8ff // sudot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f20f8c8 // sudot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4f21f8cc // sudot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4f22f8d0 // sudot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4f23f8d4 // sudot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4f24f8d8 // sudot v24.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4f25f8dc // sudot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f20f8e9 // sudot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4f21f8ed // sudot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4f22f8f1 // sudot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4f23f8f5 // sudot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4f24f8f9 // sudot v25.4s, v7.16b, v4.4b[3]\n"
- ".inst 0x4f25f8fd // sudot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f20f8ca // sudot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4f21f8ce // sudot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4f22f8d2 // sudot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4f23f8d6 // sudot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4f24f8da // sudot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4f25f8de // sudot v30.4s, v6.16b, v5.4b[3]\n"
- ".inst 0x4f20f8eb // sudot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4f21f8ef // sudot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4f22f8f3 // sudot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4f23f8f7 // sudot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4f24f8fb // sudot v27.4s, v7.16b, v4.4b[3]\n"
- ".inst 0x4f25f8ff // sudot v31.4s, v7.16b, v5.4b[3]\n"
- "189:" // Height 6: Multiply loop: Main loop skip
- "cbz x27, 194f\n"
- "cmp x27, #0x4\n"
- "blt 191f\n"
- "190:" // Height 6: Multiply loop: Odd block loop
- "ldr s7, [x26], #0x4\n"
- "ldr s6, [x25], #0x4\n"
- "sub x27, x27, #0x4\n"
- "ldr s5, [x24], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "cmp x27, #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s2, [x21], #0x4\n"
- "ldr q1, [x10, #0x0]\n"
- "ldr q0, [x10, #0x10]\n"
- ".inst 0x4f07f028 // sudot v8.4s, v1.16b, v7.4b[0]\n"
- ".inst 0x4f06f02c // sudot v12.4s, v1.16b, v6.4b[0]\n"
- ".inst 0x4f05f030 // sudot v16.4s, v1.16b, v5.4b[0]\n"
- ".inst 0x4f04f034 // sudot v20.4s, v1.16b, v4.4b[0]\n"
- ".inst 0x4f03f038 // sudot v24.4s, v1.16b, v3.4b[0]\n"
- ".inst 0x4f02f03c // sudot v28.4s, v1.16b, v2.4b[0]\n"
- "ldr q1, [x10, #0x20]\n"
- ".inst 0x4f07f009 // sudot v9.4s, v0.16b, v7.4b[0]\n"
- ".inst 0x4f06f00d // sudot v13.4s, v0.16b, v6.4b[0]\n"
- ".inst 0x4f05f011 // sudot v17.4s, v0.16b, v5.4b[0]\n"
- ".inst 0x4f04f015 // sudot v21.4s, v0.16b, v4.4b[0]\n"
- ".inst 0x4f03f019 // sudot v25.4s, v0.16b, v3.4b[0]\n"
- ".inst 0x4f02f01d // sudot v29.4s, v0.16b, v2.4b[0]\n"
- "ldr q0, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f07f02a // sudot v10.4s, v1.16b, v7.4b[0]\n"
- ".inst 0x4f06f02e // sudot v14.4s, v1.16b, v6.4b[0]\n"
- ".inst 0x4f05f032 // sudot v18.4s, v1.16b, v5.4b[0]\n"
- ".inst 0x4f04f036 // sudot v22.4s, v1.16b, v4.4b[0]\n"
- ".inst 0x4f03f03a // sudot v26.4s, v1.16b, v3.4b[0]\n"
- ".inst 0x4f02f03e // sudot v30.4s, v1.16b, v2.4b[0]\n"
- ".inst 0x4f07f00b // sudot v11.4s, v0.16b, v7.4b[0]\n"
- ".inst 0x4f06f00f // sudot v15.4s, v0.16b, v6.4b[0]\n"
- ".inst 0x4f05f013 // sudot v19.4s, v0.16b, v5.4b[0]\n"
- ".inst 0x4f04f017 // sudot v23.4s, v0.16b, v4.4b[0]\n"
- ".inst 0x4f03f01b // sudot v27.4s, v0.16b, v3.4b[0]\n"
- ".inst 0x4f02f01f // sudot v31.4s, v0.16b, v2.4b[0]\n"
- "bge 190b\n"
- "191:" // Height 6: Multiply loop: Skip odd blocks
- "cbz x27, 194f\n"
- "tbz x27, #1, 192f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
- "ldr h2, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr h5, [x21], #0x2\n"
- "tbz x27, #0, 193f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x25]\n"
- "ld1 { v2.b }[2], [x24]\n"
- "ld1 { v3.b }[2], [x23]\n"
- "ld1 { v4.b }[2], [x22]\n"
- "ld1 { v5.b }[2], [x21]\n"
- "b 193f\n"
- "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x25, #0x0]\n"
- "ldr b2, [x24, #0x0]\n"
- "ldr b3, [x23, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
- "ldr b5, [x21, #0x0]\n"
- "193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x4f00f0e8 // sudot v8.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ec // sudot v12.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0f0 // sudot v16.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0f4 // sudot v20.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f04f0f8 // sudot v24.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f05f0fc // sudot v28.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4f00f0c9 // sudot v9.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cd // sudot v13.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f02f0d1 // sudot v17.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0d5 // sudot v21.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f04f0d9 // sudot v25.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f05f0dd // sudot v29.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f00f0ea // sudot v10.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f01f0ee // sudot v14.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f02f0f2 // sudot v18.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f03f0f6 // sudot v22.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f04f0fa // sudot v26.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f05f0fe // sudot v30.4s, v7.16b, v5.4b[0]\n"
- ".inst 0x4f00f0cb // sudot v11.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f01f0cf // sudot v15.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f02f0d3 // sudot v19.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f03f0d7 // sudot v23.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f04f0db // sudot v27.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f05f0df // sudot v31.4s, v6.16b, v5.4b[0]\n"
- "194:" // Height 6: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 184b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "bge 203f\n"
- "tbz x11, #3, 198f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v9.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "st1 { v25.4s }, [x21], #0x10\n"
- "st1 { v28.4s }, [x20], #0x10\n"
- "st1 { v29.4s }, [x20], #0x10\n"
- "tbz x11, #2, 196f\n"
- "st1 { v10.4s }, [x9], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
- "st1 { v26.4s }, [x21], #0x10\n"
- "st1 { v30.4s }, [x20], #0x10\n"
- "tbz x11, #1, 195f\n"
- "str d11, [x9], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
- "str d27, [x21], #0x8\n"
- "str d31, [x20], #0x8\n"
- "tbz x11, #0, 202f\n"
- "st1 { v11.s }[2], [x9]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x21]\n"
- "st1 { v31.s }[2], [x20]\n"
- "b 202f\n"
- "195:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x11, #0, 202f\n"
- "str s11, [x9, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
- "str s27, [x21, #0x0]\n"
- "str s31, [x20, #0x0]\n"
- "b 202f\n"
- "196:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x11, #1, 197f\n"
- "str d10, [x9], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
- "str d26, [x21], #0x8\n"
- "str d30, [x20], #0x8\n"
- "tbz x11, #0, 202f\n"
- "st1 { v10.s }[2], [x9]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
- "st1 { v26.s }[2], [x21]\n"
- "st1 { v30.s }[2], [x20]\n"
- "b 202f\n"
- "197:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x11, #0, 202f\n"
- "str s10, [x9, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
- "str s26, [x21, #0x0]\n"
- "str s30, [x20, #0x0]\n"
- "b 202f\n"
- "198:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x11, #2, 200f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "st1 { v28.4s }, [x20], #0x10\n"
- "tbz x11, #1, 199f\n"
- "str d9, [x9], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
- "str d25, [x21], #0x8\n"
- "str d29, [x20], #0x8\n"
- "tbz x11, #0, 202f\n"
- "st1 { v9.s }[2], [x9]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
- "st1 { v29.s }[2], [x20]\n"
- "b 202f\n"
- "199:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x11, #0, 202f\n"
- "str s9, [x9, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
- "str s25, [x21, #0x0]\n"
- "str s29, [x20, #0x0]\n"
- "b 202f\n"
- "200:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x11, #1, 201f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "str d28, [x20], #0x8\n"
- "tbz x11, #0, 202f\n"
- "st1 { v8.s }[2], [x9]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
- "st1 { v24.s }[2], [x21]\n"
- "st1 { v28.s }[2], [x20]\n"
- "b 202f\n"
- "201:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x9, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
- "str s24, [x21, #0x0]\n"
- "str s28, [x20, #0x0]\n"
- "202:" // Height 6: Partial direct writeback: Done
- "b 204f\n"
- "203:" // Height 6: Full writeback
- "str q8, [x9, #0x0]\n"
- "str q9, [x9, #0x10]\n"
- "str q10, [x9, #0x20]\n"
- "str q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "str q24, [x21, #0x0]\n"
- "str q25, [x21, #0x10]\n"
- "str q26, [x21, #0x20]\n"
- "str q27, [x21, #0x30]\n"
- "str q28, [x20, #0x0]\n"
- "str q29, [x20, #0x10]\n"
- "str q30, [x20, #0x20]\n"
- "str q31, [x20, #0x30]\n"
- "204:" // Height 6: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 172b\n"
- "subs %x[M], %x[M], #0x6\n"
- "beq 206f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 205f\n"
- "add x21, x21, #0x6\n"
- "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "b 1b\n"
- "205:" // Update direct input
- "mov x20, #0x6\n"
- "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
- "b 1b\n"
- "206:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_mmla_6x16.hpp
deleted file mode 100644
index feda7d707a..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_mmla_6x16.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#ifdef __aarch64__
-
-#include "../std_transforms_fixed.hpp"
-#include "../performance_parameters.hpp"
-// ARGLIST: parameter type list shared by the kernel declaration and the kern_type typedef further down; #undef'd at the end of this header.
-#define ARGLIST \
- unsigned int, const unsigned int *, \
- IndirectInputArg<uint8_t>, \
- size_t, size_t, \
- const int8_t *, \
- IndirectOutputArg<int32_t>, \
- const int32_t *, Activation, bool
-
-namespace arm_gemm
-{
-// Kernel entry point: declared here with the ARGLIST parameter types; the definition lives in a64_hybrid_u8s8s32_mmla_6x16/generic.cpp.
-void a64_hybrid_u8s8s32_mmla_6x16( ARGLIST );
-// Descriptor class for the kernel declared above: operand/result types, blocking parameters, performance figures and the kernel pointer.
-class cls_a64_hybrid_u8s8s32_mmla_6x16
-{
-public:
- typedef uint8_t lhs_operand_type; // matches ARGLIST's IndirectInputArg<uint8_t>
- typedef int8_t rhs_operand_type; // matches ARGLIST's const int8_t *
- typedef int32_t result_type; // matches ARGLIST's IndirectOutputArg<int32_t>
-// Function-pointer type with the same parameter list as the kernel declaration.
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters; the same 6, 16 and 8 are passed as template arguments to `transforms` below. */
- static constexpr unsigned int out_height()
- {
- return 6;
- }
-
- static unsigned int out_width() // NOTE(review): not constexpr, unlike out_height() and k_unroll() - confirm this is intentional
- {
- return 16;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 8;
- }
-
- static constexpr bool supports_accumulate() // the kernel in generic.cpp takes a `bool accumulate` argument
- {
- return true;
- }
-
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 8> transforms = {};
- template<typename T> // returns hard-coded figures selected by T and by ci->get_cpu_model(); their meaning/units are not visible here
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
- if (std::is_same<T, uint32_t>::value) { // NOTE(review): result_type above is int32_t, yet this tests uint32_t - confirm the intended T
- switch (ci->get_cpu_model()) {
- default:
- return { 55.05 };
- case CPUModel::A510:
- return { 30.34 };
- case CPUModel::V1:
- return { 83.77 };
- }
- }
-
- if (std::is_same<T, uint8_t>::value) { // this branch supplies three values per CPU model instead of one
- switch (ci->get_cpu_model()) {
- default:
- return { 55.31, 15.72, 0.62 };
- case CPUModel::A510:
- return { 33.64, 3.92, 0.48 };
- case CPUModel::V1:
- return { 63.94, 16.18, 0.83 };
- }
- }
-
- return { 1.0 }; // fallback for every other T
- }
-
- // Default to the generic kernel (the only implementation this header declares)
- kern_type kernel=a64_hybrid_u8s8s32_mmla_6x16;
- cls_a64_hybrid_u8s8s32_mmla_6x16(const CPUInfo *) // the CPUInfo argument is unnamed and unused; the body is empty
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_mmla_6x16/generic.cpp
deleted file mode 100644
index 32fa470d9e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8s32_mmla_6x16/generic.cpp
+++ /dev/null
@@ -1,3450 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-
-namespace arm_gemm {
-
-void a64_hybrid_u8s8s32_mmla_6x16 (
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
- size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
- const int32_t *, Activation, bool accumulate
-)
-{
- struct KernelArgs {
- unsigned int num_strings = {};
- const unsigned int *string_lengths = {};
- size_t N = {};
- const int8_t *B_ptr = {};
- size_t output_offset = {};
- size_t input_initial_col = {};
- size_t input_offset = {};
- void *output_ptr = {};
- } ka;
-
- unsigned long flags=0;
- void *input_ptr;
-
- if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
- ka.output_offset=output_arg.indirect.offset;
- flags |= 0x4;
- } else {
- ka.output_ptr=(void *)(output_arg.direct.base);
- ka.output_offset=output_arg.direct.stride;
- }
-
- if (A_arg.is_indirect) {
- input_ptr=(void *)(A_arg.indirect.ptr);
- ka.input_offset=A_arg.indirect.start_row;
- ka.input_initial_col=A_arg.indirect.start_col;
- flags |= 0x8;
- } else {
- assert(num_strings==1);
- input_ptr=(void *)(A_arg.direct.base);
- ka.input_offset=A_arg.direct.stride;
- }
- if (accumulate) {
- flags |= 0x1;
- }
- ka.num_strings = num_strings;
- ka.string_lengths = string_lengths;
- ka.N = N;
- ka.B_ptr = B_ptr;
- __asm__ __volatile__(
- "1:" // Row loop
- "cmp %x[M], #0x6\n"
- "bge 186f\n"
- "cmp %x[M], #0x4\n"
- "bgt 149f\n"
- "beq 112f\n"
- "cmp %x[M], #0x2\n"
- "bgt 75f\n"
- "beq 38f\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "2:" // Height 1: Column loop
- "tbz %x[flags], #0, 13f\n"
- "cmp x11, #0x10\n"
- "bge 11f\n"
- "tbz x11, #3, 6f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "tbz x11, #2, 4f\n"
- "ld1 { v11.4s }, [x9], #0x10\n"
- "tbz x11, #1, 3f\n"
- "ldr d16, [x9], #0x8\n"
- "mov x25, #0x38\n"
- "tbz x11, #0, 10f\n"
- "ld1 { v16.s }[2], [x9]\n"
- "b 10f\n"
- "3:" // Height 1: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 10f\n"
- "ldr s16, [x9, #0x0]\n"
- "b 10f\n"
- "4:" // Height 1: Partial accumulate: partial_2_8
- "tbz x11, #1, 5f\n"
- "ldr d11, [x9], #0x8\n"
- "mov x25, #0x28\n"
- "tbz x11, #0, 10f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "b 10f\n"
- "5:" // Height 1: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 10f\n"
- "ldr s11, [x9, #0x0]\n"
- "b 10f\n"
- "6:" // Height 1: Partial accumulate: partial_4_0
- "tbz x11, #2, 8f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "tbz x11, #1, 7f\n"
- "ldr d10, [x9], #0x8\n"
- "mov x25, #0x18\n"
- "tbz x11, #0, 10f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "b 10f\n"
- "7:" // Height 1: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 10f\n"
- "ldr s10, [x9, #0x0]\n"
- "b 10f\n"
- "8:" // Height 1: Partial accumulate: partial_2_0
- "tbz x11, #1, 9f\n"
- "ldr d9, [x9], #0x8\n"
- "mov x25, #0x8\n"
- "tbz x11, #0, 10f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "b 10f\n"
- "9:" // Height 1: Partial accumulate: partial_1_0
- "ldr s9, [x9, #0x0]\n"
- "mov x25, #0x0\n"
- "10:" // Height 1: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 12f\n"
- "11:" // Height 1: full accumulate
- "ldr q9, [x9, #0x0]\n"
- "ldr q10, [x9, #0x10]\n"
- "ldr q11, [x9, #0x20]\n"
- "ldr q16, [x9, #0x30]\n"
- "12:" // Height 1: MMLA fixup
- "zip1 v8.2d, v9.2d, v12.2d\n"
- "zip2 v12.2d, v9.2d, v12.2d\n"
- "zip1 v9.2d, v10.2d, v13.2d\n"
- "zip2 v13.2d, v10.2d, v13.2d\n"
- "zip1 v10.2d, v11.2d, v14.2d\n"
- "zip2 v14.2d, v11.2d, v14.2d\n"
- "zip1 v11.2d, v16.2d, v15.2d\n"
- "zip2 v15.2d, v16.2d, v15.2d\n"
- "b 14f\n"
- "13:" // Height 1: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "14:" // Height 1: setup done
- "mov x28, #0x0\n"
- "15:" // Height 1: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 16f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "cbnz x28, 17f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "b 17f\n"
- "16:" // Height 1: setup direct input
- "mov x26, %x[input_ptr]\n"
- "17:" // Height 1: input setup done
- "cmp x27, #0x10\n"
- "blt 20f\n"
- "ldr q1, [x26, #0x0]\n"
- "ldr q7, [x10, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q6, [x10, #0x10]\n"
- "blt 19f\n"
- "18:" // Height 1: Multiply loop: Main loop head
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "trn1 v19.2d, v1.2d, v20.2d\n"
- "trn2 v1.2d, v1.2d, v20.2d\n"
- ".inst 0x4e87ae68 // usmmla v8.4s, v19.16b, v7.16b\n"
- "ldr q18, [x10, #0x20]\n"
- ".inst 0x4e86ae6c // usmmla v12.4s, v19.16b, v6.16b\n"
- "ldr q17, [x10, #0x30]\n"
- ".inst 0x4e92ae69 // usmmla v9.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x40]\n"
- ".inst 0x4e91ae6d // usmmla v13.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x50]\n"
- ".inst 0x4e92ae6a // usmmla v10.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x60]\n"
- ".inst 0x4e91ae6e // usmmla v14.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x70]\n"
- ".inst 0x4e92ae6b // usmmla v11.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x80]\n"
- ".inst 0x4e91ae6f // usmmla v15.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x90]\n"
- ".inst 0x4e92ac28 // usmmla v8.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xa0]\n"
- ".inst 0x4e91ac2c // usmmla v12.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xb0]\n"
- ".inst 0x4e92ac29 // usmmla v9.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xc0]\n"
- ".inst 0x4e91ac2d // usmmla v13.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xd0]\n"
- ".inst 0x4e92ac2a // usmmla v10.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xe0]\n"
- ".inst 0x4e91ac2e // usmmla v14.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e92ac2b // usmmla v11.4s, v1.16b, v18.16b\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x4e91ac2f // usmmla v15.4s, v1.16b, v17.16b\n"
- "ldr q1, [x26, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "bge 18b\n"
- "19:" // Height 1: Multiply loop: Single iteration only
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "trn1 v19.2d, v1.2d, v17.2d\n"
- "trn2 v1.2d, v1.2d, v17.2d\n"
- ".inst 0x4e87ae68 // usmmla v8.4s, v19.16b, v7.16b\n"
- "ldr q18, [x10, #0x20]\n"
- ".inst 0x4e86ae6c // usmmla v12.4s, v19.16b, v6.16b\n"
- "ldr q17, [x10, #0x30]\n"
- ".inst 0x4e92ae69 // usmmla v9.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x40]\n"
- ".inst 0x4e91ae6d // usmmla v13.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x50]\n"
- ".inst 0x4e92ae6a // usmmla v10.4s, v19.16b, v18.16b\n"
- "ldr q20, [x10, #0x60]\n"
- ".inst 0x4e91ae6e // usmmla v14.4s, v19.16b, v17.16b\n"
- "ldr q18, [x10, #0x70]\n"
- ".inst 0x4e94ae6b // usmmla v11.4s, v19.16b, v20.16b\n"
- "ldr q17, [x10, #0x80]\n"
- ".inst 0x4e92ae6f // usmmla v15.4s, v19.16b, v18.16b\n"
- "ldr q20, [x10, #0x90]\n"
- ".inst 0x4e91ac28 // usmmla v8.4s, v1.16b, v17.16b\n"
- "ldr q18, [x10, #0xa0]\n"
- ".inst 0x4e94ac2c // usmmla v12.4s, v1.16b, v20.16b\n"
- "ldr q17, [x10, #0xb0]\n"
- ".inst 0x4e92ac29 // usmmla v9.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xc0]\n"
- ".inst 0x4e91ac2d // usmmla v13.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xd0]\n"
- ".inst 0x4e92ac2a // usmmla v10.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xe0]\n"
- ".inst 0x4e91ac2e // usmmla v14.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e92ac2b // usmmla v11.4s, v1.16b, v18.16b\n"
- ".inst 0x4e91ac2f // usmmla v15.4s, v1.16b, v17.16b\n"
- "20:" // Height 1: Multiply loop: Main loop skip
- "cbz x27, 27f\n"
- "cmp x27, #0x8\n"
- "blt 22f\n"
- "21:" // Height 1: Multiply loop: Odd block loop
- "ldr d19, [x26], #0x8\n"
- "ldr q20, [x10, #0x0]\n"
- "sub x27, x27, #0x8\n"
- "ldr q17, [x10, #0x10]\n"
- "cmp x27, #0x8\n"
- "trn1 v19.2d, v19.2d, v18.2d\n"
- ".inst 0x4e94ae68 // usmmla v8.4s, v19.16b, v20.16b\n"
- "ldr q18, [x10, #0x20]\n"
- ".inst 0x4e91ae6c // usmmla v12.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x30]\n"
- ".inst 0x4e92ae69 // usmmla v9.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x40]\n"
- ".inst 0x4e91ae6d // usmmla v13.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x50]\n"
- ".inst 0x4e92ae6a // usmmla v10.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x60]\n"
- ".inst 0x4e91ae6e // usmmla v14.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e92ae6b // usmmla v11.4s, v19.16b, v18.16b\n"
- ".inst 0x4e91ae6f // usmmla v15.4s, v19.16b, v17.16b\n"
- "bge 21b\n"
- "22:" // Height 1: Multiply loop: Skip odd blocks
- "cbz x27, 27f\n"
- "tbz x27, #2, 24f\n"
- "ldr s1, [x26], #0x4\n"
- "tbz x27, #1, 23f\n"
- "ld1 { v1.h }[2], [x26], #0x2\n"
- "tbz x27, #0, 26f\n"
- "ld1 { v1.b }[6], [x26]\n"
- "b 26f\n"
- "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 26f\n"
- "ld1 { v1.b }[4], [x26]\n"
- "b 26f\n"
- "24:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 25f\n"
- "ldr h1, [x26], #0x2\n"
- "tbz x27, #0, 26f\n"
- "ld1 { v1.b }[2], [x26]\n"
- "b 26f\n"
- "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b1, [x26, #0x0]\n"
- "26:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q24, [x10, #0x0]\n"
- "ldr q20, [x10, #0x10]\n"
- "trn1 v19.2d, v1.2d, v17.2d\n"
- ".inst 0x4e98ae68 // usmmla v8.4s, v19.16b, v24.16b\n"
- "ldr q17, [x10, #0x20]\n"
- ".inst 0x4e94ae6c // usmmla v12.4s, v19.16b, v20.16b\n"
- "ldr q0, [x10, #0x30]\n"
- ".inst 0x4e91ae69 // usmmla v9.4s, v19.16b, v17.16b\n"
- "ldr q20, [x10, #0x40]\n"
- ".inst 0x4e80ae6d // usmmla v13.4s, v19.16b, v0.16b\n"
- "ldr q17, [x10, #0x50]\n"
- ".inst 0x4e94ae6a // usmmla v10.4s, v19.16b, v20.16b\n"
- "ldr q18, [x10, #0x60]\n"
- ".inst 0x4e91ae6e // usmmla v14.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e92ae6b // usmmla v11.4s, v19.16b, v18.16b\n"
- ".inst 0x4e91ae6f // usmmla v15.4s, v19.16b, v17.16b\n"
- "27:" // Height 1: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 15b\n"
- "cmp x11, #0x10\n"
- "uzp1 v8.2d, v8.2d, v12.2d\n"
- "uzp1 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "uzp1 v10.2d, v10.2d, v14.2d\n"
- "uzp1 v11.2d, v11.2d, v15.2d\n"
- "bge 36f\n"
- "tbz x11, #3, 31f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "st1 { v9.4s }, [x9], #0x10\n"
- "tbz x11, #2, 29f\n"
- "st1 { v10.4s }, [x9], #0x10\n"
- "tbz x11, #1, 28f\n"
- "str d11, [x9], #0x8\n"
- "tbz x11, #0, 35f\n"
- "st1 { v11.s }[2], [x9]\n"
- "b 35f\n"
- "28:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x11, #0, 35f\n"
- "str s11, [x9, #0x0]\n"
- "b 35f\n"
- "29:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x11, #1, 30f\n"
- "str d10, [x9], #0x8\n"
- "tbz x11, #0, 35f\n"
- "st1 { v10.s }[2], [x9]\n"
- "b 35f\n"
- "30:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x11, #0, 35f\n"
- "str s10, [x9, #0x0]\n"
- "b 35f\n"
- "31:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x11, #2, 33f\n"
- "st1 { v8.4s }, [x9], #0x10\n"
- "tbz x11, #1, 32f\n"
- "str d9, [x9], #0x8\n"
- "tbz x11, #0, 35f\n"
- "st1 { v9.s }[2], [x9]\n"
- "b 35f\n"
- "32:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x11, #0, 35f\n"
- "str s9, [x9, #0x0]\n"
- "b 35f\n"
- "33:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x11, #1, 34f\n"
- "str d8, [x9], #0x8\n"
- "tbz x11, #0, 35f\n"
- "st1 { v8.s }[2], [x9]\n"
- "b 35f\n"
- "34:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x9, #0x0]\n"
- "35:" // Height 1: Partial direct writeback: Done
- "b 37f\n"
- "36:" // Height 1: Full writeback
- "str q8, [x9, #0x0]\n"
- "str q9, [x9, #0x10]\n"
- "str q10, [x9, #0x20]\n"
- "str q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "37:" // Height 1: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 2b\n"
- "b 224f\n"
- "38:" // Height 2
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "39:" // Height 2: Column loop
- "tbz %x[flags], #0, 50f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x24, x9, x20, LSL #2\n"
- "bge 48f\n"
- "tbz x11, #3, 43f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "tbz x11, #2, 41f\n"
- "ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "tbz x11, #1, 40f\n"
- "ldr d16, [x9], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "mov x25, #0x38\n"
- "tbz x11, #0, 47f\n"
- "ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "b 47f\n"
- "40:" // Height 2: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 47f\n"
- "ldr s16, [x9, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "b 47f\n"
- "41:" // Height 2: Partial accumulate: partial_2_8
- "tbz x11, #1, 42f\n"
- "ldr d11, [x9], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x25, #0x28\n"
- "tbz x11, #0, 47f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "b 47f\n"
- "42:" // Height 2: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 47f\n"
- "ldr s11, [x9, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "b 47f\n"
- "43:" // Height 2: Partial accumulate: partial_4_0
- "tbz x11, #2, 45f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "tbz x11, #1, 44f\n"
- "ldr d10, [x9], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "mov x25, #0x18\n"
- "tbz x11, #0, 47f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "b 47f\n"
- "44:" // Height 2: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 47f\n"
- "ldr s10, [x9, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "b 47f\n"
- "45:" // Height 2: Partial accumulate: partial_2_0
- "tbz x11, #1, 46f\n"
- "ldr d9, [x9], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x25, #0x8\n"
- "tbz x11, #0, 47f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "b 47f\n"
- "46:" // Height 2: Partial accumulate: partial_1_0
- "ldr s9, [x9, #0x0]\n"
- "ldr s12, [x24, #0x0]\n"
- "mov x25, #0x0\n"
- "47:" // Height 2: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 49f\n"
- "48:" // Height 2: full accumulate
- "ldr q9, [x9, #0x0]\n"
- "ldr q10, [x9, #0x10]\n"
- "ldr q11, [x9, #0x20]\n"
- "ldr q16, [x9, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "49:" // Height 2: MMLA fixup
- "zip1 v8.2d, v9.2d, v12.2d\n"
- "zip2 v12.2d, v9.2d, v12.2d\n"
- "zip1 v9.2d, v10.2d, v13.2d\n"
- "zip2 v13.2d, v10.2d, v13.2d\n"
- "zip1 v10.2d, v11.2d, v14.2d\n"
- "zip2 v14.2d, v11.2d, v14.2d\n"
- "zip1 v11.2d, v16.2d, v15.2d\n"
- "zip2 v15.2d, v16.2d, v15.2d\n"
- "b 51f\n"
- "50:" // Height 2: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "51:" // Height 2: setup done
- "mov x28, #0x0\n"
- "52:" // Height 2: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 53f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "cbnz x28, 54f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "b 54f\n"
- "53:" // Height 2: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "54:" // Height 2: input setup done
- "cmp x27, #0x10\n"
- "blt 57f\n"
- "ldr q1, [x26, #0x0]\n"
- "ldr q2, [x25, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "blt 56f\n"
- "55:" // Height 2: Multiply loop: Main loop head
- "trn1 v19.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q2, [x25, #0x0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87ae68 // usmmla v8.4s, v19.16b, v7.16b\n"
- "ldr q18, [x10, #0x20]\n"
- ".inst 0x4e86ae6c // usmmla v12.4s, v19.16b, v6.16b\n"
- "ldr q17, [x10, #0x30]\n"
- ".inst 0x4e92ae69 // usmmla v9.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x40]\n"
- ".inst 0x4e91ae6d // usmmla v13.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x50]\n"
- ".inst 0x4e92ae6a // usmmla v10.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x60]\n"
- ".inst 0x4e91ae6e // usmmla v14.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x70]\n"
- ".inst 0x4e92ae6b // usmmla v11.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x80]\n"
- ".inst 0x4e91ae6f // usmmla v15.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x90]\n"
- ".inst 0x4e92ac28 // usmmla v8.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xa0]\n"
- ".inst 0x4e91ac2c // usmmla v12.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xb0]\n"
- ".inst 0x4e92ac29 // usmmla v9.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xc0]\n"
- ".inst 0x4e91ac2d // usmmla v13.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xd0]\n"
- ".inst 0x4e92ac2a // usmmla v10.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xe0]\n"
- ".inst 0x4e91ac2e // usmmla v14.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e92ac2b // usmmla v11.4s, v1.16b, v18.16b\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x4e91ac2f // usmmla v15.4s, v1.16b, v17.16b\n"
- "ldr q1, [x26, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "bge 55b\n"
- "56:" // Height 2: Multiply loop: Single iteration only
- "trn1 v19.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87ae68 // usmmla v8.4s, v19.16b, v7.16b\n"
- "ldr q18, [x10, #0x20]\n"
- ".inst 0x4e86ae6c // usmmla v12.4s, v19.16b, v6.16b\n"
- "ldr q17, [x10, #0x30]\n"
- ".inst 0x4e92ae69 // usmmla v9.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x40]\n"
- ".inst 0x4e91ae6d // usmmla v13.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x50]\n"
- ".inst 0x4e92ae6a // usmmla v10.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x60]\n"
- ".inst 0x4e91ae6e // usmmla v14.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x70]\n"
- ".inst 0x4e92ae6b // usmmla v11.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x80]\n"
- ".inst 0x4e91ae6f // usmmla v15.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x90]\n"
- ".inst 0x4e92ac28 // usmmla v8.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xa0]\n"
- ".inst 0x4e91ac2c // usmmla v12.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xb0]\n"
- ".inst 0x4e92ac29 // usmmla v9.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xc0]\n"
- ".inst 0x4e91ac2d // usmmla v13.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xd0]\n"
- ".inst 0x4e92ac2a // usmmla v10.4s, v1.16b, v18.16b\n"
- "ldr q18, [x10, #0xe0]\n"
- ".inst 0x4e91ac2e // usmmla v14.4s, v1.16b, v17.16b\n"
- "ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e92ac2b // usmmla v11.4s, v1.16b, v18.16b\n"
- ".inst 0x4e91ac2f // usmmla v15.4s, v1.16b, v17.16b\n"
- "57:" // Height 2: Multiply loop: Main loop skip
- "cbz x27, 64f\n"
- "cmp x27, #0x8\n"
- "blt 59f\n"
- "58:" // Height 2: Multiply loop: Odd block loop
- "ldr d20, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "sub x27, x27, #0x8\n"
- "ldr q18, [x10, #0x0]\n"
- "ldr q17, [x10, #0x10]\n"
- "cmp x27, #0x8\n"
- "trn1 v22.2d, v20.2d, v19.2d\n"
- ".inst 0x4e92aec8 // usmmla v8.4s, v22.16b, v18.16b\n"
- "ldr q2, [x10, #0x20]\n"
- ".inst 0x4e91aecc // usmmla v12.4s, v22.16b, v17.16b\n"
- "ldr q17, [x10, #0x30]\n"
- ".inst 0x4e82aec9 // usmmla v9.4s, v22.16b, v2.16b\n"
- "ldr q18, [x10, #0x40]\n"
- ".inst 0x4e91aecd // usmmla v13.4s, v22.16b, v17.16b\n"
- "ldr q17, [x10, #0x50]\n"
- ".inst 0x4e92aeca // usmmla v10.4s, v22.16b, v18.16b\n"
- "ldr q18, [x10, #0x60]\n"
- ".inst 0x4e91aece // usmmla v14.4s, v22.16b, v17.16b\n"
- "ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e92aecb // usmmla v11.4s, v22.16b, v18.16b\n"
- ".inst 0x4e91aecf // usmmla v15.4s, v22.16b, v17.16b\n"
- "bge 58b\n"
- "59:" // Height 2: Multiply loop: Skip odd blocks
- "cbz x27, 64f\n"
- "tbz x27, #2, 61f\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "tbz x27, #1, 60f\n"
- "ld1 { v1.h }[2], [x26], #0x2\n"
- "ld1 { v2.h }[2], [x25], #0x2\n"
- "tbz x27, #0, 63f\n"
- "ld1 { v1.b }[6], [x26]\n"
- "ld1 { v2.b }[6], [x25]\n"
- "b 63f\n"
- "60:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 63f\n"
- "ld1 { v1.b }[4], [x26]\n"
- "ld1 { v2.b }[4], [x25]\n"
- "b 63f\n"
- "61:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 62f\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h2, [x25], #0x2\n"
- "tbz x27, #0, 63f\n"
- "ld1 { v1.b }[2], [x26]\n"
- "ld1 { v2.b }[2], [x25]\n"
- "b 63f\n"
- "62:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b1, [x26, #0x0]\n"
- "ldr b2, [x25, #0x0]\n"
- "63:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q18, [x10, #0x0]\n"
- "ldr q17, [x10, #0x10]\n"
- "trn1 v19.2d, v1.2d, v2.2d\n"
- ".inst 0x4e92ae68 // usmmla v8.4s, v19.16b, v18.16b\n"
- "ldr q5, [x10, #0x20]\n"
- ".inst 0x4e91ae6c // usmmla v12.4s, v19.16b, v17.16b\n"
- "ldr q21, [x10, #0x30]\n"
- ".inst 0x4e85ae69 // usmmla v9.4s, v19.16b, v5.16b\n"
- "ldr q18, [x10, #0x40]\n"
- ".inst 0x4e95ae6d // usmmla v13.4s, v19.16b, v21.16b\n"
- "ldr q17, [x10, #0x50]\n"
- ".inst 0x4e92ae6a // usmmla v10.4s, v19.16b, v18.16b\n"
- "ldr q18, [x10, #0x60]\n"
- ".inst 0x4e91ae6e // usmmla v14.4s, v19.16b, v17.16b\n"
- "ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e92ae6b // usmmla v11.4s, v19.16b, v18.16b\n"
- ".inst 0x4e91ae6f // usmmla v15.4s, v19.16b, v17.16b\n"
- "64:" // Height 2: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 52b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "uzp1 v7.2d, v8.2d, v12.2d\n"
- "uzp2 v8.2d, v8.2d, v12.2d\n"
- "uzp1 v12.2d, v9.2d, v13.2d\n"
- "uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "uzp1 v13.2d, v10.2d, v14.2d\n"
- "uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "uzp1 v14.2d, v11.2d, v15.2d\n"
- "uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "bge 73f\n"
- "tbz x11, #3, 68f\n"
- "st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "tbz x11, #2, 66f\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "tbz x11, #1, 65f\n"
- "str d14, [x9], #0x8\n"
- "str d11, [x24], #0x8\n"
- "tbz x11, #0, 72f\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x24]\n"
- "b 72f\n"
- "65:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x11, #0, 72f\n"
- "str s14, [x9, #0x0]\n"
- "str s11, [x24, #0x0]\n"
- "b 72f\n"
- "66:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x11, #1, 67f\n"
- "str d13, [x9], #0x8\n"
- "str d10, [x24], #0x8\n"
- "tbz x11, #0, 72f\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x24]\n"
- "b 72f\n"
- "67:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x11, #0, 72f\n"
- "str s13, [x9, #0x0]\n"
- "str s10, [x24, #0x0]\n"
- "b 72f\n"
- "68:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x11, #2, 70f\n"
- "st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "tbz x11, #1, 69f\n"
- "str d12, [x9], #0x8\n"
- "str d9, [x24], #0x8\n"
- "tbz x11, #0, 72f\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x24]\n"
- "b 72f\n"
- "69:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x11, #0, 72f\n"
- "str s12, [x9, #0x0]\n"
- "str s9, [x24, #0x0]\n"
- "b 72f\n"
- "70:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x11, #1, 71f\n"
- "str d7, [x9], #0x8\n"
- "str d8, [x24], #0x8\n"
- "tbz x11, #0, 72f\n"
- "st1 { v7.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x24]\n"
- "b 72f\n"
- "71:" // Height 2: Partial direct writeback: partial_1_0
- "str s7, [x9, #0x0]\n"
- "str s8, [x24, #0x0]\n"
- "72:" // Height 2: Partial direct writeback: Done
- "b 74f\n"
- "73:" // Height 2: Full writeback
- "str q7, [x9, #0x0]\n"
- "str q12, [x9, #0x10]\n"
- "str q13, [x9, #0x20]\n"
- "str q14, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "74:" // Height 2: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 39b\n"
- "b 224f\n"
- "75:" // Height 3
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "76:" // Height 3: Column loop
- "tbz %x[flags], #0, 87f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "bge 85f\n"
- "tbz x11, #3, 80f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "tbz x11, #2, 78f\n"
- "ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v19.4s }, [x23], #0x10\n"
- "tbz x11, #1, 77f\n"
- "ldr d16, [x9], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "mov x25, #0x38\n"
- "ldr d24, [x23], #0x8\n"
- "tbz x11, #0, 84f\n"
- "ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "b 84f\n"
- "77:" // Height 3: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 84f\n"
- "ldr s16, [x9, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "b 84f\n"
- "78:" // Height 3: Partial accumulate: partial_2_8
- "tbz x11, #1, 79f\n"
- "ldr d11, [x9], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x25, #0x28\n"
- "ldr d19, [x23], #0x8\n"
- "tbz x11, #0, 84f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "b 84f\n"
- "79:" // Height 3: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 84f\n"
- "ldr s11, [x9, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "b 84f\n"
- "80:" // Height 3: Partial accumulate: partial_4_0
- "tbz x11, #2, 82f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "tbz x11, #1, 81f\n"
- "ldr d10, [x9], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "mov x25, #0x18\n"
- "ldr d18, [x23], #0x8\n"
- "tbz x11, #0, 84f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "b 84f\n"
- "81:" // Height 3: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 84f\n"
- "ldr s10, [x9, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "b 84f\n"
- "82:" // Height 3: Partial accumulate: partial_2_0
- "tbz x11, #1, 83f\n"
- "ldr d9, [x9], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x25, #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "tbz x11, #0, 84f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "b 84f\n"
- "83:" // Height 3: Partial accumulate: partial_1_0
- "ldr s9, [x9, #0x0]\n"
- "ldr s12, [x24, #0x0]\n"
- "mov x25, #0x0\n"
- "ldr s17, [x23, #0x0]\n"
- "84:" // Height 3: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 86f\n"
- "85:" // Height 3: full accumulate
- "ldr q9, [x9, #0x0]\n"
- "ldr q10, [x9, #0x10]\n"
- "ldr q11, [x9, #0x20]\n"
- "ldr q16, [x9, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q17, [x23, #0x0]\n"
- "ldr q18, [x23, #0x10]\n"
- "ldr q19, [x23, #0x20]\n"
- "ldr q24, [x23, #0x30]\n"
- "86:" // Height 3: MMLA fixup
- "zip1 v8.2d, v9.2d, v12.2d\n"
- "zip2 v12.2d, v9.2d, v12.2d\n"
- "zip1 v9.2d, v10.2d, v13.2d\n"
- "zip2 v13.2d, v10.2d, v13.2d\n"
- "zip1 v10.2d, v11.2d, v14.2d\n"
- "zip2 v14.2d, v11.2d, v14.2d\n"
- "zip1 v11.2d, v16.2d, v15.2d\n"
- "zip2 v15.2d, v16.2d, v15.2d\n"
- "zip1 v16.2d, v17.2d, v20.2d\n"
- "zip2 v20.2d, v17.2d, v20.2d\n"
- "zip1 v17.2d, v18.2d, v21.2d\n"
- "zip2 v21.2d, v18.2d, v21.2d\n"
- "zip1 v18.2d, v19.2d, v22.2d\n"
- "zip2 v22.2d, v19.2d, v22.2d\n"
- "zip1 v19.2d, v24.2d, v23.2d\n"
- "zip2 v23.2d, v24.2d, v23.2d\n"
- "b 88f\n"
- "87:" // Height 3: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "88:" // Height 3: setup done
- "mov x28, #0x0\n"
- "89:" // Height 3: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 90f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "cbnz x28, 91f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "b 91f\n"
- "90:" // Height 3: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "91:" // Height 3: input setup done
- "cmp x27, #0x10\n"
- "blt 94f\n"
- "ldr q1, [x26, #0x0]\n"
- "ldr q2, [x25, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "blt 93f\n"
- "92:" // Height 3: Multiply loop: Main loop head
- "trn1 v28.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
- ".inst 0x4e87af88 // usmmla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x4e86af8c // usmmla v12.4s, v28.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "trn2 v3.2d, v3.2d, v25.2d\n"
- ".inst 0x4e87af70 // usmmla v16.4s, v27.16b, v7.16b\n"
- "ldr q26, [x10, #0x20]\n"
- ".inst 0x4e86af74 // usmmla v20.4s, v27.16b, v6.16b\n"
- "ldr q25, [x10, #0x30]\n"
- ".inst 0x4e9aaf89 // usmmla v9.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf71 // usmmla v17.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x40]\n"
- ".inst 0x4e99af8d // usmmla v13.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af75 // usmmla v21.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x50]\n"
- ".inst 0x4e9aaf8a // usmmla v10.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf72 // usmmla v18.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x60]\n"
- ".inst 0x4e99af8e // usmmla v14.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af76 // usmmla v22.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x70]\n"
- ".inst 0x4e9aaf8b // usmmla v11.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf73 // usmmla v19.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x80]\n"
- ".inst 0x4e99af8f // usmmla v15.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af77 // usmmla v23.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x90]\n"
- "ldr q2, [x25, #0x0]\n"
- ".inst 0x4e9aac28 // usmmla v8.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac70 // usmmla v16.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xa0]\n"
- ".inst 0x4e99ac2c // usmmla v12.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac74 // usmmla v20.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xb0]\n"
- ".inst 0x4e9aac29 // usmmla v9.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac71 // usmmla v17.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xc0]\n"
- ".inst 0x4e99ac2d // usmmla v13.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac75 // usmmla v21.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xd0]\n"
- ".inst 0x4e9aac2a // usmmla v10.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac72 // usmmla v18.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xe0]\n"
- ".inst 0x4e99ac2e // usmmla v14.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac76 // usmmla v22.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e9aac2b // usmmla v11.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac73 // usmmla v19.4s, v3.16b, v26.16b\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x4e99ac2f // usmmla v15.4s, v1.16b, v25.16b\n"
- "ldr q1, [x26, #0x0]\n"
- ".inst 0x4e99ac77 // usmmla v23.4s, v3.16b, v25.16b\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "bge 92b\n"
- "93:" // Height 3: Multiply loop: Single iteration only
- "trn1 v28.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
- ".inst 0x4e87af88 // usmmla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x4e86af8c // usmmla v12.4s, v28.16b, v6.16b\n"
- "trn2 v3.2d, v3.2d, v25.2d\n"
- ".inst 0x4e87af70 // usmmla v16.4s, v27.16b, v7.16b\n"
- "ldr q26, [x10, #0x20]\n"
- ".inst 0x4e86af74 // usmmla v20.4s, v27.16b, v6.16b\n"
- "ldr q25, [x10, #0x30]\n"
- ".inst 0x4e9aaf89 // usmmla v9.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf71 // usmmla v17.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x40]\n"
- ".inst 0x4e99af8d // usmmla v13.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af75 // usmmla v21.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x50]\n"
- ".inst 0x4e9aaf8a // usmmla v10.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf72 // usmmla v18.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x60]\n"
- ".inst 0x4e99af8e // usmmla v14.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af76 // usmmla v22.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x70]\n"
- ".inst 0x4e9aaf8b // usmmla v11.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf73 // usmmla v19.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x80]\n"
- ".inst 0x4e99af8f // usmmla v15.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af77 // usmmla v23.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x90]\n"
- ".inst 0x4e9aac28 // usmmla v8.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac70 // usmmla v16.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xa0]\n"
- ".inst 0x4e99ac2c // usmmla v12.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac74 // usmmla v20.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xb0]\n"
- ".inst 0x4e9aac29 // usmmla v9.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac71 // usmmla v17.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xc0]\n"
- ".inst 0x4e99ac2d // usmmla v13.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac75 // usmmla v21.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xd0]\n"
- ".inst 0x4e9aac2a // usmmla v10.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac72 // usmmla v18.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xe0]\n"
- ".inst 0x4e99ac2e // usmmla v14.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac76 // usmmla v22.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e9aac2b // usmmla v11.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac73 // usmmla v19.4s, v3.16b, v26.16b\n"
- ".inst 0x4e99ac2f // usmmla v15.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac77 // usmmla v23.4s, v3.16b, v25.16b\n"
- "94:" // Height 3: Multiply loop: Main loop skip
- "cbz x27, 101f\n"
- "cmp x27, #0x8\n"
- "blt 96f\n"
- "95:" // Height 3: Multiply loop: Odd block loop
- "ldr d30, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "sub x27, x27, #0x8\n"
- "ldr d27, [x24], #0x8\n"
- "ldr q26, [x10, #0x0]\n"
- "cmp x27, #0x8\n"
- "ldr q25, [x10, #0x10]\n"
- "trn1 v28.2d, v30.2d, v28.2d\n"
- "trn1 v27.2d, v27.2d, v29.2d\n"
- ".inst 0x4e9aaf88 // usmmla v8.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf70 // usmmla v16.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x20]\n"
- ".inst 0x4e99af8c // usmmla v12.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af74 // usmmla v20.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x30]\n"
- ".inst 0x4e9aaf89 // usmmla v9.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf71 // usmmla v17.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x40]\n"
- ".inst 0x4e99af8d // usmmla v13.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af75 // usmmla v21.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x50]\n"
- ".inst 0x4e9aaf8a // usmmla v10.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf72 // usmmla v18.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x60]\n"
- ".inst 0x4e99af8e // usmmla v14.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af76 // usmmla v22.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e9aaf8b // usmmla v11.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf73 // usmmla v19.4s, v27.16b, v26.16b\n"
- ".inst 0x4e99af8f // usmmla v15.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af77 // usmmla v23.4s, v27.16b, v25.16b\n"
- "bge 95b\n"
- "96:" // Height 3: Multiply loop: Skip odd blocks
- "cbz x27, 101f\n"
- "tbz x27, #2, 98f\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "tbz x27, #1, 97f\n"
- "ld1 { v1.h }[2], [x26], #0x2\n"
- "ld1 { v2.h }[2], [x25], #0x2\n"
- "ld1 { v3.h }[2], [x24], #0x2\n"
- "tbz x27, #0, 100f\n"
- "ld1 { v1.b }[6], [x26]\n"
- "ld1 { v2.b }[6], [x25]\n"
- "ld1 { v3.b }[6], [x24]\n"
- "b 100f\n"
- "97:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 100f\n"
- "ld1 { v1.b }[4], [x26]\n"
- "ld1 { v2.b }[4], [x25]\n"
- "ld1 { v3.b }[4], [x24]\n"
- "b 100f\n"
- "98:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 99f\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h2, [x25], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "tbz x27, #0, 100f\n"
- "ld1 { v1.b }[2], [x26]\n"
- "ld1 { v2.b }[2], [x25]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "b 100f\n"
- "99:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b1, [x26, #0x0]\n"
- "ldr b2, [x25, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "100:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q26, [x10, #0x0]\n"
- "ldr q29, [x10, #0x10]\n"
- "trn1 v28.2d, v1.2d, v2.2d\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
- ".inst 0x4e9aaf88 // usmmla v8.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9daf8c // usmmla v12.4s, v28.16b, v29.16b\n"
- ".inst 0x4e9aaf70 // usmmla v16.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x20]\n"
- ".inst 0x4e9daf74 // usmmla v20.4s, v27.16b, v29.16b\n"
- "ldr q25, [x10, #0x30]\n"
- ".inst 0x4e9aaf89 // usmmla v9.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf71 // usmmla v17.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x40]\n"
- ".inst 0x4e99af8d // usmmla v13.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af75 // usmmla v21.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x50]\n"
- ".inst 0x4e9aaf8a // usmmla v10.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf72 // usmmla v18.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x60]\n"
- ".inst 0x4e99af8e // usmmla v14.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af76 // usmmla v22.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e9aaf8b // usmmla v11.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf73 // usmmla v19.4s, v27.16b, v26.16b\n"
- ".inst 0x4e99af8f // usmmla v15.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af77 // usmmla v23.4s, v27.16b, v25.16b\n"
- "101:" // Height 3: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 89b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "uzp1 v7.2d, v8.2d, v12.2d\n"
- "uzp2 v8.2d, v8.2d, v12.2d\n"
- "uzp1 v12.2d, v9.2d, v13.2d\n"
- "uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "uzp1 v13.2d, v10.2d, v14.2d\n"
- "uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "uzp1 v14.2d, v11.2d, v15.2d\n"
- "uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "uzp1 v16.2d, v16.2d, v20.2d\n"
- "uzp1 v17.2d, v17.2d, v21.2d\n"
- "uzp1 v18.2d, v18.2d, v22.2d\n"
- "uzp1 v19.2d, v19.2d, v23.2d\n"
- "bge 110f\n"
- "tbz x11, #3, 105f\n"
- "st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "tbz x11, #2, 103f\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "tbz x11, #1, 102f\n"
- "str d14, [x9], #0x8\n"
- "str d11, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "tbz x11, #0, 109f\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "b 109f\n"
- "102:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x11, #0, 109f\n"
- "str s14, [x9, #0x0]\n"
- "str s11, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "b 109f\n"
- "103:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x11, #1, 104f\n"
- "str d13, [x9], #0x8\n"
- "str d10, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "tbz x11, #0, 109f\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "b 109f\n"
- "104:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x11, #0, 109f\n"
- "str s13, [x9, #0x0]\n"
- "str s10, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "b 109f\n"
- "105:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x11, #2, 107f\n"
- "st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "tbz x11, #1, 106f\n"
- "str d12, [x9], #0x8\n"
- "str d9, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "tbz x11, #0, 109f\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "b 109f\n"
- "106:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x11, #0, 109f\n"
- "str s12, [x9, #0x0]\n"
- "str s9, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "b 109f\n"
- "107:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x11, #1, 108f\n"
- "str d7, [x9], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "tbz x11, #0, 109f\n"
- "st1 { v7.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "b 109f\n"
- "108:" // Height 3: Partial direct writeback: partial_1_0
- "str s7, [x9, #0x0]\n"
- "str s8, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "109:" // Height 3: Partial direct writeback: Done
- "b 111f\n"
- "110:" // Height 3: Full writeback
- "str q7, [x9, #0x0]\n"
- "str q12, [x9, #0x10]\n"
- "str q13, [x9, #0x20]\n"
- "str q14, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "111:" // Height 3: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 76b\n"
- "b 224f\n"
- "112:" // Height 4
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "113:" // Height 4: Column loop
- "tbz %x[flags], #0, 124f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "bge 122f\n"
- "tbz x11, #3, 117f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
- "tbz x11, #2, 115f\n"
- "ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v19.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
- "tbz x11, #1, 114f\n"
- "ldr d16, [x9], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "mov x25, #0x38\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "tbz x11, #0, 121f\n"
- "ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "b 121f\n"
- "114:" // Height 4: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 121f\n"
- "ldr s16, [x9, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "b 121f\n"
- "115:" // Height 4: Partial accumulate: partial_2_8
- "tbz x11, #1, 116f\n"
- "ldr d11, [x9], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x25, #0x28\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "tbz x11, #0, 121f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
- "b 121f\n"
- "116:" // Height 4: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 121f\n"
- "ldr s11, [x9, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
- "b 121f\n"
- "117:" // Height 4: Partial accumulate: partial_4_0
- "tbz x11, #2, 119f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "tbz x11, #1, 118f\n"
- "ldr d10, [x9], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "mov x25, #0x18\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "tbz x11, #0, 121f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "b 121f\n"
- "118:" // Height 4: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 121f\n"
- "ldr s10, [x9, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "b 121f\n"
- "119:" // Height 4: Partial accumulate: partial_2_0
- "tbz x11, #1, 120f\n"
- "ldr d9, [x9], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x25, #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "tbz x11, #0, 121f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "b 121f\n"
- "120:" // Height 4: Partial accumulate: partial_1_0
- "ldr s9, [x9, #0x0]\n"
- "ldr s12, [x24, #0x0]\n"
- "mov x25, #0x0\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "121:" // Height 4: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 123f\n"
- "122:" // Height 4: full accumulate
- "ldr q9, [x9, #0x0]\n"
- "ldr q10, [x9, #0x10]\n"
- "ldr q11, [x9, #0x20]\n"
- "ldr q16, [x9, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q17, [x23, #0x0]\n"
- "ldr q18, [x23, #0x10]\n"
- "ldr q19, [x23, #0x20]\n"
- "ldr q24, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "123:" // Height 4: MMLA fixup
- "zip1 v8.2d, v9.2d, v12.2d\n"
- "zip2 v12.2d, v9.2d, v12.2d\n"
- "zip1 v9.2d, v10.2d, v13.2d\n"
- "zip2 v13.2d, v10.2d, v13.2d\n"
- "zip1 v10.2d, v11.2d, v14.2d\n"
- "zip2 v14.2d, v11.2d, v14.2d\n"
- "zip1 v11.2d, v16.2d, v15.2d\n"
- "zip2 v15.2d, v16.2d, v15.2d\n"
- "zip1 v16.2d, v17.2d, v20.2d\n"
- "zip2 v20.2d, v17.2d, v20.2d\n"
- "zip1 v17.2d, v18.2d, v21.2d\n"
- "zip2 v21.2d, v18.2d, v21.2d\n"
- "zip1 v18.2d, v19.2d, v22.2d\n"
- "zip2 v22.2d, v19.2d, v22.2d\n"
- "zip1 v19.2d, v24.2d, v23.2d\n"
- "zip2 v23.2d, v24.2d, v23.2d\n"
- "b 125f\n"
- "124:" // Height 4: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "125:" // Height 4: setup done
- "mov x28, #0x0\n"
- "126:" // Height 4: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 127f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "ldr x23, [x20, #0x18]\n"
- "cbnz x28, 128f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "b 128f\n"
- "127:" // Height 4: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "add x23, x24, x21\n"
- "128:" // Height 4: input setup done
- "cmp x27, #0x10\n"
- "blt 131f\n"
- "ldr q1, [x26, #0x0]\n"
- "ldr q2, [x25, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "blt 130f\n"
- "129:" // Height 4: Multiply loop: Main loop head
- "trn1 v28.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "trn1 v27.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "ldr q4, [x23, #0x0]\n"
- ".inst 0x4e87af88 // usmmla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x4e86af8c // usmmla v12.4s, v28.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e87af70 // usmmla v16.4s, v27.16b, v7.16b\n"
- "ldr q26, [x10, #0x20]\n"
- ".inst 0x4e86af74 // usmmla v20.4s, v27.16b, v6.16b\n"
- "ldr q25, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4e9aaf89 // usmmla v9.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf71 // usmmla v17.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x40]\n"
- ".inst 0x4e99af8d // usmmla v13.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af75 // usmmla v21.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x50]\n"
- ".inst 0x4e9aaf8a // usmmla v10.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf72 // usmmla v18.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x60]\n"
- ".inst 0x4e99af8e // usmmla v14.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af76 // usmmla v22.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x70]\n"
- ".inst 0x4e9aaf8b // usmmla v11.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf73 // usmmla v19.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x80]\n"
- ".inst 0x4e99af8f // usmmla v15.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af77 // usmmla v23.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x90]\n"
- "ldr q2, [x25, #0x0]\n"
- ".inst 0x4e9aac28 // usmmla v8.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac70 // usmmla v16.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xa0]\n"
- ".inst 0x4e99ac2c // usmmla v12.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac74 // usmmla v20.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xb0]\n"
- ".inst 0x4e9aac29 // usmmla v9.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac71 // usmmla v17.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xc0]\n"
- ".inst 0x4e99ac2d // usmmla v13.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac75 // usmmla v21.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xd0]\n"
- ".inst 0x4e9aac2a // usmmla v10.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac72 // usmmla v18.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xe0]\n"
- ".inst 0x4e99ac2e // usmmla v14.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac76 // usmmla v22.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e9aac2b // usmmla v11.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac73 // usmmla v19.4s, v3.16b, v26.16b\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x4e99ac2f // usmmla v15.4s, v1.16b, v25.16b\n"
- "ldr q1, [x26, #0x0]\n"
- ".inst 0x4e99ac77 // usmmla v23.4s, v3.16b, v25.16b\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "bge 129b\n"
- "130:" // Height 4: Multiply loop: Single iteration only
- "trn1 v28.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "trn1 v27.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x27, x27, #0x10\n"
- ".inst 0x4e87af88 // usmmla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x4e86af8c // usmmla v12.4s, v28.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4e87af70 // usmmla v16.4s, v27.16b, v7.16b\n"
- "ldr q26, [x10, #0x20]\n"
- ".inst 0x4e86af74 // usmmla v20.4s, v27.16b, v6.16b\n"
- "ldr q25, [x10, #0x30]\n"
- ".inst 0x4e9aaf89 // usmmla v9.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf71 // usmmla v17.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x40]\n"
- ".inst 0x4e99af8d // usmmla v13.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af75 // usmmla v21.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x50]\n"
- ".inst 0x4e9aaf8a // usmmla v10.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf72 // usmmla v18.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x60]\n"
- ".inst 0x4e99af8e // usmmla v14.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af76 // usmmla v22.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x70]\n"
- ".inst 0x4e9aaf8b // usmmla v11.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf73 // usmmla v19.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x80]\n"
- ".inst 0x4e99af8f // usmmla v15.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af77 // usmmla v23.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x90]\n"
- ".inst 0x4e9aac28 // usmmla v8.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac70 // usmmla v16.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xa0]\n"
- ".inst 0x4e99ac2c // usmmla v12.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac74 // usmmla v20.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xb0]\n"
- ".inst 0x4e9aac29 // usmmla v9.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac71 // usmmla v17.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xc0]\n"
- ".inst 0x4e99ac2d // usmmla v13.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac75 // usmmla v21.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xd0]\n"
- ".inst 0x4e9aac2a // usmmla v10.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac72 // usmmla v18.4s, v3.16b, v26.16b\n"
- "ldr q26, [x10, #0xe0]\n"
- ".inst 0x4e99ac2e // usmmla v14.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac76 // usmmla v22.4s, v3.16b, v25.16b\n"
- "ldr q25, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e9aac2b // usmmla v11.4s, v1.16b, v26.16b\n"
- ".inst 0x4e9aac73 // usmmla v19.4s, v3.16b, v26.16b\n"
- ".inst 0x4e99ac2f // usmmla v15.4s, v1.16b, v25.16b\n"
- ".inst 0x4e99ac77 // usmmla v23.4s, v3.16b, v25.16b\n"
- "131:" // Height 4: Multiply loop: Main loop skip
- "cbz x27, 138f\n"
- "cmp x27, #0x8\n"
- "blt 133f\n"
- "132:" // Height 4: Multiply loop: Odd block loop
- "ldr d30, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "sub x27, x27, #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "cmp x27, #0x8\n"
- "ldr q26, [x10, #0x0]\n"
- "ldr q25, [x10, #0x10]\n"
- "trn1 v28.2d, v30.2d, v28.2d\n"
- "trn1 v27.2d, v29.2d, v27.2d\n"
- ".inst 0x4e9aaf88 // usmmla v8.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf70 // usmmla v16.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x20]\n"
- ".inst 0x4e99af8c // usmmla v12.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af74 // usmmla v20.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x30]\n"
- ".inst 0x4e9aaf89 // usmmla v9.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf71 // usmmla v17.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x40]\n"
- ".inst 0x4e99af8d // usmmla v13.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af75 // usmmla v21.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x50]\n"
- ".inst 0x4e9aaf8a // usmmla v10.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf72 // usmmla v18.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x60]\n"
- ".inst 0x4e99af8e // usmmla v14.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af76 // usmmla v22.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e9aaf8b // usmmla v11.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf73 // usmmla v19.4s, v27.16b, v26.16b\n"
- ".inst 0x4e99af8f // usmmla v15.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af77 // usmmla v23.4s, v27.16b, v25.16b\n"
- "bge 132b\n"
- "133:" // Height 4: Multiply loop: Skip odd blocks
- "cbz x27, 138f\n"
- "tbz x27, #2, 135f\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "tbz x27, #1, 134f\n"
- "ld1 { v1.h }[2], [x26], #0x2\n"
- "ld1 { v2.h }[2], [x25], #0x2\n"
- "ld1 { v3.h }[2], [x24], #0x2\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "tbz x27, #0, 137f\n"
- "ld1 { v1.b }[6], [x26]\n"
- "ld1 { v2.b }[6], [x25]\n"
- "ld1 { v3.b }[6], [x24]\n"
- "ld1 { v4.b }[6], [x23]\n"
- "b 137f\n"
- "134:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 137f\n"
- "ld1 { v1.b }[4], [x26]\n"
- "ld1 { v2.b }[4], [x25]\n"
- "ld1 { v3.b }[4], [x24]\n"
- "ld1 { v4.b }[4], [x23]\n"
- "b 137f\n"
- "135:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 136f\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h2, [x25], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x23], #0x2\n"
- "tbz x27, #0, 137f\n"
- "ld1 { v1.b }[2], [x26]\n"
- "ld1 { v2.b }[2], [x25]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x23]\n"
- "b 137f\n"
- "136:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b1, [x26, #0x0]\n"
- "ldr b2, [x25, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x23, #0x0]\n"
- "137:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q26, [x10, #0x0]\n"
- "ldr q25, [x10, #0x10]\n"
- "trn1 v28.2d, v1.2d, v2.2d\n"
- "trn1 v27.2d, v3.2d, v4.2d\n"
- ".inst 0x4e9aaf88 // usmmla v8.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf70 // usmmla v16.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x20]\n"
- ".inst 0x4e99af8c // usmmla v12.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af74 // usmmla v20.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x30]\n"
- ".inst 0x4e9aaf89 // usmmla v9.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf71 // usmmla v17.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x40]\n"
- ".inst 0x4e99af8d // usmmla v13.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af75 // usmmla v21.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x50]\n"
- ".inst 0x4e9aaf8a // usmmla v10.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf72 // usmmla v18.4s, v27.16b, v26.16b\n"
- "ldr q26, [x10, #0x60]\n"
- ".inst 0x4e99af8e // usmmla v14.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af76 // usmmla v22.4s, v27.16b, v25.16b\n"
- "ldr q25, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e9aaf8b // usmmla v11.4s, v28.16b, v26.16b\n"
- ".inst 0x4e9aaf73 // usmmla v19.4s, v27.16b, v26.16b\n"
- ".inst 0x4e99af8f // usmmla v15.4s, v28.16b, v25.16b\n"
- ".inst 0x4e99af77 // usmmla v23.4s, v27.16b, v25.16b\n"
- "138:" // Height 4: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 126b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "uzp1 v7.2d, v8.2d, v12.2d\n"
- "uzp2 v8.2d, v8.2d, v12.2d\n"
- "uzp1 v12.2d, v9.2d, v13.2d\n"
- "uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "uzp1 v13.2d, v10.2d, v14.2d\n"
- "uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "uzp1 v14.2d, v11.2d, v15.2d\n"
- "uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "uzp1 v15.2d, v16.2d, v20.2d\n"
- "uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "uzp1 v20.2d, v17.2d, v21.2d\n"
- "uzp2 v17.2d, v17.2d, v21.2d\n"
- "uzp1 v21.2d, v18.2d, v22.2d\n"
- "uzp2 v18.2d, v18.2d, v22.2d\n"
- "uzp1 v22.2d, v19.2d, v23.2d\n"
- "uzp2 v19.2d, v19.2d, v23.2d\n"
- "bge 147f\n"
- "tbz x11, #3, 142f\n"
- "st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "tbz x11, #2, 140f\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "tbz x11, #1, 139f\n"
- "str d14, [x9], #0x8\n"
- "str d11, [x24], #0x8\n"
- "str d22, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "tbz x11, #0, 146f\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x24]\n"
- "st1 { v22.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "b 146f\n"
- "139:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x11, #0, 146f\n"
- "str s14, [x9, #0x0]\n"
- "str s11, [x24, #0x0]\n"
- "str s22, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "b 146f\n"
- "140:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x11, #1, 141f\n"
- "str d13, [x9], #0x8\n"
- "str d10, [x24], #0x8\n"
- "str d21, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "tbz x11, #0, 146f\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x24]\n"
- "st1 { v21.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "b 146f\n"
- "141:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x11, #0, 146f\n"
- "str s13, [x9, #0x0]\n"
- "str s10, [x24, #0x0]\n"
- "str s21, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "b 146f\n"
- "142:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x11, #2, 144f\n"
- "st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "tbz x11, #1, 143f\n"
- "str d12, [x9], #0x8\n"
- "str d9, [x24], #0x8\n"
- "str d20, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "tbz x11, #0, 146f\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x24]\n"
- "st1 { v20.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "b 146f\n"
- "143:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x11, #0, 146f\n"
- "str s12, [x9, #0x0]\n"
- "str s9, [x24, #0x0]\n"
- "str s20, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "b 146f\n"
- "144:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x11, #1, 145f\n"
- "str d7, [x9], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "tbz x11, #0, 146f\n"
- "st1 { v7.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x24]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "b 146f\n"
- "145:" // Height 4: Partial direct writeback: partial_1_0
- "str s7, [x9, #0x0]\n"
- "str s8, [x24, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "146:" // Height 4: Partial direct writeback: Done
- "b 148f\n"
- "147:" // Height 4: Full writeback
- "str q7, [x9, #0x0]\n"
- "str q12, [x9, #0x10]\n"
- "str q13, [x9, #0x20]\n"
- "str q14, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "str q15, [x23, #0x0]\n"
- "str q20, [x23, #0x10]\n"
- "str q21, [x23, #0x20]\n"
- "str q22, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "148:" // Height 4: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 113b\n"
- "b 224f\n"
- "149:" // Height 5
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "150:" // Height 5: Column loop
- "tbz %x[flags], #0, 161f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "bge 159f\n"
- "tbz x11, #3, 154f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v25.4s }, [x21], #0x10\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
- "ld1 { v26.4s }, [x21], #0x10\n"
- "tbz x11, #2, 152f\n"
- "ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v19.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
- "ld1 { v27.4s }, [x21], #0x10\n"
- "tbz x11, #1, 151f\n"
- "ldr d16, [x9], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "mov x25, #0x38\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d6, [x21], #0x8\n"
- "tbz x11, #0, 158f\n"
- "ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "ld1 { v6.s }[2], [x21]\n"
- "b 158f\n"
- "151:" // Height 5: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 158f\n"
- "ldr s16, [x9, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "ldr s6, [x21, #0x0]\n"
- "b 158f\n"
- "152:" // Height 5: Partial accumulate: partial_2_8
- "tbz x11, #1, 153f\n"
- "ldr d11, [x9], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x25, #0x28\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "ldr d27, [x21], #0x8\n"
- "tbz x11, #0, 158f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
- "ld1 { v27.s }[2], [x21]\n"
- "b 158f\n"
- "153:" // Height 5: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 158f\n"
- "ldr s11, [x9, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
- "ldr s27, [x21, #0x0]\n"
- "b 158f\n"
- "154:" // Height 5: Partial accumulate: partial_4_0
- "tbz x11, #2, 156f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v25.4s }, [x21], #0x10\n"
- "tbz x11, #1, 155f\n"
- "ldr d10, [x9], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "mov x25, #0x18\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
- "tbz x11, #0, 158f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "ld1 { v26.s }[2], [x21]\n"
- "b 158f\n"
- "155:" // Height 5: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 158f\n"
- "ldr s10, [x9, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "ldr s26, [x21, #0x0]\n"
- "b 158f\n"
- "156:" // Height 5: Partial accumulate: partial_2_0
- "tbz x11, #1, 157f\n"
- "ldr d9, [x9], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x25, #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d25, [x21], #0x8\n"
- "tbz x11, #0, 158f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "ld1 { v25.s }[2], [x21]\n"
- "b 158f\n"
- "157:" // Height 5: Partial accumulate: partial_1_0
- "ldr s9, [x9, #0x0]\n"
- "ldr s12, [x24, #0x0]\n"
- "mov x25, #0x0\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "ldr s25, [x21, #0x0]\n"
- "158:" // Height 5: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 160f\n"
- "159:" // Height 5: full accumulate
- "ldr q9, [x9, #0x0]\n"
- "ldr q10, [x9, #0x10]\n"
- "ldr q11, [x9, #0x20]\n"
- "ldr q16, [x9, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q17, [x23, #0x0]\n"
- "ldr q18, [x23, #0x10]\n"
- "ldr q19, [x23, #0x20]\n"
- "ldr q24, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "ldr q25, [x21, #0x0]\n"
- "ldr q26, [x21, #0x10]\n"
- "ldr q27, [x21, #0x20]\n"
- "ldr q6, [x21, #0x30]\n"
- "160:" // Height 5: MMLA fixup
- "zip1 v8.2d, v9.2d, v12.2d\n"
- "zip2 v12.2d, v9.2d, v12.2d\n"
- "zip1 v9.2d, v10.2d, v13.2d\n"
- "zip2 v13.2d, v10.2d, v13.2d\n"
- "zip1 v10.2d, v11.2d, v14.2d\n"
- "zip2 v14.2d, v11.2d, v14.2d\n"
- "zip1 v11.2d, v16.2d, v15.2d\n"
- "zip2 v15.2d, v16.2d, v15.2d\n"
- "zip1 v16.2d, v17.2d, v20.2d\n"
- "zip2 v20.2d, v17.2d, v20.2d\n"
- "zip1 v17.2d, v18.2d, v21.2d\n"
- "zip2 v21.2d, v18.2d, v21.2d\n"
- "zip1 v18.2d, v19.2d, v22.2d\n"
- "zip2 v22.2d, v19.2d, v22.2d\n"
- "zip1 v19.2d, v24.2d, v23.2d\n"
- "zip2 v23.2d, v24.2d, v23.2d\n"
- "zip1 v24.2d, v25.2d, v28.2d\n"
- "zip2 v28.2d, v25.2d, v28.2d\n"
- "zip1 v25.2d, v26.2d, v29.2d\n"
- "zip2 v29.2d, v26.2d, v29.2d\n"
- "zip1 v26.2d, v27.2d, v30.2d\n"
- "zip2 v30.2d, v27.2d, v30.2d\n"
- "zip1 v27.2d, v6.2d, v31.2d\n"
- "zip2 v31.2d, v6.2d, v31.2d\n"
- "b 162f\n"
- "161:" // Height 5: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "162:" // Height 5: setup done
- "mov x28, #0x0\n"
- "163:" // Height 5: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 164f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "ldr x23, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x28, 165f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "b 165f\n"
- "164:" // Height 5: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "165:" // Height 5: input setup done
- "cmp x27, #0x10\n"
- "blt 168f\n"
- "ldr q1, [x26, #0x0]\n"
- "ldr q2, [x25, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q5, [x22, #0x0]\n"
- "ldr q7, [x10, #0x0]\n"
- "blt 167f\n"
- "166:" // Height 5: Multiply loop: Main loop head
- "trn1 v6.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "trn1 v4.2d, v5.2d, v0.2d\n"
- "trn2 v5.2d, v5.2d, v0.2d\n"
- "ldr q0, [x10, #0x10]\n"
- ".inst 0x4e87acc8 // usmmla v8.4s, v6.16b, v7.16b\n"
- ".inst 0x4e87ac50 // usmmla v16.4s, v2.16b, v7.16b\n"
- "add x22, x22, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4e87ac98 // usmmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e80accc // usmmla v12.4s, v6.16b, v0.16b\n"
- ".inst 0x4e80ac54 // usmmla v20.4s, v2.16b, v0.16b\n"
- ".inst 0x4e80ac9c // usmmla v28.4s, v4.16b, v0.16b\n"
- "ldr q0, [x10, #0x30]\n"
- ".inst 0x4e87acc9 // usmmla v9.4s, v6.16b, v7.16b\n"
- ".inst 0x4e87ac51 // usmmla v17.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac99 // usmmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e80accd // usmmla v13.4s, v6.16b, v0.16b\n"
- ".inst 0x4e80ac55 // usmmla v21.4s, v2.16b, v0.16b\n"
- ".inst 0x4e80ac9d // usmmla v29.4s, v4.16b, v0.16b\n"
- "ldr q0, [x10, #0x50]\n"
- ".inst 0x4e87acca // usmmla v10.4s, v6.16b, v7.16b\n"
- ".inst 0x4e87ac52 // usmmla v18.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac9a // usmmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e80acce // usmmla v14.4s, v6.16b, v0.16b\n"
- ".inst 0x4e80ac56 // usmmla v22.4s, v2.16b, v0.16b\n"
- ".inst 0x4e80ac9e // usmmla v30.4s, v4.16b, v0.16b\n"
- "ldr q0, [x10, #0x70]\n"
- ".inst 0x4e87accb // usmmla v11.4s, v6.16b, v7.16b\n"
- ".inst 0x4e87ac53 // usmmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac9b // usmmla v27.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e80accf // usmmla v15.4s, v6.16b, v0.16b\n"
- ".inst 0x4e80ac57 // usmmla v23.4s, v2.16b, v0.16b\n"
- "ldr q2, [x25, #0x0]\n"
- ".inst 0x4e80ac9f // usmmla v31.4s, v4.16b, v0.16b\n"
- "ldr q0, [x10, #0x90]\n"
- "ldr q4, [x23, #0x0]\n"
- ".inst 0x4e87ac28 // usmmla v8.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87ac70 // usmmla v16.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87acb8 // usmmla v24.4s, v5.16b, v7.16b\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4e80ac2c // usmmla v12.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac74 // usmmla v20.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acbc // usmmla v28.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xb0]\n"
- ".inst 0x4e86ac29 // usmmla v9.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac71 // usmmla v17.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86acb9 // usmmla v25.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4e80ac2d // usmmla v13.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac75 // usmmla v21.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acbd // usmmla v29.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xd0]\n"
- ".inst 0x4e86ac2a // usmmla v10.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac72 // usmmla v18.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86acba // usmmla v26.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4e80ac2e // usmmla v14.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac76 // usmmla v22.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acbe // usmmla v30.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e86ac2b // usmmla v11.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac73 // usmmla v19.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86acbb // usmmla v27.4s, v5.16b, v6.16b\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x4e80ac2f // usmmla v15.4s, v1.16b, v0.16b\n"
- "ldr q1, [x26, #0x0]\n"
- ".inst 0x4e80ac77 // usmmla v23.4s, v3.16b, v0.16b\n"
- "ldr q3, [x24, #0x0]\n"
- ".inst 0x4e80acbf // usmmla v31.4s, v5.16b, v0.16b\n"
- "ldr q5, [x22, #0x0]\n"
- "bge 166b\n"
- "167:" // Height 5: Multiply loop: Single iteration only
- "trn1 v6.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "trn1 v4.2d, v5.2d, v0.2d\n"
- "trn2 v5.2d, v5.2d, v0.2d\n"
- "ldr q0, [x10, #0x10]\n"
- ".inst 0x4e87acc8 // usmmla v8.4s, v6.16b, v7.16b\n"
- ".inst 0x4e87ac50 // usmmla v16.4s, v2.16b, v7.16b\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4e87ac98 // usmmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e80accc // usmmla v12.4s, v6.16b, v0.16b\n"
- ".inst 0x4e80ac54 // usmmla v20.4s, v2.16b, v0.16b\n"
- ".inst 0x4e80ac9c // usmmla v28.4s, v4.16b, v0.16b\n"
- "ldr q0, [x10, #0x30]\n"
- ".inst 0x4e87acc9 // usmmla v9.4s, v6.16b, v7.16b\n"
- ".inst 0x4e87ac51 // usmmla v17.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac99 // usmmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e80accd // usmmla v13.4s, v6.16b, v0.16b\n"
- ".inst 0x4e80ac55 // usmmla v21.4s, v2.16b, v0.16b\n"
- ".inst 0x4e80ac9d // usmmla v29.4s, v4.16b, v0.16b\n"
- "ldr q0, [x10, #0x50]\n"
- ".inst 0x4e87acca // usmmla v10.4s, v6.16b, v7.16b\n"
- ".inst 0x4e87ac52 // usmmla v18.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac9a // usmmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e80acce // usmmla v14.4s, v6.16b, v0.16b\n"
- ".inst 0x4e80ac56 // usmmla v22.4s, v2.16b, v0.16b\n"
- ".inst 0x4e80ac9e // usmmla v30.4s, v4.16b, v0.16b\n"
- "ldr q0, [x10, #0x70]\n"
- ".inst 0x4e87accb // usmmla v11.4s, v6.16b, v7.16b\n"
- ".inst 0x4e87ac53 // usmmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac9b // usmmla v27.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e80accf // usmmla v15.4s, v6.16b, v0.16b\n"
- ".inst 0x4e80ac57 // usmmla v23.4s, v2.16b, v0.16b\n"
- ".inst 0x4e80ac9f // usmmla v31.4s, v4.16b, v0.16b\n"
- "ldr q2, [x10, #0x90]\n"
- ".inst 0x4e87ac28 // usmmla v8.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87ac70 // usmmla v16.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87acb8 // usmmla v24.4s, v5.16b, v7.16b\n"
- "ldr q0, [x10, #0xa0]\n"
- ".inst 0x4e82ac2c // usmmla v12.4s, v1.16b, v2.16b\n"
- ".inst 0x4e82ac74 // usmmla v20.4s, v3.16b, v2.16b\n"
- ".inst 0x4e82acbc // usmmla v28.4s, v5.16b, v2.16b\n"
- "ldr q2, [x10, #0xb0]\n"
- ".inst 0x4e80ac29 // usmmla v9.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac71 // usmmla v17.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acb9 // usmmla v25.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xc0]\n"
- ".inst 0x4e82ac2d // usmmla v13.4s, v1.16b, v2.16b\n"
- ".inst 0x4e82ac75 // usmmla v21.4s, v3.16b, v2.16b\n"
- ".inst 0x4e82acbd // usmmla v29.4s, v5.16b, v2.16b\n"
- "ldr q2, [x10, #0xd0]\n"
- ".inst 0x4e80ac2a // usmmla v10.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac72 // usmmla v18.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acba // usmmla v26.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xe0]\n"
- ".inst 0x4e82ac2e // usmmla v14.4s, v1.16b, v2.16b\n"
- ".inst 0x4e82ac76 // usmmla v22.4s, v3.16b, v2.16b\n"
- ".inst 0x4e82acbe // usmmla v30.4s, v5.16b, v2.16b\n"
- "ldr q6, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e80ac2b // usmmla v11.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac73 // usmmla v19.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acbb // usmmla v27.4s, v5.16b, v0.16b\n"
- ".inst 0x4e86ac2f // usmmla v15.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac77 // usmmla v23.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86acbf // usmmla v31.4s, v5.16b, v6.16b\n"
- "168:" // Height 5: Multiply loop: Main loop skip
- "cbz x27, 175f\n"
- "cmp x27, #0x8\n"
- "blt 170f\n"
- "169:" // Height 5: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
- "sub x27, x27, #0x8\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "cmp x27, #0x8\n"
- "ldr d0, [x22], #0x8\n"
- "ldr q1, [x10, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v3.2d, v2.2d\n"
- "trn1 v2.2d, v0.2d, v5.2d\n"
- "ldr q0, [x10, #0x10]\n"
- ".inst 0x4e81ac88 // usmmla v8.4s, v4.16b, v1.16b\n"
- ".inst 0x4e81ac70 // usmmla v16.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac58 // usmmla v24.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x20]\n"
- ".inst 0x4e80ac8c // usmmla v12.4s, v4.16b, v0.16b\n"
- ".inst 0x4e80ac74 // usmmla v20.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5c // usmmla v28.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x30]\n"
- ".inst 0x4e81ac89 // usmmla v9.4s, v4.16b, v1.16b\n"
- ".inst 0x4e81ac71 // usmmla v17.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac59 // usmmla v25.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x40]\n"
- ".inst 0x4e80ac8d // usmmla v13.4s, v4.16b, v0.16b\n"
- ".inst 0x4e80ac75 // usmmla v21.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5d // usmmla v29.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x50]\n"
- ".inst 0x4e81ac8a // usmmla v10.4s, v4.16b, v1.16b\n"
- ".inst 0x4e81ac72 // usmmla v18.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac5a // usmmla v26.4s, v2.16b, v1.16b\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4e80ac8e // usmmla v14.4s, v4.16b, v0.16b\n"
- ".inst 0x4e80ac76 // usmmla v22.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5e // usmmla v30.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e86ac8b // usmmla v11.4s, v4.16b, v6.16b\n"
- ".inst 0x4e86ac73 // usmmla v19.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86ac5b // usmmla v27.4s, v2.16b, v6.16b\n"
- ".inst 0x4e80ac8f // usmmla v15.4s, v4.16b, v0.16b\n"
- ".inst 0x4e80ac77 // usmmla v23.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5f // usmmla v31.4s, v2.16b, v0.16b\n"
- "bge 169b\n"
- "170:" // Height 5: Multiply loop: Skip odd blocks
- "cbz x27, 175f\n"
- "tbz x27, #2, 172f\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s5, [x22], #0x4\n"
- "tbz x27, #1, 171f\n"
- "ld1 { v1.h }[2], [x26], #0x2\n"
- "ld1 { v2.h }[2], [x25], #0x2\n"
- "ld1 { v3.h }[2], [x24], #0x2\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v5.h }[2], [x22], #0x2\n"
- "tbz x27, #0, 174f\n"
- "ld1 { v1.b }[6], [x26]\n"
- "ld1 { v2.b }[6], [x25]\n"
- "ld1 { v3.b }[6], [x24]\n"
- "ld1 { v4.b }[6], [x23]\n"
- "ld1 { v5.b }[6], [x22]\n"
- "b 174f\n"
- "171:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 174f\n"
- "ld1 { v1.b }[4], [x26]\n"
- "ld1 { v2.b }[4], [x25]\n"
- "ld1 { v3.b }[4], [x24]\n"
- "ld1 { v4.b }[4], [x23]\n"
- "ld1 { v5.b }[4], [x22]\n"
- "b 174f\n"
- "172:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 173f\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h2, [x25], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h5, [x22], #0x2\n"
- "tbz x27, #0, 174f\n"
- "ld1 { v1.b }[2], [x26]\n"
- "ld1 { v2.b }[2], [x25]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x23]\n"
- "ld1 { v5.b }[2], [x22]\n"
- "b 174f\n"
- "173:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b1, [x26, #0x0]\n"
- "ldr b2, [x25, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x23, #0x0]\n"
- "ldr b5, [x22, #0x0]\n"
- "174:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "trn1 v7.2d, v1.2d, v2.2d\n"
- "trn1 v3.2d, v3.2d, v4.2d\n"
- "trn1 v2.2d, v5.2d, v0.2d\n"
- "ldr q1, [x10, #0x10]\n"
- ".inst 0x4e86ace8 // usmmla v8.4s, v7.16b, v6.16b\n"
- ".inst 0x4e86ac70 // usmmla v16.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86ac58 // usmmla v24.4s, v2.16b, v6.16b\n"
- "ldr q0, [x10, #0x20]\n"
- ".inst 0x4e81acec // usmmla v12.4s, v7.16b, v1.16b\n"
- ".inst 0x4e81ac74 // usmmla v20.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac5c // usmmla v28.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x30]\n"
- ".inst 0x4e80ace9 // usmmla v9.4s, v7.16b, v0.16b\n"
- ".inst 0x4e80ac71 // usmmla v17.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac59 // usmmla v25.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x40]\n"
- ".inst 0x4e81aced // usmmla v13.4s, v7.16b, v1.16b\n"
- ".inst 0x4e81ac75 // usmmla v21.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac5d // usmmla v29.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x50]\n"
- ".inst 0x4e80acea // usmmla v10.4s, v7.16b, v0.16b\n"
- ".inst 0x4e80ac72 // usmmla v18.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5a // usmmla v26.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x60]\n"
- ".inst 0x4e81acee // usmmla v14.4s, v7.16b, v1.16b\n"
- ".inst 0x4e81ac76 // usmmla v22.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac5e // usmmla v30.4s, v2.16b, v1.16b\n"
- "ldr q6, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e80aceb // usmmla v11.4s, v7.16b, v0.16b\n"
- ".inst 0x4e80ac73 // usmmla v19.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5b // usmmla v27.4s, v2.16b, v0.16b\n"
- ".inst 0x4e86acef // usmmla v15.4s, v7.16b, v6.16b\n"
- ".inst 0x4e86ac77 // usmmla v23.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86ac5f // usmmla v31.4s, v2.16b, v6.16b\n"
- "175:" // Height 5: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 163b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "uzp1 v7.2d, v8.2d, v12.2d\n"
- "uzp2 v8.2d, v8.2d, v12.2d\n"
- "uzp1 v12.2d, v9.2d, v13.2d\n"
- "uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "uzp1 v13.2d, v10.2d, v14.2d\n"
- "uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "uzp1 v14.2d, v11.2d, v15.2d\n"
- "uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x21, x22, x20, LSL #2\n"
- "uzp1 v15.2d, v16.2d, v20.2d\n"
- "uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "uzp1 v20.2d, v17.2d, v21.2d\n"
- "uzp2 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "uzp1 v21.2d, v18.2d, v22.2d\n"
- "uzp2 v18.2d, v18.2d, v22.2d\n"
- "uzp1 v22.2d, v19.2d, v23.2d\n"
- "uzp2 v19.2d, v19.2d, v23.2d\n"
- "uzp1 v24.2d, v24.2d, v28.2d\n"
- "uzp1 v25.2d, v25.2d, v29.2d\n"
- "uzp1 v26.2d, v26.2d, v30.2d\n"
- "uzp1 v27.2d, v27.2d, v31.2d\n"
- "bge 184f\n"
- "tbz x11, #3, 179f\n"
- "st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "st1 { v25.4s }, [x21], #0x10\n"
- "tbz x11, #2, 177f\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v26.4s }, [x21], #0x10\n"
- "tbz x11, #1, 176f\n"
- "str d14, [x9], #0x8\n"
- "str d11, [x24], #0x8\n"
- "str d22, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d27, [x21], #0x8\n"
- "tbz x11, #0, 183f\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x24]\n"
- "st1 { v22.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x21]\n"
- "b 183f\n"
- "176:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x11, #0, 183f\n"
- "str s14, [x9, #0x0]\n"
- "str s11, [x24, #0x0]\n"
- "str s22, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s27, [x21, #0x0]\n"
- "b 183f\n"
- "177:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x11, #1, 178f\n"
- "str d13, [x9], #0x8\n"
- "str d10, [x24], #0x8\n"
- "str d21, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d26, [x21], #0x8\n"
- "tbz x11, #0, 183f\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x24]\n"
- "st1 { v21.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v26.s }[2], [x21]\n"
- "b 183f\n"
- "178:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x11, #0, 183f\n"
- "str s13, [x9, #0x0]\n"
- "str s10, [x24, #0x0]\n"
- "str s21, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s26, [x21, #0x0]\n"
- "b 183f\n"
- "179:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x11, #2, 181f\n"
- "st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "tbz x11, #1, 180f\n"
- "str d12, [x9], #0x8\n"
- "str d9, [x24], #0x8\n"
- "str d20, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d25, [x21], #0x8\n"
- "tbz x11, #0, 183f\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x24]\n"
- "st1 { v20.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
- "b 183f\n"
- "180:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x11, #0, 183f\n"
- "str s12, [x9, #0x0]\n"
- "str s9, [x24, #0x0]\n"
- "str s20, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s25, [x21, #0x0]\n"
- "b 183f\n"
- "181:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x11, #1, 182f\n"
- "str d7, [x9], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "tbz x11, #0, 183f\n"
- "st1 { v7.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x24]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v24.s }[2], [x21]\n"
- "b 183f\n"
- "182:" // Height 5: Partial direct writeback: partial_1_0
- "str s7, [x9, #0x0]\n"
- "str s8, [x24, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s24, [x21, #0x0]\n"
- "183:" // Height 5: Partial direct writeback: Done
- "b 185f\n"
- "184:" // Height 5: Full writeback
- "str q7, [x9, #0x0]\n"
- "str q12, [x9, #0x10]\n"
- "str q13, [x9, #0x20]\n"
- "str q14, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "str q15, [x23, #0x0]\n"
- "str q20, [x23, #0x10]\n"
- "str q21, [x23, #0x20]\n"
- "str q22, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q24, [x21, #0x0]\n"
- "str q25, [x21, #0x10]\n"
- "str q26, [x21, #0x20]\n"
- "str q27, [x21, #0x30]\n"
- "185:" // Height 5: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 150b\n"
- "b 224f\n"
- "186:" // Height 6
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "mov x20, #0x18\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "187:" // Height 6: Column loop
- "tbz %x[flags], #0, 198f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
- "bge 196f\n"
- "tbz x11, #3, 191f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v25.4s }, [x21], #0x10\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
- "ld1 { v10.4s }, [x9], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
- "ld1 { v26.4s }, [x21], #0x10\n"
- "ld1 { v29.4s }, [x20], #0x10\n"
- "tbz x11, #2, 189f\n"
- "ld1 { v11.4s }, [x9], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v19.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
- "ld1 { v27.4s }, [x21], #0x10\n"
- "ld1 { v30.4s }, [x20], #0x10\n"
- "tbz x11, #1, 188f\n"
- "ldr d16, [x9], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "mov x25, #0x38\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d6, [x21], #0x8\n"
- "ldr d31, [x20], #0x8\n"
- "tbz x11, #0, 195f\n"
- "ld1 { v16.s }[2], [x9]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "ld1 { v6.s }[2], [x21]\n"
- "ld1 { v31.s }[2], [x20]\n"
- "b 195f\n"
- "188:" // Height 6: Partial accumulate: partial_1_12
- "mov x25, #0x30\n"
- "tbz x11, #0, 195f\n"
- "ldr s16, [x9, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "ldr s6, [x21, #0x0]\n"
- "ldr s31, [x20, #0x0]\n"
- "b 195f\n"
- "189:" // Height 6: Partial accumulate: partial_2_8
- "tbz x11, #1, 190f\n"
- "ldr d11, [x9], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x25, #0x28\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "ldr d27, [x21], #0x8\n"
- "ldr d30, [x20], #0x8\n"
- "tbz x11, #0, 195f\n"
- "ld1 { v11.s }[2], [x9]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
- "ld1 { v27.s }[2], [x21]\n"
- "ld1 { v30.s }[2], [x20]\n"
- "b 195f\n"
- "190:" // Height 6: Partial accumulate: partial_1_8
- "mov x25, #0x20\n"
- "tbz x11, #0, 195f\n"
- "ldr s11, [x9, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
- "ldr s27, [x21, #0x0]\n"
- "ldr s30, [x20, #0x0]\n"
- "b 195f\n"
- "191:" // Height 6: Partial accumulate: partial_4_0
- "tbz x11, #2, 193f\n"
- "ld1 { v9.4s }, [x9], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v25.4s }, [x21], #0x10\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
- "tbz x11, #1, 192f\n"
- "ldr d10, [x9], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "mov x25, #0x18\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
- "ldr d29, [x20], #0x8\n"
- "tbz x11, #0, 195f\n"
- "ld1 { v10.s }[2], [x9]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "ld1 { v26.s }[2], [x21]\n"
- "ld1 { v29.s }[2], [x20]\n"
- "b 195f\n"
- "192:" // Height 6: Partial accumulate: partial_1_4
- "mov x25, #0x10\n"
- "tbz x11, #0, 195f\n"
- "ldr s10, [x9, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "ldr s26, [x21, #0x0]\n"
- "ldr s29, [x20, #0x0]\n"
- "b 195f\n"
- "193:" // Height 6: Partial accumulate: partial_2_0
- "tbz x11, #1, 194f\n"
- "ldr d9, [x9], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x25, #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d25, [x21], #0x8\n"
- "ldr d28, [x20], #0x8\n"
- "tbz x11, #0, 195f\n"
- "ld1 { v9.s }[2], [x9]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "ld1 { v25.s }[2], [x21]\n"
- "ld1 { v28.s }[2], [x20]\n"
- "b 195f\n"
- "194:" // Height 6: Partial accumulate: partial_1_0
- "ldr s9, [x9, #0x0]\n"
- "ldr s12, [x24, #0x0]\n"
- "mov x25, #0x0\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "ldr s25, [x21, #0x0]\n"
- "ldr s28, [x20, #0x0]\n"
- "195:" // Height 6: Partial accumulate: Done
- "sub x9, x9, x25\n"
- "b 197f\n"
- "196:" // Height 6: full accumulate
- "ldr q9, [x9, #0x0]\n"
- "ldr q10, [x9, #0x10]\n"
- "ldr q11, [x9, #0x20]\n"
- "ldr q16, [x9, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q17, [x23, #0x0]\n"
- "ldr q18, [x23, #0x10]\n"
- "ldr q19, [x23, #0x20]\n"
- "ldr q24, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "ldr q25, [x21, #0x0]\n"
- "ldr q26, [x21, #0x10]\n"
- "ldr q27, [x21, #0x20]\n"
- "ldr q6, [x21, #0x30]\n"
- "ldr q28, [x20, #0x0]\n"
- "ldr q29, [x20, #0x10]\n"
- "ldr q30, [x20, #0x20]\n"
- "ldr q31, [x20, #0x30]\n"
- "197:" // Height 6: MMLA fixup
- "zip1 v8.2d, v9.2d, v12.2d\n"
- "zip2 v12.2d, v9.2d, v12.2d\n"
- "zip1 v9.2d, v10.2d, v13.2d\n"
- "zip2 v13.2d, v10.2d, v13.2d\n"
- "zip1 v10.2d, v11.2d, v14.2d\n"
- "zip2 v14.2d, v11.2d, v14.2d\n"
- "zip1 v11.2d, v16.2d, v15.2d\n"
- "zip2 v15.2d, v16.2d, v15.2d\n"
- "zip1 v16.2d, v17.2d, v20.2d\n"
- "zip2 v20.2d, v17.2d, v20.2d\n"
- "zip1 v17.2d, v18.2d, v21.2d\n"
- "zip2 v21.2d, v18.2d, v21.2d\n"
- "zip1 v18.2d, v19.2d, v22.2d\n"
- "zip2 v22.2d, v19.2d, v22.2d\n"
- "zip1 v19.2d, v24.2d, v23.2d\n"
- "zip2 v23.2d, v24.2d, v23.2d\n"
- "zip1 v24.2d, v25.2d, v28.2d\n"
- "zip2 v28.2d, v25.2d, v28.2d\n"
- "zip1 v25.2d, v26.2d, v29.2d\n"
- "zip2 v29.2d, v26.2d, v29.2d\n"
- "zip1 v26.2d, v27.2d, v30.2d\n"
- "zip2 v30.2d, v27.2d, v30.2d\n"
- "zip1 v27.2d, v6.2d, v31.2d\n"
- "zip2 v31.2d, v6.2d, v31.2d\n"
- "b 199f\n"
- "198:" // Height 6: no accumulate
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "199:" // Height 6: setup done
- "mov x28, #0x0\n"
- "200:" // Height 6: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 201f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "ldr x23, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "ldr x21, [x20, #0x28]\n"
- "cbnz x28, 202f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "add x21, x21, x20\n"
- "b 202f\n"
- "201:" // Height 6: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "add x21, x22, x21\n"
- "202:" // Height 6: input setup done
- "cmp x27, #0x10\n"
- "blt 205f\n"
- "ldr q1, [x26, #0x0]\n"
- "ldr q2, [x25, #0x0]\n"
- "cmp x27, #0x20\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q5, [x22, #0x0]\n"
- "ldr q6, [x21, #0x0]\n"
- "ldr q7, [x10, #0x0]\n"
- "blt 204f\n"
- "203:" // Height 6: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4e87ac08 // usmmla v8.4s, v0.16b, v7.16b\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e87ac50 // usmmla v16.4s, v2.16b, v7.16b\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e87ac98 // usmmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86ac0c // usmmla v12.4s, v0.16b, v6.16b\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4e86ac54 // usmmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86ac9c // usmmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- ".inst 0x4e87ac09 // usmmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87ac51 // usmmla v17.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac99 // usmmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86ac0d // usmmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86ac55 // usmmla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86ac9d // usmmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87ac0a // usmmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87ac52 // usmmla v18.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac9a // usmmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86ac0e // usmmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86ac56 // usmmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86ac9e // usmmla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87ac0b // usmmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87ac53 // usmmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac9b // usmmla v27.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86ac0f // usmmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86ac57 // usmmla v23.4s, v2.16b, v6.16b\n"
- "ldr q2, [x25, #0x0]\n"
- ".inst 0x4e86ac9f // usmmla v31.4s, v4.16b, v6.16b\n"
- "ldr q0, [x10, #0x90]\n"
- "ldr q4, [x23, #0x0]\n"
- ".inst 0x4e87ac28 // usmmla v8.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87ac70 // usmmla v16.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87acb8 // usmmla v24.4s, v5.16b, v7.16b\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4e80ac2c // usmmla v12.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac74 // usmmla v20.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acbc // usmmla v28.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xb0]\n"
- ".inst 0x4e86ac29 // usmmla v9.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac71 // usmmla v17.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86acb9 // usmmla v25.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4e80ac2d // usmmla v13.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac75 // usmmla v21.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acbd // usmmla v29.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xd0]\n"
- ".inst 0x4e86ac2a // usmmla v10.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac72 // usmmla v18.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86acba // usmmla v26.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4e80ac2e // usmmla v14.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac76 // usmmla v22.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acbe // usmmla v30.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e86ac2b // usmmla v11.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac73 // usmmla v19.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86acbb // usmmla v27.4s, v5.16b, v6.16b\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x4e80ac2f // usmmla v15.4s, v1.16b, v0.16b\n"
- "ldr q1, [x26, #0x0]\n"
- ".inst 0x4e80ac77 // usmmla v23.4s, v3.16b, v0.16b\n"
- "ldr q3, [x24, #0x0]\n"
- ".inst 0x4e80acbf // usmmla v31.4s, v5.16b, v0.16b\n"
- "ldr q5, [x22, #0x0]\n"
- "ldr q6, [x21, #0x0]\n"
- "bge 203b\n"
- "204:" // Height 6: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4e87ac08 // usmmla v8.4s, v0.16b, v7.16b\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87ac50 // usmmla v16.4s, v2.16b, v7.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4e87ac98 // usmmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86ac0c // usmmla v12.4s, v0.16b, v6.16b\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4e86ac54 // usmmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86ac9c // usmmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- ".inst 0x4e87ac09 // usmmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87ac51 // usmmla v17.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac99 // usmmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86ac0d // usmmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86ac55 // usmmla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86ac9d // usmmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87ac0a // usmmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87ac52 // usmmla v18.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac9a // usmmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86ac0e // usmmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86ac56 // usmmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86ac9e // usmmla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87ac0b // usmmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87ac53 // usmmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87ac9b // usmmla v27.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86ac0f // usmmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86ac57 // usmmla v23.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86ac9f // usmmla v31.4s, v4.16b, v6.16b\n"
- "ldr q2, [x10, #0x90]\n"
- ".inst 0x4e87ac28 // usmmla v8.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87ac70 // usmmla v16.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87acb8 // usmmla v24.4s, v5.16b, v7.16b\n"
- "ldr q0, [x10, #0xa0]\n"
- ".inst 0x4e82ac2c // usmmla v12.4s, v1.16b, v2.16b\n"
- ".inst 0x4e82ac74 // usmmla v20.4s, v3.16b, v2.16b\n"
- ".inst 0x4e82acbc // usmmla v28.4s, v5.16b, v2.16b\n"
- "ldr q2, [x10, #0xb0]\n"
- ".inst 0x4e80ac29 // usmmla v9.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac71 // usmmla v17.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acb9 // usmmla v25.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xc0]\n"
- ".inst 0x4e82ac2d // usmmla v13.4s, v1.16b, v2.16b\n"
- ".inst 0x4e82ac75 // usmmla v21.4s, v3.16b, v2.16b\n"
- ".inst 0x4e82acbd // usmmla v29.4s, v5.16b, v2.16b\n"
- "ldr q2, [x10, #0xd0]\n"
- ".inst 0x4e80ac2a // usmmla v10.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac72 // usmmla v18.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acba // usmmla v26.4s, v5.16b, v0.16b\n"
- "ldr q0, [x10, #0xe0]\n"
- ".inst 0x4e82ac2e // usmmla v14.4s, v1.16b, v2.16b\n"
- ".inst 0x4e82ac76 // usmmla v22.4s, v3.16b, v2.16b\n"
- ".inst 0x4e82acbe // usmmla v30.4s, v5.16b, v2.16b\n"
- "ldr q6, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4e80ac2b // usmmla v11.4s, v1.16b, v0.16b\n"
- ".inst 0x4e80ac73 // usmmla v19.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80acbb // usmmla v27.4s, v5.16b, v0.16b\n"
- ".inst 0x4e86ac2f // usmmla v15.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86ac77 // usmmla v23.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86acbf // usmmla v31.4s, v5.16b, v6.16b\n"
- "205:" // Height 6: Multiply loop: Main loop skip
- "cbz x27, 212f\n"
- "cmp x27, #0x8\n"
- "blt 207f\n"
- "206:" // Height 6: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
- "sub x27, x27, #0x8\n"
- "ldr d5, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "cmp x27, #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d0, [x21], #0x8\n"
- "ldr q1, [x10, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v5.2d, v3.2d\n"
- "trn1 v2.2d, v2.2d, v0.2d\n"
- "ldr q0, [x10, #0x10]\n"
- ".inst 0x4e81ac88 // usmmla v8.4s, v4.16b, v1.16b\n"
- ".inst 0x4e81ac70 // usmmla v16.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac58 // usmmla v24.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x20]\n"
- ".inst 0x4e80ac8c // usmmla v12.4s, v4.16b, v0.16b\n"
- ".inst 0x4e80ac74 // usmmla v20.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5c // usmmla v28.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x30]\n"
- ".inst 0x4e81ac89 // usmmla v9.4s, v4.16b, v1.16b\n"
- ".inst 0x4e81ac71 // usmmla v17.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac59 // usmmla v25.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x40]\n"
- ".inst 0x4e80ac8d // usmmla v13.4s, v4.16b, v0.16b\n"
- ".inst 0x4e80ac75 // usmmla v21.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5d // usmmla v29.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x50]\n"
- ".inst 0x4e81ac8a // usmmla v10.4s, v4.16b, v1.16b\n"
- ".inst 0x4e81ac72 // usmmla v18.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac5a // usmmla v26.4s, v2.16b, v1.16b\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4e80ac8e // usmmla v14.4s, v4.16b, v0.16b\n"
- ".inst 0x4e80ac76 // usmmla v22.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5e // usmmla v30.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e86ac8b // usmmla v11.4s, v4.16b, v6.16b\n"
- ".inst 0x4e86ac73 // usmmla v19.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86ac5b // usmmla v27.4s, v2.16b, v6.16b\n"
- ".inst 0x4e80ac8f // usmmla v15.4s, v4.16b, v0.16b\n"
- ".inst 0x4e80ac77 // usmmla v23.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5f // usmmla v31.4s, v2.16b, v0.16b\n"
- "bge 206b\n"
- "207:" // Height 6: Multiply loop: Skip odd blocks
- "cbz x27, 212f\n"
- "tbz x27, #2, 209f\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s5, [x22], #0x4\n"
- "ldr s6, [x21], #0x4\n"
- "tbz x27, #1, 208f\n"
- "ld1 { v1.h }[2], [x26], #0x2\n"
- "ld1 { v2.h }[2], [x25], #0x2\n"
- "ld1 { v3.h }[2], [x24], #0x2\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v5.h }[2], [x22], #0x2\n"
- "ld1 { v6.h }[2], [x21], #0x2\n"
- "tbz x27, #0, 211f\n"
- "ld1 { v1.b }[6], [x26]\n"
- "ld1 { v2.b }[6], [x25]\n"
- "ld1 { v3.b }[6], [x24]\n"
- "ld1 { v4.b }[6], [x23]\n"
- "ld1 { v5.b }[6], [x22]\n"
- "ld1 { v6.b }[6], [x21]\n"
- "b 211f\n"
- "208:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 211f\n"
- "ld1 { v1.b }[4], [x26]\n"
- "ld1 { v2.b }[4], [x25]\n"
- "ld1 { v3.b }[4], [x24]\n"
- "ld1 { v4.b }[4], [x23]\n"
- "ld1 { v5.b }[4], [x22]\n"
- "ld1 { v6.b }[4], [x21]\n"
- "b 211f\n"
- "209:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 210f\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h2, [x25], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h5, [x22], #0x2\n"
- "ldr h6, [x21], #0x2\n"
- "tbz x27, #0, 211f\n"
- "ld1 { v1.b }[2], [x26]\n"
- "ld1 { v2.b }[2], [x25]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x23]\n"
- "ld1 { v5.b }[2], [x22]\n"
- "ld1 { v6.b }[2], [x21]\n"
- "b 211f\n"
- "210:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b1, [x26, #0x0]\n"
- "ldr b2, [x25, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x23, #0x0]\n"
- "ldr b5, [x22, #0x0]\n"
- "ldr b6, [x21, #0x0]\n"
- "211:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q0, [x10, #0x0]\n"
- "trn1 v7.2d, v1.2d, v2.2d\n"
- "trn1 v3.2d, v3.2d, v4.2d\n"
- "trn1 v2.2d, v5.2d, v6.2d\n"
- "ldr q1, [x10, #0x10]\n"
- ".inst 0x4e80ace8 // usmmla v8.4s, v7.16b, v0.16b\n"
- ".inst 0x4e80ac70 // usmmla v16.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac58 // usmmla v24.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x20]\n"
- ".inst 0x4e81acec // usmmla v12.4s, v7.16b, v1.16b\n"
- ".inst 0x4e81ac74 // usmmla v20.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac5c // usmmla v28.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x30]\n"
- ".inst 0x4e80ace9 // usmmla v9.4s, v7.16b, v0.16b\n"
- ".inst 0x4e80ac71 // usmmla v17.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac59 // usmmla v25.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x40]\n"
- ".inst 0x4e81aced // usmmla v13.4s, v7.16b, v1.16b\n"
- ".inst 0x4e81ac75 // usmmla v21.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac5d // usmmla v29.4s, v2.16b, v1.16b\n"
- "ldr q1, [x10, #0x50]\n"
- ".inst 0x4e80acea // usmmla v10.4s, v7.16b, v0.16b\n"
- ".inst 0x4e80ac72 // usmmla v18.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5a // usmmla v26.4s, v2.16b, v0.16b\n"
- "ldr q0, [x10, #0x60]\n"
- ".inst 0x4e81acee // usmmla v14.4s, v7.16b, v1.16b\n"
- ".inst 0x4e81ac76 // usmmla v22.4s, v3.16b, v1.16b\n"
- ".inst 0x4e81ac5e // usmmla v30.4s, v2.16b, v1.16b\n"
- "ldr q6, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
- ".inst 0x4e80aceb // usmmla v11.4s, v7.16b, v0.16b\n"
- ".inst 0x4e80ac73 // usmmla v19.4s, v3.16b, v0.16b\n"
- ".inst 0x4e80ac5b // usmmla v27.4s, v2.16b, v0.16b\n"
- ".inst 0x4e86acef // usmmla v15.4s, v7.16b, v6.16b\n"
- ".inst 0x4e86ac77 // usmmla v23.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86ac5f // usmmla v31.4s, v2.16b, v6.16b\n"
- "212:" // Height 6: Multiply loop: No odd multiplies
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 200b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "uzp1 v7.2d, v8.2d, v12.2d\n"
- "uzp2 v8.2d, v8.2d, v12.2d\n"
- "uzp1 v12.2d, v9.2d, v13.2d\n"
- "uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "uzp1 v13.2d, v10.2d, v14.2d\n"
- "uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "uzp1 v14.2d, v11.2d, v15.2d\n"
- "uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x21, x22, x20, LSL #2\n"
- "uzp1 v15.2d, v16.2d, v20.2d\n"
- "uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x20, x21, x20, LSL #2\n"
- "uzp1 v20.2d, v17.2d, v21.2d\n"
- "uzp2 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "uzp1 v21.2d, v18.2d, v22.2d\n"
- "uzp2 v18.2d, v18.2d, v22.2d\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "uzp1 v22.2d, v19.2d, v23.2d\n"
- "uzp2 v19.2d, v19.2d, v23.2d\n"
- "uzp1 v23.2d, v24.2d, v28.2d\n"
- "uzp2 v24.2d, v24.2d, v28.2d\n"
- "uzp1 v28.2d, v25.2d, v29.2d\n"
- "uzp2 v25.2d, v25.2d, v29.2d\n"
- "uzp1 v29.2d, v26.2d, v30.2d\n"
- "uzp2 v26.2d, v26.2d, v30.2d\n"
- "uzp1 v30.2d, v27.2d, v31.2d\n"
- "uzp2 v27.2d, v27.2d, v31.2d\n"
- "bge 221f\n"
- "tbz x11, #3, 216f\n"
- "st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v9.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v23.4s }, [x21], #0x10\n"
- "st1 { v28.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v25.4s }, [x20], #0x10\n"
- "tbz x11, #2, 214f\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v10.4s }, [x24], #0x10\n"
- "st1 { v21.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v29.4s }, [x21], #0x10\n"
- "st1 { v26.4s }, [x20], #0x10\n"
- "tbz x11, #1, 213f\n"
- "str d14, [x9], #0x8\n"
- "str d11, [x24], #0x8\n"
- "str d22, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d30, [x21], #0x8\n"
- "str d27, [x20], #0x8\n"
- "tbz x11, #0, 220f\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v11.s }[2], [x24]\n"
- "st1 { v22.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v30.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20]\n"
- "b 220f\n"
- "213:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x11, #0, 220f\n"
- "str s14, [x9, #0x0]\n"
- "str s11, [x24, #0x0]\n"
- "str s22, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s30, [x21, #0x0]\n"
- "str s27, [x20, #0x0]\n"
- "b 220f\n"
- "214:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x11, #1, 215f\n"
- "str d13, [x9], #0x8\n"
- "str d10, [x24], #0x8\n"
- "str d21, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d29, [x21], #0x8\n"
- "str d26, [x20], #0x8\n"
- "tbz x11, #0, 220f\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v10.s }[2], [x24]\n"
- "st1 { v21.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v29.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x20]\n"
- "b 220f\n"
- "215:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x11, #0, 220f\n"
- "str s13, [x9, #0x0]\n"
- "str s10, [x24, #0x0]\n"
- "str s21, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s29, [x21, #0x0]\n"
- "str s26, [x20, #0x0]\n"
- "b 220f\n"
- "216:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x11, #2, 218f\n"
- "st1 { v7.4s }, [x9], #0x10\n"
- "st1 { v8.4s }, [x24], #0x10\n"
- "st1 { v15.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v23.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "tbz x11, #1, 217f\n"
- "str d12, [x9], #0x8\n"
- "str d9, [x24], #0x8\n"
- "str d20, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d28, [x21], #0x8\n"
- "str d25, [x20], #0x8\n"
- "tbz x11, #0, 220f\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v9.s }[2], [x24]\n"
- "st1 { v20.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v28.s }[2], [x21]\n"
- "st1 { v25.s }[2], [x20]\n"
- "b 220f\n"
- "217:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x11, #0, 220f\n"
- "str s12, [x9, #0x0]\n"
- "str s9, [x24, #0x0]\n"
- "str s20, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s28, [x21, #0x0]\n"
- "str s25, [x20, #0x0]\n"
- "b 220f\n"
- "218:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x11, #1, 219f\n"
- "str d7, [x9], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "tbz x11, #0, 220f\n"
- "st1 { v7.s }[2], [x9]\n"
- "st1 { v8.s }[2], [x24]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
- "st1 { v24.s }[2], [x20]\n"
- "b 220f\n"
- "219:" // Height 6: Partial direct writeback: partial_1_0
- "str s7, [x9, #0x0]\n"
- "str s8, [x24, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s24, [x20, #0x0]\n"
- "220:" // Height 6: Partial direct writeback: Done
- "b 222f\n"
- "221:" // Height 6: Full writeback
- "str q7, [x9, #0x0]\n"
- "str q12, [x9, #0x10]\n"
- "str q13, [x9, #0x20]\n"
- "str q14, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "str q8, [x24, #0x0]\n"
- "str q9, [x24, #0x10]\n"
- "str q10, [x24, #0x20]\n"
- "str q11, [x24, #0x30]\n"
- "str q15, [x23, #0x0]\n"
- "str q20, [x23, #0x10]\n"
- "str q21, [x23, #0x20]\n"
- "str q22, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q23, [x21, #0x0]\n"
- "str q28, [x21, #0x10]\n"
- "str q29, [x21, #0x20]\n"
- "str q30, [x21, #0x30]\n"
- "str q24, [x20, #0x0]\n"
- "str q25, [x20, #0x10]\n"
- "str q26, [x20, #0x20]\n"
- "str q27, [x20, #0x30]\n"
- "222:" // Height 6: Writeback done
- "subs x11, x11, #0x10\n"
- "bgt 187b\n"
- "subs %x[M], %x[M], #0x6\n"
- "beq 224f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 223f\n"
- "add x21, x21, #0x6\n"
- "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "b 1b\n"
- "223:" // Update direct input
- "mov x20, #0x6\n"
- "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
- "b 1b\n"
- "224:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index 4bb4c31577..16d241ff02 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
index 4ec75191b9..f12269be58 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
@@ -44,18 +44,18 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -87,72 +87,72 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"bgt 69f\n"
"beq 35f\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"tbz %x[flags], #0, 12f\n"
"cmp x8, #0x10\n"
"bge 11f\n"
"tbz x8, #3, 6f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"tbz x8, #2, 4f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"tbz x8, #1, 3f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"tbz x8, #0, 10f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"b 10f\n"
"3:" // Height 1: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 10f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"b 10f\n"
"4:" // Height 1: Partial accumulate: partial_2_8
"tbz x8, #1, 5f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"tbz x8, #0, 10f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"b 10f\n"
"5:" // Height 1: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 10f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"b 10f\n"
"6:" // Height 1: Partial accumulate: partial_4_0
"tbz x8, #2, 8f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"tbz x8, #1, 7f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"tbz x8, #0, 10f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"b 10f\n"
"7:" // Height 1: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 10f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"b 10f\n"
"8:" // Height 1: Partial accumulate: partial_2_0
"tbz x8, #1, 9f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"tbz x8, #0, 10f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"b 10f\n"
"9:" // Height 1: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"10:" // Height 1: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 13f\n"
"11:" // Height 1: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"b 13f\n"
"12:" // Height 1: no accumulate
"movi v8.4s, #0x0\n"
@@ -163,8 +163,8 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"mov x15, #0x0\n"
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -180,118 +180,118 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"blt 19f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 18f\n"
"17:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d17, [x17, #0x20]\n"
- "ldr x21, [x17, #0x28]\n"
+ "ldr d17, [x16, #0x20]\n"
+ "ldr x20, [x16, #0x28]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr d16, [x17, #0x30]\n"
- "add x13, x13, #0x10\n"
- "ldr x20, [x17, #0x38]\n"
- "sub x14, x14, #0x10\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x48]\n"
- "ldr x22, [x13, #0x8]\n"
- "cmp x14, #0x20\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x38]\n"
"mov v16.d[1], x20\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
- "ldr d17, [x17, #0x40]\n"
+ "ldr d17, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr d16, [x17, #0x50]\n"
- "ldr x20, [x17, #0x58]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x68]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
+ "ldr d16, [x16, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
"mov v16.d[1], x20\n"
".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
- "ldr d17, [x17, #0x60]\n"
+ "ldr d17, [x16, #0x60]\n"
+ "ldr x20, [x16, #0x68]\n"
".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr d16, [x17, #0x70]\n"
- "ldr x20, [x17, #0x78]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x88]\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x78]\n"
"mov v16.d[1], x20\n"
".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
- "ldr d17, [x17, #0x80]\n"
+ "ldr d17, [x16, #0x80]\n"
+ "ldr x20, [x16, #0x88]\n"
".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr d16, [x17, #0x90]\n"
- "ldr x20, [x17, #0x98]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xa8]\n"
+ "ldr d16, [x16, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
"mov v16.d[1], x20\n"
".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
- "ldr d17, [x17, #0xa0]\n"
+ "ldr d17, [x16, #0xa0]\n"
+ "ldr x20, [x16, #0xa8]\n"
".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr d16, [x17, #0xb0]\n"
- "ldr x20, [x17, #0xb8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xc8]\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xb8]\n"
"mov v16.d[1], x20\n"
".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
- "ldr d17, [x17, #0xc0]\n"
+ "ldr d17, [x16, #0xc0]\n"
+ "ldr x20, [x16, #0xc8]\n"
".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr d16, [x17, #0xd0]\n"
- "ldr x20, [x17, #0xd8]\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0xe8]\n"
+ "ldr d16, [x16, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
"mov v16.d[1], x20\n"
".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
- "ldr d17, [x17, #0xe0]\n"
+ "ldr d17, [x16, #0xe0]\n"
+ "ldr x20, [x16, #0xe8]\n"
".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr d16, [x17, #0xf0]\n"
- "ldr x20, [x17, #0xf8]\n"
- "add x17, x17, #0x100\n"
- "mov v17.d[1], x21\n"
- "ldr x21, [x17, #0x8]\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xf8]\n"
"mov v16.d[1], x20\n"
+ "add x13, x13, #0x10\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
+ "ldr x20, [x16, #0x8]\n"
".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
- "ldr x20, [x17, #0x18]\n"
- "mov v6.d[1], x21\n"
- "mov v0.d[1], x22\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x21\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
- "add x13, x13, #0x10\n"
- "sub x14, x14, #0x10\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x17, #0x40]\n"
+ "ldr q17, [x16, #0x40]\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x17, #0x50]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
+ "ldr q16, [x16, #0x50]\n"
".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x17, #0x60]\n"
+ "ldr q17, [x16, #0x60]\n"
".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x17, #0x70]\n"
+ "ldr q16, [x16, #0x70]\n"
".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
- "ldr q17, [x17, #0x80]\n"
+ "ldr q17, [x16, #0x80]\n"
".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr q16, [x17, #0x90]\n"
+ "ldr q16, [x16, #0x90]\n"
".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x17, #0xa0]\n"
+ "ldr q17, [x16, #0xa0]\n"
".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x17, #0xb0]\n"
+ "ldr q16, [x16, #0xb0]\n"
".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
- "ldr q17, [x17, #0xc0]\n"
+ "ldr q17, [x16, #0xc0]\n"
".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr q16, [x17, #0xd0]\n"
+ "ldr q16, [x16, #0xd0]\n"
".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
- "ldr q17, [x17, #0xe0]\n"
+ "ldr q17, [x16, #0xe0]\n"
".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr q16, [x17, #0xf0]\n"
- "add x17, x17, #0x100\n"
+ "ldr q16, [x16, #0xf0]\n"
+ "add x13, x13, #0x10\n"
+ "sub x14, x14, #0x10\n"
".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ "add x16, x16, #0x100\n"
"19:" // Height 1: Multiply loop: Main loop skip
"cbz x14, 24f\n"
"cmp x14, #0x4\n"
@@ -299,16 +299,16 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"20:" // Height 1: Multiply loop: Odd block loop
"ldr s18, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr q17, [x17, #0x0]\n"
- "cmp x14, #0x4\n"
- "ldr q16, [x17, #0x10]\n"
- ".inst 0x6f92e228 // udot v8.4s, v17.16b, v18.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x6f92e208 // udot v8.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
".inst 0x6f92e209 // udot v9.4s, v16.16b, v18.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
- "add x17, x17, #0x40\n"
+ "ldr q17, [x16, #0x20]\n"
+ "cmp x14, #0x4\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x6f92e22a // udot v10.4s, v17.16b, v18.4b[0]\n"
".inst 0x6f92e20b // udot v11.4s, v16.16b, v18.4b[0]\n"
+ "add x16, x16, #0x40\n"
"bge 20b\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
"cbz x14, 24f\n"
@@ -320,165 +320,165 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q17, [x17, #0x0]\n"
- "ldr q16, [x17, #0x10]\n"
- ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x6f80e208 // udot v8.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
- "add x17, x17, #0x40\n"
- ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x20]\n"
+ ".inst 0x6f80e20a // udot v10.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "add x16, x16, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
"cmp x15, x20\n"
"bne 14b\n"
"cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
"bge 33f\n"
"tbz x8, #3, 28f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"tbz x8, #2, 26f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"tbz x8, #1, 25f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"tbz x8, #0, 32f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"b 32f\n"
"25:" // Height 1: Partial direct writeback: partial_1_12
"tbz x8, #0, 32f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"b 32f\n"
"26:" // Height 1: Partial direct writeback: partial_2_8
"tbz x8, #1, 27f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"tbz x8, #0, 32f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"b 32f\n"
"27:" // Height 1: Partial direct writeback: partial_1_8
"tbz x8, #0, 32f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"b 32f\n"
"28:" // Height 1: Partial direct writeback: partial_4_0
"tbz x8, #2, 30f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"tbz x8, #1, 29f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"tbz x8, #0, 32f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"b 32f\n"
"29:" // Height 1: Partial direct writeback: partial_1_4
"tbz x8, #0, 32f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"b 32f\n"
"30:" // Height 1: Partial direct writeback: partial_2_0
"tbz x8, #1, 31f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"tbz x8, #0, 32f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"b 32f\n"
"31:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"32:" // Height 1: Partial direct writeback: Done
"b 34f\n"
"33:" // Height 1: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"34:" // Height 1: Writeback done
"subs x8, x8, #0x10\n"
"bgt 2b\n"
"b 206f\n"
"35:" // Height 2
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"36:" // Height 2: Column loop
"tbz %x[flags], #0, 46f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x8, #0x10\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"bge 45f\n"
"tbz x8, #3, 40f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"tbz x8, #2, 38f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"ld1 { v14.4s }, [x24], #0x10\n"
"tbz x8, #1, 37f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"ldr d15, [x24], #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"ld1 { v15.s }[2], [x24]\n"
"b 44f\n"
"37:" // Height 2: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 44f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"ldr s15, [x24, #0x0]\n"
"b 44f\n"
"38:" // Height 2: Partial accumulate: partial_2_8
"tbz x8, #1, 39f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"ldr d14, [x24], #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"ld1 { v14.s }[2], [x24]\n"
"b 44f\n"
"39:" // Height 2: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 44f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"ldr s14, [x24, #0x0]\n"
"b 44f\n"
"40:" // Height 2: Partial accumulate: partial_4_0
"tbz x8, #2, 42f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"tbz x8, #1, 41f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"ldr d13, [x24], #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"ld1 { v13.s }[2], [x24]\n"
"b 44f\n"
"41:" // Height 2: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 44f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"ldr s13, [x24, #0x0]\n"
"b 44f\n"
"42:" // Height 2: Partial accumulate: partial_2_0
"tbz x8, #1, 43f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"ldr d12, [x24], #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"ld1 { v12.s }[2], [x24]\n"
"b 44f\n"
"43:" // Height 2: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"ldr s12, [x24, #0x0]\n"
"44:" // Height 2: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 47f\n"
"45:" // Height 2: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"ldr q12, [x24, #0x0]\n"
"ldr q13, [x24, #0x10]\n"
"ldr q14, [x24, #0x20]\n"
@@ -497,8 +497,8 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"mov x15, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -518,154 +518,154 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
"ldr q1, [x12, #0x0]\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x17, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr d17, [x17, #0x20]\n"
+ "ldr d17, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d16, [x17, #0x30]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x48]\n"
- "add x13, x13, #0x10\n"
- "add x12, x12, #0x10\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x21\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "mov v16.d[1], x20\n"
".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
- "ldr d17, [x17, #0x40]\n"
+ "ldr d17, [x16, #0x40]\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x58]\n"
+ "ldr x20, [x16, #0x48]\n"
".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
- "ldr d16, [x17, #0x50]\n"
+ "ldr d16, [x16, #0x50]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x68]\n"
- "ldr x23, [x13, #0x8]\n"
- "sub x14, x14, #0x10\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v16.d[1], x20\n"
".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
- "ldr d17, [x17, #0x60]\n"
+ "ldr d17, [x16, #0x60]\n"
".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
- "ldr x21, [x17, #0x78]\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
- "ldr d16, [x17, #0x70]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0x88]\n"
- "ldr x22, [x12, #0x8]\n"
- "cmp x14, #0x20\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x21\n"
".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "mov v16.d[1], x20\n"
".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
- "ldr d17, [x17, #0x80]\n"
+ "ldr d17, [x16, #0x80]\n"
".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
- "ldr x21, [x17, #0x98]\n"
+ "ldr x20, [x16, #0x88]\n"
".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
- "ldr d16, [x17, #0x90]\n"
+ "ldr d16, [x16, #0x90]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xa8]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x16, #0x98]\n"
+ "mov v16.d[1], x20\n"
".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
- "ldr d17, [x17, #0xa0]\n"
+ "ldr d17, [x16, #0xa0]\n"
".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
- "ldr x21, [x17, #0xb8]\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
- "ldr d16, [x17, #0xb0]\n"
- "mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xc8]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x21\n"
".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "mov v16.d[1], x20\n"
".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
- "ldr d17, [x17, #0xc0]\n"
+ "ldr d17, [x16, #0xc0]\n"
".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
- "ldr x21, [x17, #0xd8]\n"
+ "ldr x20, [x16, #0xc8]\n"
".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
- "ldr d16, [x17, #0xd0]\n"
+ "ldr d16, [x16, #0xd0]\n"
"mov v17.d[1], x20\n"
- "ldr x20, [x17, #0xe8]\n"
- "mov v16.d[1], x21\n"
+ "ldr x20, [x16, #0xd8]\n"
+ "mov v16.d[1], x20\n"
".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
- "ldr d17, [x17, #0xe0]\n"
+ "ldr d17, [x16, #0xe0]\n"
".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
- "ldr x21, [x17, #0xf8]\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
- "ldr d16, [x17, #0xf0]\n"
- "mov v17.d[1], x20\n"
- "add x17, x17, #0x100\n"
- "ldr x20, [x17, #0x8]\n"
- "mov v16.d[1], x21\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x21\n"
+ "add x13, x13, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
"ldr d1, [x12, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x20\n"
- "ldr x20, [x17, #0x18]\n"
- "mov v0.d[1], x23\n"
- "mov v1.d[1], x22\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v1.d[1], x21\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x17, #0x40]\n"
+ "ldr q17, [x16, #0x40]\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x17, #0x50]\n"
+ "ldr q16, [x16, #0x50]\n"
".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x17, #0x60]\n"
+ "ldr q17, [x16, #0x60]\n"
".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x17, #0x70]\n"
+ "ldr q16, [x16, #0x70]\n"
".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
- "ldr q17, [x17, #0x80]\n"
+ "ldr q17, [x16, #0x80]\n"
".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
- "ldr q16, [x17, #0x90]\n"
+ "ldr q16, [x16, #0x90]\n"
".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x17, #0xa0]\n"
+ "ldr q17, [x16, #0xa0]\n"
".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x17, #0xb0]\n"
+ "ldr q16, [x16, #0xb0]\n"
".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
- "ldr q17, [x17, #0xc0]\n"
+ "ldr q17, [x16, #0xc0]\n"
".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
- "ldr q16, [x17, #0xd0]\n"
+ "ldr q16, [x16, #0xd0]\n"
".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
- "ldr q17, [x17, #0xe0]\n"
+ "ldr q17, [x16, #0xe0]\n"
".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
- "ldr q16, [x17, #0xf0]\n"
+ "ldr q16, [x16, #0xf0]\n"
".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
- "add x17, x17, #0x100\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
@@ -678,16 +678,16 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"sub x14, x14, #0x4\n"
"ldr s18, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr q17, [x17, #0x0]\n"
- "ldr q16, [x17, #0x10]\n"
+ "ldr q17, [x16, #0x0]\n"
".inst 0x6f93e228 // udot v8.4s, v17.16b, v19.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
".inst 0x6f92e22c // udot v12.4s, v17.16b, v18.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x6f93e209 // udot v9.4s, v16.16b, v19.4b[0]\n"
".inst 0x6f92e20d // udot v13.4s, v16.16b, v18.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x6f93e22a // udot v10.4s, v17.16b, v19.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x6f92e22e // udot v14.4s, v17.16b, v18.4b[0]\n"
".inst 0x6f93e20b // udot v11.4s, v16.16b, v19.4b[0]\n"
".inst 0x6f92e20f // udot v15.4s, v16.16b, v18.4b[0]\n"
@@ -705,16 +705,16 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr b0, [x13, #0x0]\n"
"ldr b1, [x12, #0x0]\n"
"57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q17, [x17, #0x0]\n"
- "ldr q16, [x17, #0x10]\n"
+ "ldr q17, [x16, #0x0]\n"
".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
".inst 0x6f81e22c // udot v12.4s, v17.16b, v1.4b[0]\n"
- "ldr q17, [x17, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
".inst 0x6f81e20d // udot v13.4s, v16.16b, v1.4b[0]\n"
- "ldr q16, [x17, #0x30]\n"
+ "ldr q16, [x16, #0x30]\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
@@ -724,79 +724,79 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"cmp x15, x20\n"
"bne 48b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
"cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x24, x16, x20, LSL #2\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
"bge 67f\n"
"tbz x8, #3, 62f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v13.4s }, [x24], #0x10\n"
"tbz x8, #2, 60f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"st1 { v14.4s }, [x24], #0x10\n"
"tbz x8, #1, 59f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"str d15, [x24], #0x8\n"
"tbz x8, #0, 66f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"st1 { v15.s }[2], [x24]\n"
"b 66f\n"
"59:" // Height 2: Partial direct writeback: partial_1_12
"tbz x8, #0, 66f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"str s15, [x24, #0x0]\n"
"b 66f\n"
"60:" // Height 2: Partial direct writeback: partial_2_8
"tbz x8, #1, 61f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"str d14, [x24], #0x8\n"
"tbz x8, #0, 66f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"st1 { v14.s }[2], [x24]\n"
"b 66f\n"
"61:" // Height 2: Partial direct writeback: partial_1_8
"tbz x8, #0, 66f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"str s14, [x24, #0x0]\n"
"b 66f\n"
"62:" // Height 2: Partial direct writeback: partial_4_0
"tbz x8, #2, 64f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"tbz x8, #1, 63f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"str d13, [x24], #0x8\n"
"tbz x8, #0, 66f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"st1 { v13.s }[2], [x24]\n"
"b 66f\n"
"63:" // Height 2: Partial direct writeback: partial_1_4
"tbz x8, #0, 66f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"str s13, [x24, #0x0]\n"
"b 66f\n"
"64:" // Height 2: Partial direct writeback: partial_2_0
"tbz x8, #1, 65f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"str d12, [x24], #0x8\n"
"tbz x8, #0, 66f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"st1 { v12.s }[2], [x24]\n"
"b 66f\n"
"65:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"str s12, [x24, #0x0]\n"
"66:" // Height 2: Partial direct writeback: Done
"b 68f\n"
"67:" // Height 2: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"str q12, [x24, #0x0]\n"
"str q13, [x24, #0x10]\n"
"str q14, [x24, #0x20]\n"
@@ -807,107 +807,107 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 206f\n"
"69:" // Height 3
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"70:" // Height 3: Column loop
"tbz %x[flags], #0, 80f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
"cmp x8, #0x10\n"
- "add x24, x16, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"bge 79f\n"
"tbz x8, #3, 74f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"ld1 { v17.4s }, [x23], #0x10\n"
"tbz x8, #2, 72f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"ld1 { v14.4s }, [x24], #0x10\n"
"ld1 { v18.4s }, [x23], #0x10\n"
"tbz x8, #1, 71f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
"tbz x8, #0, 78f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"ld1 { v15.s }[2], [x24]\n"
"ld1 { v19.s }[2], [x23]\n"
"b 78f\n"
"71:" // Height 3: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 78f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"ldr s15, [x24, #0x0]\n"
"ldr s19, [x23, #0x0]\n"
"b 78f\n"
"72:" // Height 3: Partial accumulate: partial_2_8
"tbz x8, #1, 73f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"ldr d14, [x24], #0x8\n"
"ldr d18, [x23], #0x8\n"
"tbz x8, #0, 78f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"ld1 { v14.s }[2], [x24]\n"
"ld1 { v18.s }[2], [x23]\n"
"b 78f\n"
"73:" // Height 3: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 78f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"ldr s14, [x24, #0x0]\n"
"ldr s18, [x23, #0x0]\n"
"b 78f\n"
"74:" // Height 3: Partial accumulate: partial_4_0
"tbz x8, #2, 76f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"tbz x8, #1, 75f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
"tbz x8, #0, 78f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"ld1 { v13.s }[2], [x24]\n"
"ld1 { v17.s }[2], [x23]\n"
"b 78f\n"
"75:" // Height 3: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 78f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"ldr s13, [x24, #0x0]\n"
"ldr s17, [x23, #0x0]\n"
"b 78f\n"
"76:" // Height 3: Partial accumulate: partial_2_0
"tbz x8, #1, 77f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"ldr d12, [x24], #0x8\n"
"ldr d16, [x23], #0x8\n"
"tbz x8, #0, 78f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"ld1 { v12.s }[2], [x24]\n"
"ld1 { v16.s }[2], [x23]\n"
"b 78f\n"
"77:" // Height 3: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"ldr s12, [x24, #0x0]\n"
"ldr s16, [x23, #0x0]\n"
"78:" // Height 3: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 81f\n"
"79:" // Height 3: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"ldr q12, [x24, #0x0]\n"
"ldr q13, [x24, #0x10]\n"
"ldr q14, [x24, #0x20]\n"
@@ -934,8 +934,8 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"mov x15, #0x0\n"
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -959,123 +959,123 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"cmp x14, #0x20\n"
"ldr q1, [x12, #0x0]\n"
"ldr q2, [x11, #0x0]\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 86f\n"
"85:" // Height 3: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x17, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr d21, [x17, #0x20]\n"
+ "ldr d21, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x13, x13, #0x10\n"
+ "mov v21.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr d20, [x17, #0x30]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x48]\n"
- "add x11, x11, #0x10\n"
- "ldr x24, [x13, #0x8]\n"
+ "ldr d20, [x16, #0x30]\n"
"mov v20.d[1], x20\n"
".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
- "ldr x20, [x17, #0x58]\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
- "ldr d21, [x17, #0x40]\n"
+ "ldr d21, [x16, #0x40]\n"
".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
- "ldr x23, [x12, #0x8]\n"
+ "mov v21.d[1], x21\n"
".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
- "ldr x22, [x11, #0x8]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
- "ldr d20, [x17, #0x50]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x68]\n"
- "sub x14, x14, #0x10\n"
- "prfm pldl1keep, [x13, #0x80]\n"
+ "ldr d20, [x16, #0x50]\n"
"mov v20.d[1], x20\n"
".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
- "ldr x20, [x17, #0x78]\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
- "ldr d21, [x17, #0x60]\n"
+ "ldr d21, [x16, #0x60]\n"
".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
- "cmp x14, #0x20\n"
+ "mov v21.d[1], x21\n"
".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
- "ldr d20, [x17, #0x70]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0x88]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr d20, [x16, #0x70]\n"
"mov v20.d[1], x20\n"
".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
- "ldr x20, [x17, #0x98]\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
- "ldr d21, [x17, #0x80]\n"
+ "ldr d21, [x16, #0x80]\n"
".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
- "ldr d20, [x17, #0x90]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xa8]\n"
+ "ldr d20, [x16, #0x90]\n"
"mov v20.d[1], x20\n"
".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
- "ldr x20, [x17, #0xb8]\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
- "ldr d21, [x17, #0xa0]\n"
+ "ldr d21, [x16, #0xa0]\n"
".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
- "ldr d20, [x17, #0xb0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xc8]\n"
+ "ldr d20, [x16, #0xb0]\n"
"mov v20.d[1], x20\n"
".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
- "ldr x20, [x17, #0xd8]\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
- "ldr d21, [x17, #0xc0]\n"
+ "ldr d21, [x16, #0xc0]\n"
".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
- "ldr d20, [x17, #0xd0]\n"
- "mov v21.d[1], x21\n"
- "ldr x21, [x17, #0xe8]\n"
+ "ldr d20, [x16, #0xd0]\n"
"mov v20.d[1], x20\n"
".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
- "ldr x20, [x17, #0xf8]\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
- "ldr d21, [x17, #0xe0]\n"
+ "ldr d21, [x16, #0xe0]\n"
".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
+ "mov v21.d[1], x21\n"
".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
+ "add x13, x13, #0x10\n"
".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
- "ldr d20, [x17, #0xf0]\n"
- "mov v21.d[1], x21\n"
- "add x17, x17, #0x100\n"
- "ldr x21, [x17, #0x8]\n"
+ "ldr d20, [x16, #0xf0]\n"
"mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0x8]\n"
".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
- "ldr x20, [x17, #0x18]\n"
+ "ldr x23, [x13, #0x8]\n"
".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
"ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
"ldr d2, [x11, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x21\n"
- "mov v0.d[1], x24\n"
- "mov v1.d[1], x23\n"
- "mov v2.d[1], x22\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x23\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"mov v7.d[1], x20\n"
"bge 85b\n"
"86:" // Height 3: Multiply loop: Single iteration only
@@ -1084,66 +1084,66 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q21, [x17, #0x20]\n"
+ "ldr q21, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"add x11, x11, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q20, [x17, #0x30]\n"
+ "ldr q20, [x16, #0x30]\n"
".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x17, #0x40]\n"
+ "ldr q21, [x16, #0x40]\n"
".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x17, #0x50]\n"
+ "ldr q20, [x16, #0x50]\n"
".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x17, #0x60]\n"
+ "ldr q21, [x16, #0x60]\n"
".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x17, #0x70]\n"
+ "ldr q20, [x16, #0x70]\n"
".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
- "ldr q21, [x17, #0x80]\n"
+ "ldr q21, [x16, #0x80]\n"
".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
- "ldr q20, [x17, #0x90]\n"
+ "ldr q20, [x16, #0x90]\n"
".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x17, #0xa0]\n"
+ "ldr q21, [x16, #0xa0]\n"
".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x17, #0xb0]\n"
+ "ldr q20, [x16, #0xb0]\n"
".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
- "ldr q21, [x17, #0xc0]\n"
+ "ldr q21, [x16, #0xc0]\n"
".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
- "ldr q20, [x17, #0xd0]\n"
+ "ldr q20, [x16, #0xd0]\n"
".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
- "ldr q21, [x17, #0xe0]\n"
+ "ldr q21, [x16, #0xe0]\n"
".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
- "ldr q20, [x17, #0xf0]\n"
+ "ldr q20, [x16, #0xf0]\n"
".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
- "add x17, x17, #0x100\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
@@ -1159,18 +1159,18 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr s23, [x12], #0x4\n"
"cmp x14, #0x4\n"
"ldr s22, [x11], #0x4\n"
- "ldr q21, [x17, #0x0]\n"
- "ldr q20, [x17, #0x10]\n"
+ "ldr q21, [x16, #0x0]\n"
".inst 0x6f98e2a8 // udot v8.4s, v21.16b, v24.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
".inst 0x6f97e2ac // udot v12.4s, v21.16b, v23.4b[0]\n"
".inst 0x6f96e2b0 // udot v16.4s, v21.16b, v22.4b[0]\n"
- "ldr q21, [x17, #0x20]\n"
+ "ldr q21, [x16, #0x20]\n"
".inst 0x6f98e289 // udot v9.4s, v20.16b, v24.4b[0]\n"
".inst 0x6f97e28d // udot v13.4s, v20.16b, v23.4b[0]\n"
".inst 0x6f96e291 // udot v17.4s, v20.16b, v22.4b[0]\n"
- "ldr q20, [x17, #0x30]\n"
+ "ldr q20, [x16, #0x30]\n"
".inst 0x6f98e2aa // udot v10.4s, v21.16b, v24.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x6f97e2ae // udot v14.4s, v21.16b, v23.4b[0]\n"
".inst 0x6f96e2b2 // udot v18.4s, v21.16b, v22.4b[0]\n"
".inst 0x6f98e28b // udot v11.4s, v20.16b, v24.4b[0]\n"
@@ -1193,18 +1193,18 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr b1, [x12, #0x0]\n"
"ldr b2, [x11, #0x0]\n"
"91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q21, [x17, #0x0]\n"
- "ldr q20, [x17, #0x10]\n"
+ "ldr q21, [x16, #0x0]\n"
".inst 0x6f80e2a8 // udot v8.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
".inst 0x6f81e2ac // udot v12.4s, v21.16b, v1.4b[0]\n"
".inst 0x6f82e2b0 // udot v16.4s, v21.16b, v2.4b[0]\n"
- "ldr q21, [x17, #0x20]\n"
+ "ldr q21, [x16, #0x20]\n"
".inst 0x6f80e289 // udot v9.4s, v20.16b, v0.4b[0]\n"
".inst 0x6f81e28d // udot v13.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f82e291 // udot v17.4s, v20.16b, v2.4b[0]\n"
- "ldr q20, [x17, #0x30]\n"
+ "ldr q20, [x16, #0x30]\n"
".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
@@ -1216,97 +1216,97 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"cmp x15, x20\n"
"bne 82b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"bge 101f\n"
"tbz x8, #3, 96f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v13.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
"st1 { v17.4s }, [x23], #0x10\n"
"tbz x8, #2, 94f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"st1 { v14.4s }, [x24], #0x10\n"
"st1 { v18.4s }, [x23], #0x10\n"
"tbz x8, #1, 93f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"str d15, [x24], #0x8\n"
"str d19, [x23], #0x8\n"
"tbz x8, #0, 100f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"st1 { v15.s }[2], [x24]\n"
"st1 { v19.s }[2], [x23]\n"
"b 100f\n"
"93:" // Height 3: Partial direct writeback: partial_1_12
"tbz x8, #0, 100f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"str s15, [x24, #0x0]\n"
"str s19, [x23, #0x0]\n"
"b 100f\n"
"94:" // Height 3: Partial direct writeback: partial_2_8
"tbz x8, #1, 95f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"str d14, [x24], #0x8\n"
"str d18, [x23], #0x8\n"
"tbz x8, #0, 100f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"st1 { v14.s }[2], [x24]\n"
"st1 { v18.s }[2], [x23]\n"
"b 100f\n"
"95:" // Height 3: Partial direct writeback: partial_1_8
"tbz x8, #0, 100f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"str s14, [x24, #0x0]\n"
"str s18, [x23, #0x0]\n"
"b 100f\n"
"96:" // Height 3: Partial direct writeback: partial_4_0
"tbz x8, #2, 98f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
"tbz x8, #1, 97f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"str d13, [x24], #0x8\n"
"str d17, [x23], #0x8\n"
"tbz x8, #0, 100f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"st1 { v13.s }[2], [x24]\n"
"st1 { v17.s }[2], [x23]\n"
"b 100f\n"
"97:" // Height 3: Partial direct writeback: partial_1_4
"tbz x8, #0, 100f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"str s13, [x24, #0x0]\n"
"str s17, [x23, #0x0]\n"
"b 100f\n"
"98:" // Height 3: Partial direct writeback: partial_2_0
"tbz x8, #1, 99f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"str d12, [x24], #0x8\n"
"str d16, [x23], #0x8\n"
"tbz x8, #0, 100f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"st1 { v12.s }[2], [x24]\n"
"st1 { v16.s }[2], [x23]\n"
"b 100f\n"
"99:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"str s12, [x24, #0x0]\n"
"str s16, [x23, #0x0]\n"
"100:" // Height 3: Partial direct writeback: Done
"b 102f\n"
"101:" // Height 3: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"str q12, [x24, #0x0]\n"
"str q13, [x24, #0x10]\n"
"str q14, [x24, #0x20]\n"
@@ -1321,38 +1321,38 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 206f\n"
"103:" // Height 4
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"104:" // Height 4: Column loop
"tbz %x[flags], #0, 114f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
"add x22, x23, x20, LSL #2\n"
"bge 113f\n"
"tbz x8, #3, 108f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"ld1 { v17.4s }, [x23], #0x10\n"
"ld1 { v21.4s }, [x22], #0x10\n"
"tbz x8, #2, 106f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"ld1 { v14.4s }, [x24], #0x10\n"
"ld1 { v18.4s }, [x23], #0x10\n"
"ld1 { v22.4s }, [x22], #0x10\n"
"tbz x8, #1, 105f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
"ldr d23, [x22], #0x8\n"
"tbz x8, #0, 112f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"ld1 { v15.s }[2], [x24]\n"
"ld1 { v19.s }[2], [x23]\n"
"ld1 { v23.s }[2], [x22]\n"
@@ -1360,20 +1360,20 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"105:" // Height 4: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 112f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"ldr s15, [x24, #0x0]\n"
"ldr s19, [x23, #0x0]\n"
"ldr s23, [x22, #0x0]\n"
"b 112f\n"
"106:" // Height 4: Partial accumulate: partial_2_8
"tbz x8, #1, 107f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"ldr d14, [x24], #0x8\n"
"ldr d18, [x23], #0x8\n"
"ldr d22, [x22], #0x8\n"
"tbz x8, #0, 112f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"ld1 { v14.s }[2], [x24]\n"
"ld1 { v18.s }[2], [x23]\n"
"ld1 { v22.s }[2], [x22]\n"
@@ -1381,25 +1381,25 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"107:" // Height 4: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 112f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"ldr s14, [x24, #0x0]\n"
"ldr s18, [x23, #0x0]\n"
"ldr s22, [x22, #0x0]\n"
"b 112f\n"
"108:" // Height 4: Partial accumulate: partial_4_0
"tbz x8, #2, 110f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"tbz x8, #1, 109f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
"ldr d21, [x22], #0x8\n"
"tbz x8, #0, 112f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"ld1 { v13.s }[2], [x24]\n"
"ld1 { v17.s }[2], [x23]\n"
"ld1 { v21.s }[2], [x22]\n"
@@ -1407,38 +1407,38 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"109:" // Height 4: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 112f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"ldr s13, [x24, #0x0]\n"
"ldr s17, [x23, #0x0]\n"
"ldr s21, [x22, #0x0]\n"
"b 112f\n"
"110:" // Height 4: Partial accumulate: partial_2_0
"tbz x8, #1, 111f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"ldr d12, [x24], #0x8\n"
"ldr d16, [x23], #0x8\n"
"ldr d20, [x22], #0x8\n"
"tbz x8, #0, 112f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"ld1 { v12.s }[2], [x24]\n"
"ld1 { v16.s }[2], [x23]\n"
"ld1 { v20.s }[2], [x22]\n"
"b 112f\n"
"111:" // Height 4: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"ldr s12, [x24, #0x0]\n"
"ldr s16, [x23, #0x0]\n"
"ldr s20, [x22, #0x0]\n"
"112:" // Height 4: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 115f\n"
"113:" // Height 4: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"ldr q12, [x24, #0x0]\n"
"ldr q13, [x24, #0x10]\n"
"ldr q14, [x24, #0x20]\n"
@@ -1473,8 +1473,8 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"mov x15, #0x0\n"
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1502,129 +1502,130 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr q1, [x12, #0x0]\n"
"ldr q2, [x11, #0x0]\n"
"ldr q3, [x10, #0x0]\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 120f\n"
"119:" // Height 4: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x17, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x21, [x17, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr d25, [x17, #0x20]\n"
+ "ldr d25, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "mov v25.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x11, x11, #0x10\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "mov v25.d[1], x20\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr d24, [x17, #0x30]\n"
+ "ldr d24, [x16, #0x30]\n"
+ "mov v24.d[1], x20\n"
".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
- "ldr x20, [x17, #0x48]\n"
".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
- "ldr d25, [x17, #0x40]\n"
+ "ldr d25, [x16, #0x40]\n"
".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x58]\n"
+ "mov v25.d[1], x21\n"
".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
- "ldr x25, [x13, #0x8]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
- "mov v25.d[1], x20\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
- "ldr d24, [x17, #0x50]\n"
+ "ldr d24, [x16, #0x50]\n"
+ "mov v24.d[1], x20\n"
".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
- "ldr x20, [x17, #0x68]\n"
".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
- "ldr x24, [x12, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
+ "ldr x25, [x13, #0x8]\n"
".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
- "ldr d25, [x17, #0x60]\n"
+ "ldr d25, [x16, #0x60]\n"
".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
- "ldr x21, [x17, #0x78]\n"
+ "mov v25.d[1], x21\n"
".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
- "ldr x23, [x11, #0x8]\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
- "mov v25.d[1], x20\n"
+ "ldr x24, [x12, #0x8]\n"
".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
- "ldr d24, [x17, #0x70]\n"
+ "ldr d24, [x16, #0x70]\n"
+ "mov v24.d[1], x20\n"
".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
- "ldr x20, [x17, #0x88]\n"
".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
- "ldr x22, [x10, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
+ "ldr x23, [x11, #0x8]\n"
".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
- "ldr d25, [x17, #0x80]\n"
+ "ldr d25, [x16, #0x80]\n"
".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
- "ldr x21, [x17, #0x98]\n"
+ "mov v25.d[1], x21\n"
".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
- "sub x14, x14, #0x10\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
- "mov v25.d[1], x20\n"
+ "ldr x22, [x10, #0x8]\n"
".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
- "ldr d24, [x17, #0x90]\n"
+ "ldr d24, [x16, #0x90]\n"
+ "mov v24.d[1], x20\n"
".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
- "ldr x20, [x17, #0xa8]\n"
".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
- "cmp x14, #0x20\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
+ "sub x14, x14, #0x10\n"
".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
- "ldr d25, [x17, #0xa0]\n"
+ "ldr d25, [x16, #0xa0]\n"
".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
- "ldr x21, [x17, #0xb8]\n"
+ "mov v25.d[1], x21\n"
".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
- "mov v25.d[1], x20\n"
+ "cmp x14, #0x20\n"
".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
- "ldr d24, [x17, #0xb0]\n"
+ "ldr d24, [x16, #0xb0]\n"
+ "mov v24.d[1], x20\n"
".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
- "ldr x20, [x17, #0xc8]\n"
".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
- "ldr d25, [x17, #0xc0]\n"
+ "ldr d25, [x16, #0xc0]\n"
".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
- "ldr x21, [x17, #0xd8]\n"
+ "mov v25.d[1], x21\n"
".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
- "mov v25.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
- "ldr d24, [x17, #0xd0]\n"
+ "ldr d24, [x16, #0xd0]\n"
+ "mov v24.d[1], x20\n"
".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
- "ldr x20, [x17, #0xe8]\n"
".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
- "ldr d25, [x17, #0xe0]\n"
+ "ldr d25, [x16, #0xe0]\n"
".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
- "ldr x21, [x17, #0xf8]\n"
+ "mov v25.d[1], x21\n"
".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
- "mov v25.d[1], x20\n"
".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
- "ldr d24, [x17, #0xf0]\n"
- "add x17, x17, #0x100\n"
+ "ldr d24, [x16, #0xf0]\n"
+ "mov v24.d[1], x20\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
- "ldr x20, [x17, #0x8]\n"
- "mov v24.d[1], x21\n"
+ "ldr x20, [x16, #0x18]\n"
".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
@@ -1633,9 +1634,8 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr d2, [x11, #0x0]\n"
".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
"ldr d3, [x10, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x20\n"
- "ldr x20, [x17, #0x18]\n"
+ "ldr d7, [x16, #0x10]\n"
+ "mov v6.d[1], x21\n"
"mov v0.d[1], x25\n"
"mov v1.d[1], x24\n"
"mov v2.d[1], x23\n"
@@ -1650,7 +1650,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
"add x11, x11, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q25, [x17, #0x20]\n"
+ "ldr q25, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"add x10, x10, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1658,7 +1658,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q24, [x17, #0x30]\n"
+ "ldr q24, [x16, #0x30]\n"
".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
@@ -1666,64 +1666,64 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
"prfm pldl1keep, [x10, #0x80]\n"
".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x17, #0x40]\n"
+ "ldr q25, [x16, #0x40]\n"
".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x17, #0x50]\n"
+ "ldr q24, [x16, #0x50]\n"
".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x17, #0x60]\n"
+ "ldr q25, [x16, #0x60]\n"
".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x17, #0x70]\n"
+ "ldr q24, [x16, #0x70]\n"
".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
- "ldr q25, [x17, #0x80]\n"
+ "ldr q25, [x16, #0x80]\n"
".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
- "ldr q24, [x17, #0x90]\n"
+ "ldr q24, [x16, #0x90]\n"
".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x17, #0xa0]\n"
+ "ldr q25, [x16, #0xa0]\n"
".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x17, #0xb0]\n"
+ "ldr q24, [x16, #0xb0]\n"
".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
- "ldr q25, [x17, #0xc0]\n"
+ "ldr q25, [x16, #0xc0]\n"
".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
- "ldr q24, [x17, #0xd0]\n"
+ "ldr q24, [x16, #0xd0]\n"
".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
- "ldr q25, [x17, #0xe0]\n"
+ "ldr q25, [x16, #0xe0]\n"
".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
- "ldr q24, [x17, #0xf0]\n"
+ "ldr q24, [x16, #0xf0]\n"
".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
- "add x17, x17, #0x100\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
@@ -1742,20 +1742,20 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"cmp x14, #0x4\n"
"ldr s27, [x11], #0x4\n"
"ldr s26, [x10], #0x4\n"
- "ldr q25, [x17, #0x0]\n"
- "ldr q24, [x17, #0x10]\n"
+ "ldr q25, [x16, #0x0]\n"
".inst 0x6f9de328 // udot v8.4s, v25.16b, v29.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
".inst 0x6f9ce32c // udot v12.4s, v25.16b, v28.4b[0]\n"
".inst 0x6f9be330 // udot v16.4s, v25.16b, v27.4b[0]\n"
".inst 0x6f9ae334 // udot v20.4s, v25.16b, v26.4b[0]\n"
- "ldr q25, [x17, #0x20]\n"
+ "ldr q25, [x16, #0x20]\n"
".inst 0x6f9de309 // udot v9.4s, v24.16b, v29.4b[0]\n"
".inst 0x6f9ce30d // udot v13.4s, v24.16b, v28.4b[0]\n"
".inst 0x6f9be311 // udot v17.4s, v24.16b, v27.4b[0]\n"
".inst 0x6f9ae315 // udot v21.4s, v24.16b, v26.4b[0]\n"
- "ldr q24, [x17, #0x30]\n"
+ "ldr q24, [x16, #0x30]\n"
".inst 0x6f9de32a // udot v10.4s, v25.16b, v29.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x6f9ce32e // udot v14.4s, v25.16b, v28.4b[0]\n"
".inst 0x6f9be332 // udot v18.4s, v25.16b, v27.4b[0]\n"
".inst 0x6f9ae336 // udot v22.4s, v25.16b, v26.4b[0]\n"
@@ -1783,20 +1783,20 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr b2, [x11, #0x0]\n"
"ldr b3, [x10, #0x0]\n"
"125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q25, [x17, #0x0]\n"
- "ldr q24, [x17, #0x10]\n"
+ "ldr q25, [x16, #0x0]\n"
".inst 0x6f80e328 // udot v8.4s, v25.16b, v0.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
".inst 0x6f81e32c // udot v12.4s, v25.16b, v1.4b[0]\n"
".inst 0x6f82e330 // udot v16.4s, v25.16b, v2.4b[0]\n"
".inst 0x6f83e334 // udot v20.4s, v25.16b, v3.4b[0]\n"
- "ldr q25, [x17, #0x20]\n"
+ "ldr q25, [x16, #0x20]\n"
".inst 0x6f80e309 // udot v9.4s, v24.16b, v0.4b[0]\n"
".inst 0x6f81e30d // udot v13.4s, v24.16b, v1.4b[0]\n"
".inst 0x6f82e311 // udot v17.4s, v24.16b, v2.4b[0]\n"
".inst 0x6f83e315 // udot v21.4s, v24.16b, v3.4b[0]\n"
- "ldr q24, [x17, #0x30]\n"
+ "ldr q24, [x16, #0x30]\n"
".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
@@ -1810,18 +1810,18 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"cmp x15, x20\n"
"bne 116b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
"bge 135f\n"
"tbz x8, #3, 130f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v13.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
@@ -1829,96 +1829,96 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"st1 { v20.4s }, [x22], #0x10\n"
"st1 { v21.4s }, [x22], #0x10\n"
"tbz x8, #2, 128f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"st1 { v14.4s }, [x24], #0x10\n"
"st1 { v18.4s }, [x23], #0x10\n"
"st1 { v22.4s }, [x22], #0x10\n"
"tbz x8, #1, 127f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"str d15, [x24], #0x8\n"
"str d19, [x23], #0x8\n"
"str d23, [x22], #0x8\n"
"tbz x8, #0, 134f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"st1 { v15.s }[2], [x24]\n"
"st1 { v19.s }[2], [x23]\n"
"st1 { v23.s }[2], [x22]\n"
"b 134f\n"
"127:" // Height 4: Partial direct writeback: partial_1_12
"tbz x8, #0, 134f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"str s15, [x24, #0x0]\n"
"str s19, [x23, #0x0]\n"
"str s23, [x22, #0x0]\n"
"b 134f\n"
"128:" // Height 4: Partial direct writeback: partial_2_8
"tbz x8, #1, 129f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"str d14, [x24], #0x8\n"
"str d18, [x23], #0x8\n"
"str d22, [x22], #0x8\n"
"tbz x8, #0, 134f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"st1 { v14.s }[2], [x24]\n"
"st1 { v18.s }[2], [x23]\n"
"st1 { v22.s }[2], [x22]\n"
"b 134f\n"
"129:" // Height 4: Partial direct writeback: partial_1_8
"tbz x8, #0, 134f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"str s14, [x24, #0x0]\n"
"str s18, [x23, #0x0]\n"
"str s22, [x22, #0x0]\n"
"b 134f\n"
"130:" // Height 4: Partial direct writeback: partial_4_0
"tbz x8, #2, 132f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
"st1 { v20.4s }, [x22], #0x10\n"
"tbz x8, #1, 131f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"str d13, [x24], #0x8\n"
"str d17, [x23], #0x8\n"
"str d21, [x22], #0x8\n"
"tbz x8, #0, 134f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"st1 { v13.s }[2], [x24]\n"
"st1 { v17.s }[2], [x23]\n"
"st1 { v21.s }[2], [x22]\n"
"b 134f\n"
"131:" // Height 4: Partial direct writeback: partial_1_4
"tbz x8, #0, 134f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"str s13, [x24, #0x0]\n"
"str s17, [x23, #0x0]\n"
"str s21, [x22, #0x0]\n"
"b 134f\n"
"132:" // Height 4: Partial direct writeback: partial_2_0
"tbz x8, #1, 133f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"str d12, [x24], #0x8\n"
"str d16, [x23], #0x8\n"
"str d20, [x22], #0x8\n"
"tbz x8, #0, 134f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"st1 { v12.s }[2], [x24]\n"
"st1 { v16.s }[2], [x23]\n"
"st1 { v20.s }[2], [x22]\n"
"b 134f\n"
"133:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"str s12, [x24, #0x0]\n"
"str s16, [x23, #0x0]\n"
"str s20, [x22, #0x0]\n"
"134:" // Height 4: Partial direct writeback: Done
"b 136f\n"
"135:" // Height 4: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"str q12, [x24, #0x0]\n"
"str q13, [x24, #0x10]\n"
"str q14, [x24, #0x20]\n"
@@ -1937,43 +1937,43 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 206f\n"
"137:" // Height 5
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"138:" // Height 5: Column loop
"tbz %x[flags], #0, 148f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
"add x21, x22, x20, LSL #2\n"
"bge 147f\n"
"tbz x8, #3, 142f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"ld1 { v17.4s }, [x23], #0x10\n"
"ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v25.4s }, [x21], #0x10\n"
"tbz x8, #2, 140f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"ld1 { v14.4s }, [x24], #0x10\n"
"ld1 { v18.4s }, [x23], #0x10\n"
"ld1 { v22.4s }, [x22], #0x10\n"
"ld1 { v26.4s }, [x21], #0x10\n"
"tbz x8, #1, 139f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
"ldr d23, [x22], #0x8\n"
"ldr d27, [x21], #0x8\n"
"tbz x8, #0, 146f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"ld1 { v15.s }[2], [x24]\n"
"ld1 { v19.s }[2], [x23]\n"
"ld1 { v23.s }[2], [x22]\n"
@@ -1982,7 +1982,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"139:" // Height 5: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 146f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"ldr s15, [x24, #0x0]\n"
"ldr s19, [x23, #0x0]\n"
"ldr s23, [x22, #0x0]\n"
@@ -1990,14 +1990,14 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 146f\n"
"140:" // Height 5: Partial accumulate: partial_2_8
"tbz x8, #1, 141f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"ldr d14, [x24], #0x8\n"
"ldr d18, [x23], #0x8\n"
"ldr d22, [x22], #0x8\n"
"ldr d26, [x21], #0x8\n"
"tbz x8, #0, 146f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"ld1 { v14.s }[2], [x24]\n"
"ld1 { v18.s }[2], [x23]\n"
"ld1 { v22.s }[2], [x22]\n"
@@ -2006,7 +2006,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"141:" // Height 5: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 146f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"ldr s14, [x24, #0x0]\n"
"ldr s18, [x23, #0x0]\n"
"ldr s22, [x22, #0x0]\n"
@@ -2014,20 +2014,20 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 146f\n"
"142:" // Height 5: Partial accumulate: partial_4_0
"tbz x8, #2, 144f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
"tbz x8, #1, 143f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
"ldr d21, [x22], #0x8\n"
"ldr d25, [x21], #0x8\n"
"tbz x8, #0, 146f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"ld1 { v13.s }[2], [x24]\n"
"ld1 { v17.s }[2], [x23]\n"
"ld1 { v21.s }[2], [x22]\n"
@@ -2036,7 +2036,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"143:" // Height 5: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 146f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"ldr s13, [x24, #0x0]\n"
"ldr s17, [x23, #0x0]\n"
"ldr s21, [x22, #0x0]\n"
@@ -2044,34 +2044,34 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 146f\n"
"144:" // Height 5: Partial accumulate: partial_2_0
"tbz x8, #1, 145f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"ldr d12, [x24], #0x8\n"
"ldr d16, [x23], #0x8\n"
"ldr d20, [x22], #0x8\n"
"ldr d24, [x21], #0x8\n"
"tbz x8, #0, 146f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"ld1 { v12.s }[2], [x24]\n"
"ld1 { v16.s }[2], [x23]\n"
"ld1 { v20.s }[2], [x22]\n"
"ld1 { v24.s }[2], [x21]\n"
"b 146f\n"
"145:" // Height 5: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"ldr s12, [x24, #0x0]\n"
"ldr s16, [x23, #0x0]\n"
"ldr s20, [x22, #0x0]\n"
"ldr s24, [x21, #0x0]\n"
"146:" // Height 5: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 149f\n"
"147:" // Height 5: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"ldr q12, [x24, #0x0]\n"
"ldr q13, [x24, #0x10]\n"
"ldr q14, [x24, #0x20]\n"
@@ -2114,8 +2114,8 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"mov x15, #0x0\n"
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2147,148 +2147,148 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr q2, [x11, #0x0]\n"
"ldr q3, [x10, #0x0]\n"
"ldr q4, [x9, #0x0]\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 154f\n"
"153:" // Height 5: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x17, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr d29, [x17, #0x20]\n"
+ "ldr d29, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x11, x11, #0x10\n"
+ "mov v29.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "mov v29.d[1], x21\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr x21, [x17, #0x48]\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr d28, [x17, #0x30]\n"
+ "ldr d28, [x16, #0x30]\n"
+ "mov v28.d[1], x20\n"
".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
- "add x9, x9, #0x10\n"
".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
- "ldr x26, [x13, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
- "ldr x20, [x17, #0x58]\n"
+ "ldr x26, [x13, #0x8]\n"
".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
- "ldr d29, [x17, #0x40]\n"
+ "ldr d29, [x16, #0x40]\n"
".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
- "ldr x25, [x12, #0x8]\n"
+ "mov v29.d[1], x21\n"
".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
- "ldr x24, [x11, #0x8]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
- "mov v29.d[1], x21\n"
+ "ldr x25, [x12, #0x8]\n"
".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
- "ldr x21, [x17, #0x68]\n"
+ "ldr x24, [x11, #0x8]\n"
".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
- "ldr d28, [x17, #0x50]\n"
+ "ldr d28, [x16, #0x50]\n"
+ "mov v28.d[1], x20\n"
".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
- "ldr x23, [x10, #0x8]\n"
".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
- "ldr x22, [x9, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
+ "ldr x23, [x10, #0x8]\n"
".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
- "ldr x20, [x17, #0x78]\n"
+ "ldr x22, [x9, #0x8]\n"
".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
- "ldr d29, [x17, #0x60]\n"
+ "ldr d29, [x16, #0x60]\n"
".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
- "sub x14, x14, #0x10\n"
+ "mov v29.d[1], x21\n"
".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
- "cmp x14, #0x20\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
- "mov v29.d[1], x21\n"
+ "sub x14, x14, #0x10\n"
".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
- "ldr x21, [x17, #0x88]\n"
+ "cmp x14, #0x20\n"
".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
- "ldr d28, [x17, #0x70]\n"
+ "ldr d28, [x16, #0x70]\n"
+ "mov v28.d[1], x20\n"
".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
- "ldr x20, [x17, #0x98]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
- "ldr d29, [x17, #0x80]\n"
+ "ldr d29, [x16, #0x80]\n"
".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v29.d[1], x21\n"
".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
- "mov v29.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
- "ldr x21, [x17, #0xa8]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
- "ldr d28, [x17, #0x90]\n"
+ "ldr d28, [x16, #0x90]\n"
+ "mov v28.d[1], x20\n"
".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
- "ldr x20, [x17, #0xb8]\n"
".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
- "ldr d29, [x17, #0xa0]\n"
+ "ldr d29, [x16, #0xa0]\n"
".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
- "mov v29.d[1], x21\n"
".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
- "ldr x21, [x17, #0xc8]\n"
".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
- "ldr d28, [x17, #0xb0]\n"
+ "ldr d28, [x16, #0xb0]\n"
+ "mov v28.d[1], x20\n"
".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
- "ldr x20, [x17, #0xd8]\n"
".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
- "ldr d29, [x17, #0xc0]\n"
+ "ldr d29, [x16, #0xc0]\n"
".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
- "mov v29.d[1], x21\n"
".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
- "ldr x21, [x17, #0xe8]\n"
".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
- "ldr d28, [x17, #0xd0]\n"
+ "ldr d28, [x16, #0xd0]\n"
+ "mov v28.d[1], x20\n"
".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
- "ldr x20, [x17, #0xf8]\n"
".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
- "ldr d29, [x17, #0xe0]\n"
+ "ldr d29, [x16, #0xe0]\n"
".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
+ "mov v29.d[1], x21\n"
".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
- "mov v29.d[1], x21\n"
".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
- "ldr d28, [x17, #0xf0]\n"
- "add x17, x17, #0x100\n"
+ "ldr d28, [x16, #0xf0]\n"
+ "mov v28.d[1], x20\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
- "ldr x21, [x17, #0x8]\n"
- "mov v28.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
- "ldr x20, [x17, #0x18]\n"
".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
@@ -2299,7 +2299,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr d3, [x10, #0x0]\n"
".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
"ldr d4, [x9, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr d7, [x16, #0x10]\n"
"mov v6.d[1], x21\n"
"mov v0.d[1], x26\n"
"mov v1.d[1], x25\n"
@@ -2318,7 +2318,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
"add x10, x10, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q29, [x17, #0x20]\n"
+ "ldr q29, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"add x9, x9, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2328,7 +2328,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q28, [x17, #0x30]\n"
+ "ldr q28, [x16, #0x30]\n"
".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
@@ -2337,75 +2337,75 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"prfm pldl1keep, [x9, #0x80]\n"
".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x17, #0x40]\n"
+ "ldr q29, [x16, #0x40]\n"
".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x17, #0x50]\n"
+ "ldr q28, [x16, #0x50]\n"
".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x17, #0x60]\n"
+ "ldr q29, [x16, #0x60]\n"
".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x17, #0x70]\n"
+ "ldr q28, [x16, #0x70]\n"
".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
- "ldr q29, [x17, #0x80]\n"
+ "ldr q29, [x16, #0x80]\n"
".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
- "ldr q28, [x17, #0x90]\n"
+ "ldr q28, [x16, #0x90]\n"
".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x17, #0xa0]\n"
+ "ldr q29, [x16, #0xa0]\n"
".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x17, #0xb0]\n"
+ "ldr q28, [x16, #0xb0]\n"
".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
- "ldr q29, [x17, #0xc0]\n"
+ "ldr q29, [x16, #0xc0]\n"
".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
- "ldr q28, [x17, #0xd0]\n"
+ "ldr q28, [x16, #0xd0]\n"
".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
- "ldr q29, [x17, #0xe0]\n"
+ "ldr q29, [x16, #0xe0]\n"
".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
- "ldr q28, [x17, #0xf0]\n"
+ "ldr q28, [x16, #0xf0]\n"
".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
- "add x17, x17, #0x100\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
@@ -2427,22 +2427,22 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr s0, [x11], #0x4\n"
"ldr s31, [x10], #0x4\n"
"ldr s30, [x9], #0x4\n"
- "ldr q29, [x17, #0x0]\n"
- "ldr q28, [x17, #0x10]\n"
+ "ldr q29, [x16, #0x0]\n"
".inst 0x6f82e3a8 // udot v8.4s, v29.16b, v2.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
".inst 0x6f80e3b0 // udot v16.4s, v29.16b, v0.4b[0]\n"
".inst 0x6f9fe3b4 // udot v20.4s, v29.16b, v31.4b[0]\n"
".inst 0x6f9ee3b8 // udot v24.4s, v29.16b, v30.4b[0]\n"
- "ldr q29, [x17, #0x20]\n"
+ "ldr q29, [x16, #0x20]\n"
".inst 0x6f82e389 // udot v9.4s, v28.16b, v2.4b[0]\n"
".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
".inst 0x6f80e391 // udot v17.4s, v28.16b, v0.4b[0]\n"
".inst 0x6f9fe395 // udot v21.4s, v28.16b, v31.4b[0]\n"
".inst 0x6f9ee399 // udot v25.4s, v28.16b, v30.4b[0]\n"
- "ldr q28, [x17, #0x30]\n"
+ "ldr q28, [x16, #0x30]\n"
".inst 0x6f82e3aa // udot v10.4s, v29.16b, v2.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
".inst 0x6f9fe3b6 // udot v22.4s, v29.16b, v31.4b[0]\n"
@@ -2475,22 +2475,22 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr b3, [x10, #0x0]\n"
"ldr b4, [x9, #0x0]\n"
"159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q29, [x17, #0x0]\n"
- "ldr q28, [x17, #0x10]\n"
+ "ldr q29, [x16, #0x0]\n"
".inst 0x6f80e3a8 // udot v8.4s, v29.16b, v0.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
".inst 0x6f82e3b0 // udot v16.4s, v29.16b, v2.4b[0]\n"
".inst 0x6f83e3b4 // udot v20.4s, v29.16b, v3.4b[0]\n"
".inst 0x6f84e3b8 // udot v24.4s, v29.16b, v4.4b[0]\n"
- "ldr q29, [x17, #0x20]\n"
+ "ldr q29, [x16, #0x20]\n"
".inst 0x6f80e389 // udot v9.4s, v28.16b, v0.4b[0]\n"
".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
".inst 0x6f82e391 // udot v17.4s, v28.16b, v2.4b[0]\n"
".inst 0x6f83e395 // udot v21.4s, v28.16b, v3.4b[0]\n"
".inst 0x6f84e399 // udot v25.4s, v28.16b, v4.4b[0]\n"
- "ldr q28, [x17, #0x30]\n"
+ "ldr q28, [x16, #0x30]\n"
".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
@@ -2506,20 +2506,20 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"cmp x15, x20\n"
"bne 150b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"add x21, x22, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
"bge 169f\n"
"tbz x8, #3, 164f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v13.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
@@ -2529,19 +2529,19 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"st1 { v24.4s }, [x21], #0x10\n"
"st1 { v25.4s }, [x21], #0x10\n"
"tbz x8, #2, 162f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"st1 { v14.4s }, [x24], #0x10\n"
"st1 { v18.4s }, [x23], #0x10\n"
"st1 { v22.4s }, [x22], #0x10\n"
"st1 { v26.4s }, [x21], #0x10\n"
"tbz x8, #1, 161f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"str d15, [x24], #0x8\n"
"str d19, [x23], #0x8\n"
"str d23, [x22], #0x8\n"
"str d27, [x21], #0x8\n"
"tbz x8, #0, 168f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"st1 { v15.s }[2], [x24]\n"
"st1 { v19.s }[2], [x23]\n"
"st1 { v23.s }[2], [x22]\n"
@@ -2549,7 +2549,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 168f\n"
"161:" // Height 5: Partial direct writeback: partial_1_12
"tbz x8, #0, 168f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"str s15, [x24, #0x0]\n"
"str s19, [x23, #0x0]\n"
"str s23, [x22, #0x0]\n"
@@ -2557,13 +2557,13 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 168f\n"
"162:" // Height 5: Partial direct writeback: partial_2_8
"tbz x8, #1, 163f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"str d14, [x24], #0x8\n"
"str d18, [x23], #0x8\n"
"str d22, [x22], #0x8\n"
"str d26, [x21], #0x8\n"
"tbz x8, #0, 168f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"st1 { v14.s }[2], [x24]\n"
"st1 { v18.s }[2], [x23]\n"
"st1 { v22.s }[2], [x22]\n"
@@ -2571,7 +2571,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 168f\n"
"163:" // Height 5: Partial direct writeback: partial_1_8
"tbz x8, #0, 168f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"str s14, [x24, #0x0]\n"
"str s18, [x23, #0x0]\n"
"str s22, [x22, #0x0]\n"
@@ -2579,19 +2579,19 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 168f\n"
"164:" // Height 5: Partial direct writeback: partial_4_0
"tbz x8, #2, 166f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
"st1 { v20.4s }, [x22], #0x10\n"
"st1 { v24.4s }, [x21], #0x10\n"
"tbz x8, #1, 165f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"str d13, [x24], #0x8\n"
"str d17, [x23], #0x8\n"
"str d21, [x22], #0x8\n"
"str d25, [x21], #0x8\n"
"tbz x8, #0, 168f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"st1 { v13.s }[2], [x24]\n"
"st1 { v17.s }[2], [x23]\n"
"st1 { v21.s }[2], [x22]\n"
@@ -2599,7 +2599,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 168f\n"
"165:" // Height 5: Partial direct writeback: partial_1_4
"tbz x8, #0, 168f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"str s13, [x24, #0x0]\n"
"str s17, [x23, #0x0]\n"
"str s21, [x22, #0x0]\n"
@@ -2607,20 +2607,20 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 168f\n"
"166:" // Height 5: Partial direct writeback: partial_2_0
"tbz x8, #1, 167f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"str d12, [x24], #0x8\n"
"str d16, [x23], #0x8\n"
"str d20, [x22], #0x8\n"
"str d24, [x21], #0x8\n"
"tbz x8, #0, 168f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"st1 { v12.s }[2], [x24]\n"
"st1 { v16.s }[2], [x23]\n"
"st1 { v20.s }[2], [x22]\n"
"st1 { v24.s }[2], [x21]\n"
"b 168f\n"
"167:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"str s12, [x24, #0x0]\n"
"str s16, [x23, #0x0]\n"
"str s20, [x22, #0x0]\n"
@@ -2628,11 +2628,11 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"168:" // Height 5: Partial direct writeback: Done
"b 170f\n"
"169:" // Height 5: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"str q12, [x24, #0x0]\n"
"str q13, [x24, #0x10]\n"
"str q14, [x24, #0x20]\n"
@@ -2656,43 +2656,42 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"171:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x18\n"
- "ldr x16, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x16\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"172:" // Height 6: Column loop
"tbz %x[flags], #0, 182f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
"add x20, x21, x20, LSL #2\n"
"bge 181f\n"
"tbz x8, #3, 176f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
"ld1 { v28.4s }, [x20], #0x10\n"
- "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"ld1 { v17.4s }, [x23], #0x10\n"
"ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v25.4s }, [x21], #0x10\n"
"ld1 { v29.4s }, [x20], #0x10\n"
"tbz x8, #2, 174f\n"
- "ld1 { v10.4s }, [x16], #0x10\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
"ld1 { v14.4s }, [x24], #0x10\n"
"ld1 { v18.4s }, [x23], #0x10\n"
"ld1 { v22.4s }, [x22], #0x10\n"
"ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v30.4s }, [x20], #0x10\n"
"tbz x8, #1, 173f\n"
- "ldr d11, [x16], #0x8\n"
+ "ldr d11, [x17], #0x8\n"
"mov x25, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
@@ -2700,7 +2699,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr d27, [x21], #0x8\n"
"ldr d31, [x20], #0x8\n"
"tbz x8, #0, 180f\n"
- "ld1 { v11.s }[2], [x16]\n"
+ "ld1 { v11.s }[2], [x17]\n"
"ld1 { v15.s }[2], [x24]\n"
"ld1 { v19.s }[2], [x23]\n"
"ld1 { v23.s }[2], [x22]\n"
@@ -2710,7 +2709,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"173:" // Height 6: Partial accumulate: partial_1_12
"mov x25, #0x30\n"
"tbz x8, #0, 180f\n"
- "ldr s11, [x16, #0x0]\n"
+ "ldr s11, [x17, #0x0]\n"
"ldr s15, [x24, #0x0]\n"
"ldr s19, [x23, #0x0]\n"
"ldr s23, [x22, #0x0]\n"
@@ -2719,7 +2718,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 180f\n"
"174:" // Height 6: Partial accumulate: partial_2_8
"tbz x8, #1, 175f\n"
- "ldr d10, [x16], #0x8\n"
+ "ldr d10, [x17], #0x8\n"
"mov x25, #0x28\n"
"ldr d14, [x24], #0x8\n"
"ldr d18, [x23], #0x8\n"
@@ -2727,7 +2726,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr d26, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
"tbz x8, #0, 180f\n"
- "ld1 { v10.s }[2], [x16]\n"
+ "ld1 { v10.s }[2], [x17]\n"
"ld1 { v14.s }[2], [x24]\n"
"ld1 { v18.s }[2], [x23]\n"
"ld1 { v22.s }[2], [x22]\n"
@@ -2737,7 +2736,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"175:" // Height 6: Partial accumulate: partial_1_8
"mov x25, #0x20\n"
"tbz x8, #0, 180f\n"
- "ldr s10, [x16, #0x0]\n"
+ "ldr s10, [x17, #0x0]\n"
"ldr s14, [x24, #0x0]\n"
"ldr s18, [x23, #0x0]\n"
"ldr s22, [x22, #0x0]\n"
@@ -2746,14 +2745,14 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 180f\n"
"176:" // Height 6: Partial accumulate: partial_4_0
"tbz x8, #2, 178f\n"
- "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
"ld1 { v12.4s }, [x24], #0x10\n"
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
"ld1 { v28.4s }, [x20], #0x10\n"
"tbz x8, #1, 177f\n"
- "ldr d9, [x16], #0x8\n"
+ "ldr d9, [x17], #0x8\n"
"mov x25, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
@@ -2761,7 +2760,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr d25, [x21], #0x8\n"
"ldr d29, [x20], #0x8\n"
"tbz x8, #0, 180f\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "ld1 { v9.s }[2], [x17]\n"
"ld1 { v13.s }[2], [x24]\n"
"ld1 { v17.s }[2], [x23]\n"
"ld1 { v21.s }[2], [x22]\n"
@@ -2771,7 +2770,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"177:" // Height 6: Partial accumulate: partial_1_4
"mov x25, #0x10\n"
"tbz x8, #0, 180f\n"
- "ldr s9, [x16, #0x0]\n"
+ "ldr s9, [x17, #0x0]\n"
"ldr s13, [x24, #0x0]\n"
"ldr s17, [x23, #0x0]\n"
"ldr s21, [x22, #0x0]\n"
@@ -2780,7 +2779,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 180f\n"
"178:" // Height 6: Partial accumulate: partial_2_0
"tbz x8, #1, 179f\n"
- "ldr d8, [x16], #0x8\n"
+ "ldr d8, [x17], #0x8\n"
"mov x25, #0x8\n"
"ldr d12, [x24], #0x8\n"
"ldr d16, [x23], #0x8\n"
@@ -2788,7 +2787,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr d24, [x21], #0x8\n"
"ldr d28, [x20], #0x8\n"
"tbz x8, #0, 180f\n"
- "ld1 { v8.s }[2], [x16]\n"
+ "ld1 { v8.s }[2], [x17]\n"
"ld1 { v12.s }[2], [x24]\n"
"ld1 { v16.s }[2], [x23]\n"
"ld1 { v20.s }[2], [x22]\n"
@@ -2796,7 +2795,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ld1 { v28.s }[2], [x20]\n"
"b 180f\n"
"179:" // Height 6: Partial accumulate: partial_1_0
- "ldr s8, [x16, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
"mov x25, #0x0\n"
"ldr s12, [x24, #0x0]\n"
"ldr s16, [x23, #0x0]\n"
@@ -2804,13 +2803,13 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr s24, [x21, #0x0]\n"
"ldr s28, [x20, #0x0]\n"
"180:" // Height 6: Partial accumulate: Done
- "sub x16, x16, x25\n"
+ "sub x17, x17, x25\n"
"b 183f\n"
"181:" // Height 6: full accumulate
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"ldr q12, [x24, #0x0]\n"
"ldr q13, [x24, #0x10]\n"
"ldr q14, [x24, #0x20]\n"
@@ -2861,8 +2860,8 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"mov x15, #0x0\n"
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
"ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2898,14 +2897,14 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr q3, [x10, #0x0]\n"
"ldr q4, [x9, #0x0]\n"
"ldr q5, [x28, #0x0]\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 188f\n"
"187:" // Height 6: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x21, [x17, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x20, [x17, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
@@ -2913,151 +2912,151 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
"add x11, x11, #0x10\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr d6, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x21\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr x21, [x17, #0x48]\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
"add x28, x28, #0x10\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr d7, [x17, #0x30]\n"
+ "ldr d7, [x16, #0x30]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x27, [x13, #0x8]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x26, [x12, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr x27, [x13, #0x8]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x20, [x17, #0x58]\n"
+ "ldr x26, [x12, #0x8]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
"ldr x25, [x11, #0x8]\n"
".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr d6, [x17, #0x40]\n"
+ "ldr d6, [x16, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x24, [x10, #0x8]\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x23, [x9, #0x8]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x21\n"
+ "ldr x24, [x10, #0x8]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x21, [x17, #0x68]\n"
+ "ldr x23, [x9, #0x8]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
"ldr x22, [x28, #0x8]\n"
".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr d7, [x17, #0x50]\n"
+ "ldr d7, [x16, #0x50]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "sub x14, x14, #0x10\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "cmp x14, #0x20\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ "sub x14, x14, #0x10\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x20, [x17, #0x78]\n"
+ "cmp x14, #0x20\n"
".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr d6, [x17, #0x60]\n"
+ "ldr d6, [x16, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v6.d[1], x21\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x21\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr x21, [x17, #0x88]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
"prfm pldl1keep, [x10, #0x80]\n"
".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr d7, [x17, #0x70]\n"
+ "ldr d7, [x16, #0x70]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr x20, [x17, #0x98]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr d6, [x17, #0x80]\n"
+ "ldr d6, [x16, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ "mov v6.d[1], x21\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x21\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr x21, [x17, #0xa8]\n"
".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr d7, [x17, #0x90]\n"
+ "ldr d7, [x16, #0x90]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr x20, [x17, #0xb8]\n"
".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr d6, [x17, #0xa0]\n"
+ "ldr d6, [x16, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x21\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr x21, [x17, #0xc8]\n"
".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr d7, [x17, #0xb0]\n"
+ "ldr d7, [x16, #0xb0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr x20, [x17, #0xd8]\n"
".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr d6, [x17, #0xc0]\n"
+ "ldr d6, [x16, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x21\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr x21, [x17, #0xe8]\n"
".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr d7, [x17, #0xd0]\n"
+ "ldr d7, [x16, #0xd0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr x20, [x17, #0xf8]\n"
".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr d6, [x17, #0xe0]\n"
+ "ldr d6, [x16, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ "mov v6.d[1], x21\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x21\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "add x17, x17, #0x100\n"
+ "ldr d7, [x16, #0xf0]\n"
+ "mov v7.d[1], x20\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x21, [x17, #0x8]\n"
- "mov v7.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- "ldr x20, [x17, #0x18]\n"
".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
- "ldr d6, [x17, #0x0]\n"
+ "ldr d6, [x16, #0x0]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
@@ -3070,7 +3069,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr d4, [x9, #0x0]\n"
".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
"ldr d5, [x28, #0x0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr d7, [x16, #0x10]\n"
"mov v6.d[1], x21\n"
"mov v0.d[1], x27\n"
"mov v1.d[1], x26\n"
@@ -3092,7 +3091,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
"add x9, x9, #0x10\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q6, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"add x28, x28, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -3104,7 +3103,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x17, #0x30]\n"
+ "ldr q7, [x16, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
"prfm pldl1keep, [x10, #0x80]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
@@ -3114,86 +3113,86 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x17, #0x40]\n"
+ "ldr q6, [x16, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x17, #0x50]\n"
+ "ldr q7, [x16, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x17, #0x60]\n"
+ "ldr q6, [x16, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x17, #0x70]\n"
+ "ldr q7, [x16, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x17, #0x80]\n"
+ "ldr q6, [x16, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x17, #0x90]\n"
+ "ldr q7, [x16, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x17, #0xa0]\n"
+ "ldr q6, [x16, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x17, #0xb0]\n"
+ "ldr q7, [x16, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x17, #0xc0]\n"
+ "ldr q6, [x16, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x17, #0xd0]\n"
+ "ldr q7, [x16, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x17, #0xe0]\n"
+ "ldr q6, [x16, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x17, #0xf0]\n"
+ "ldr q7, [x16, #0xf0]\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "add x17, x17, #0x100\n"
+ "add x16, x16, #0x100\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
@@ -3218,24 +3217,24 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr s4, [x10], #0x4\n"
"ldr s3, [x9], #0x4\n"
"ldr s2, [x28], #0x4\n"
- "ldr q1, [x17, #0x0]\n"
- "ldr q0, [x17, #0x10]\n"
+ "ldr q1, [x16, #0x0]\n"
".inst 0x6f87e028 // udot v8.4s, v1.16b, v7.4b[0]\n"
+ "ldr q0, [x16, #0x10]\n"
".inst 0x6f86e02c // udot v12.4s, v1.16b, v6.4b[0]\n"
".inst 0x6f85e030 // udot v16.4s, v1.16b, v5.4b[0]\n"
".inst 0x6f84e034 // udot v20.4s, v1.16b, v4.4b[0]\n"
".inst 0x6f83e038 // udot v24.4s, v1.16b, v3.4b[0]\n"
".inst 0x6f82e03c // udot v28.4s, v1.16b, v2.4b[0]\n"
- "ldr q1, [x17, #0x20]\n"
+ "ldr q1, [x16, #0x20]\n"
".inst 0x6f87e009 // udot v9.4s, v0.16b, v7.4b[0]\n"
".inst 0x6f86e00d // udot v13.4s, v0.16b, v6.4b[0]\n"
".inst 0x6f85e011 // udot v17.4s, v0.16b, v5.4b[0]\n"
".inst 0x6f84e015 // udot v21.4s, v0.16b, v4.4b[0]\n"
".inst 0x6f83e019 // udot v25.4s, v0.16b, v3.4b[0]\n"
".inst 0x6f82e01d // udot v29.4s, v0.16b, v2.4b[0]\n"
- "ldr q0, [x17, #0x30]\n"
+ "ldr q0, [x16, #0x30]\n"
".inst 0x6f87e02a // udot v10.4s, v1.16b, v7.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x6f86e02e // udot v14.4s, v1.16b, v6.4b[0]\n"
".inst 0x6f85e032 // udot v18.4s, v1.16b, v5.4b[0]\n"
".inst 0x6f84e036 // udot v22.4s, v1.16b, v4.4b[0]\n"
@@ -3273,24 +3272,24 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"ldr b4, [x9, #0x0]\n"
"ldr b5, [x28, #0x0]\n"
"193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q7, [x17, #0x0]\n"
- "ldr q6, [x17, #0x10]\n"
+ "ldr q7, [x16, #0x0]\n"
".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x16, #0x10]\n"
".inst 0x6f81e0ec // udot v12.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f0 // udot v16.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f4 // udot v20.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f8 // udot v24.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0fc // udot v28.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x17, #0x20]\n"
+ "ldr q7, [x16, #0x20]\n"
".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0cd // udot v13.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d1 // udot v17.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d5 // udot v21.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0d9 // udot v25.4s, v6.16b, v4.4b[0]\n"
".inst 0x6f85e0dd // udot v29.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x17, #0x30]\n"
+ "ldr q6, [x16, #0x30]\n"
".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n"
- "add x17, x17, #0x40\n"
+ "add x16, x16, #0x40\n"
".inst 0x6f81e0ee // udot v14.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f2 // udot v18.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f6 // udot v22.4s, v7.16b, v3.4b[0]\n"
@@ -3308,22 +3307,22 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"cmp x15, x20\n"
"bne 184b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x8, #0x10\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "add x24, x16, x20, LSL #2\n"
+ "add x24, x17, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"add x21, x22, x20, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"add x20, x21, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
"prfm pstl1keep, [x20, #0x0]\n"
"bge 203f\n"
"tbz x8, #3, 198f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
- "st1 { v9.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v13.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
@@ -3335,21 +3334,21 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"st1 { v28.4s }, [x20], #0x10\n"
"st1 { v29.4s }, [x20], #0x10\n"
"tbz x8, #2, 196f\n"
- "st1 { v10.4s }, [x16], #0x10\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
"st1 { v14.4s }, [x24], #0x10\n"
"st1 { v18.4s }, [x23], #0x10\n"
"st1 { v22.4s }, [x22], #0x10\n"
"st1 { v26.4s }, [x21], #0x10\n"
"st1 { v30.4s }, [x20], #0x10\n"
"tbz x8, #1, 195f\n"
- "str d11, [x16], #0x8\n"
+ "str d11, [x17], #0x8\n"
"str d15, [x24], #0x8\n"
"str d19, [x23], #0x8\n"
"str d23, [x22], #0x8\n"
"str d27, [x21], #0x8\n"
"str d31, [x20], #0x8\n"
"tbz x8, #0, 202f\n"
- "st1 { v11.s }[2], [x16]\n"
+ "st1 { v11.s }[2], [x17]\n"
"st1 { v15.s }[2], [x24]\n"
"st1 { v19.s }[2], [x23]\n"
"st1 { v23.s }[2], [x22]\n"
@@ -3358,7 +3357,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 202f\n"
"195:" // Height 6: Partial direct writeback: partial_1_12
"tbz x8, #0, 202f\n"
- "str s11, [x16, #0x0]\n"
+ "str s11, [x17, #0x0]\n"
"str s15, [x24, #0x0]\n"
"str s19, [x23, #0x0]\n"
"str s23, [x22, #0x0]\n"
@@ -3367,14 +3366,14 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 202f\n"
"196:" // Height 6: Partial direct writeback: partial_2_8
"tbz x8, #1, 197f\n"
- "str d10, [x16], #0x8\n"
+ "str d10, [x17], #0x8\n"
"str d14, [x24], #0x8\n"
"str d18, [x23], #0x8\n"
"str d22, [x22], #0x8\n"
"str d26, [x21], #0x8\n"
"str d30, [x20], #0x8\n"
"tbz x8, #0, 202f\n"
- "st1 { v10.s }[2], [x16]\n"
+ "st1 { v10.s }[2], [x17]\n"
"st1 { v14.s }[2], [x24]\n"
"st1 { v18.s }[2], [x23]\n"
"st1 { v22.s }[2], [x22]\n"
@@ -3383,7 +3382,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 202f\n"
"197:" // Height 6: Partial direct writeback: partial_1_8
"tbz x8, #0, 202f\n"
- "str s10, [x16, #0x0]\n"
+ "str s10, [x17, #0x0]\n"
"str s14, [x24, #0x0]\n"
"str s18, [x23, #0x0]\n"
"str s22, [x22, #0x0]\n"
@@ -3392,21 +3391,21 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 202f\n"
"198:" // Height 6: Partial direct writeback: partial_4_0
"tbz x8, #2, 200f\n"
- "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
"st1 { v12.4s }, [x24], #0x10\n"
"st1 { v16.4s }, [x23], #0x10\n"
"st1 { v20.4s }, [x22], #0x10\n"
"st1 { v24.4s }, [x21], #0x10\n"
"st1 { v28.4s }, [x20], #0x10\n"
"tbz x8, #1, 199f\n"
- "str d9, [x16], #0x8\n"
+ "str d9, [x17], #0x8\n"
"str d13, [x24], #0x8\n"
"str d17, [x23], #0x8\n"
"str d21, [x22], #0x8\n"
"str d25, [x21], #0x8\n"
"str d29, [x20], #0x8\n"
"tbz x8, #0, 202f\n"
- "st1 { v9.s }[2], [x16]\n"
+ "st1 { v9.s }[2], [x17]\n"
"st1 { v13.s }[2], [x24]\n"
"st1 { v17.s }[2], [x23]\n"
"st1 { v21.s }[2], [x22]\n"
@@ -3415,7 +3414,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 202f\n"
"199:" // Height 6: Partial direct writeback: partial_1_4
"tbz x8, #0, 202f\n"
- "str s9, [x16, #0x0]\n"
+ "str s9, [x17, #0x0]\n"
"str s13, [x24, #0x0]\n"
"str s17, [x23, #0x0]\n"
"str s21, [x22, #0x0]\n"
@@ -3424,14 +3423,14 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"b 202f\n"
"200:" // Height 6: Partial direct writeback: partial_2_0
"tbz x8, #1, 201f\n"
- "str d8, [x16], #0x8\n"
+ "str d8, [x17], #0x8\n"
"str d12, [x24], #0x8\n"
"str d16, [x23], #0x8\n"
"str d20, [x22], #0x8\n"
"str d24, [x21], #0x8\n"
"str d28, [x20], #0x8\n"
"tbz x8, #0, 202f\n"
- "st1 { v8.s }[2], [x16]\n"
+ "st1 { v8.s }[2], [x17]\n"
"st1 { v12.s }[2], [x24]\n"
"st1 { v16.s }[2], [x23]\n"
"st1 { v20.s }[2], [x22]\n"
@@ -3439,7 +3438,7 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"st1 { v28.s }[2], [x20]\n"
"b 202f\n"
"201:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x16, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"str s12, [x24, #0x0]\n"
"str s16, [x23, #0x0]\n"
"str s20, [x22, #0x0]\n"
@@ -3448,11 +3447,11 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"202:" // Height 6: Partial direct writeback: Done
"b 204f\n"
"203:" // Height 6: Full writeback
- "str q8, [x16, #0x0]\n"
- "str q9, [x16, #0x10]\n"
- "str q10, [x16, #0x20]\n"
- "str q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"str q12, [x24, #0x0]\n"
"str q13, [x24, #0x10]\n"
"str q14, [x24, #0x20]\n"
@@ -3488,8 +3487,8 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
index 8b7f5afb7e..d3367b959a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
@@ -44,18 +44,18 @@ void a64_hybrid_u8u32_dot_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -88,7 +88,7 @@ void a64_hybrid_u8u32_dot_6x16 (
"beq 35f\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"tbz %x[flags], #0, 12f\n"
"cmp x11, #0x10\n"
@@ -163,8 +163,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"mov x28, #0x0\n"
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -188,10 +188,6 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr q17, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
"ldr q17, [x10, #0x40]\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
@@ -216,21 +212,22 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr q17, [x10, #0xe0]\n"
".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
"ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
- "ldr q6, [x10, #0x0]\n"
".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "add x10, x10, #0x100\n"
+ "ldr q6, [x10, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
"ldr q17, [x10, #0x40]\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
@@ -255,26 +252,29 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr q17, [x10, #0xe0]\n"
".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
"ldr q16, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
"19:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 24f\n"
"cmp x27, #0x4\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Odd block loop
"ldr s18, [x26], #0x4\n"
- "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ ".inst 0x6f92e208 // udot v8.4s, v16.16b, v18.4b[0]\n"
"sub x27, x27, #0x4\n"
"ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x4\n"
- ".inst 0x6f92e228 // udot v8.4s, v17.16b, v18.4b[0]\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x6f92e209 // udot v9.4s, v16.16b, v18.4b[0]\n"
+ "cmp x27, #0x4\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6f92e22a // udot v10.4s, v17.16b, v18.4b[0]\n"
".inst 0x6f92e20b // udot v11.4s, v16.16b, v18.4b[0]\n"
+ "add x10, x10, #0x40\n"
"bge 20b\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
"cbz x27, 24f\n"
@@ -289,12 +289,12 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr q17, [x10, #0x0]\n"
"ldr q16, [x10, #0x10]\n"
".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
- "ldr q17, [x10, #0x20]\n"
".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "add x10, x10, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -362,7 +362,7 @@ void a64_hybrid_u8u32_dot_6x16 (
"35:" // Height 2
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"36:" // Height 2: Column loop
"tbz %x[flags], #0, 46f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -463,8 +463,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"mov x28, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -496,22 +496,22 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
"ldr q17, [x10, #0x40]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
"ldr q16, [x10, #0x50]\n"
+ "cmp x27, #0x20\n"
".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
"ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
"ldr q16, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
"ldr q17, [x10, #0x80]\n"
@@ -555,18 +555,18 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
"add x25, x25, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
"ldr q17, [x10, #0x40]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
"ldr q16, [x10, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
"ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
"ldr q16, [x10, #0x70]\n"
@@ -607,18 +607,18 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr s19, [x26], #0x4\n"
"ldr s18, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr q17, [x10, #0x0]\n"
"ldr q16, [x10, #0x10]\n"
- "cmp x27, #0x4\n"
".inst 0x6f93e228 // udot v8.4s, v17.16b, v19.4b[0]\n"
".inst 0x6f92e22c // udot v12.4s, v17.16b, v18.4b[0]\n"
"ldr q17, [x10, #0x20]\n"
".inst 0x6f93e209 // udot v9.4s, v16.16b, v19.4b[0]\n"
".inst 0x6f92e20d // udot v13.4s, v16.16b, v18.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6f93e22a // udot v10.4s, v17.16b, v19.4b[0]\n"
".inst 0x6f92e22e // udot v14.4s, v17.16b, v18.4b[0]\n"
+ "add x10, x10, #0x40\n"
".inst 0x6f93e20b // udot v11.4s, v16.16b, v19.4b[0]\n"
".inst 0x6f92e20f // udot v15.4s, v16.16b, v18.4b[0]\n"
"bge 54b\n"
@@ -643,9 +643,9 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
".inst 0x6f81e20d // udot v13.4s, v16.16b, v1.4b[0]\n"
"ldr q16, [x10, #0x30]\n"
- "add x10, x10, #0x40\n"
".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ "add x10, x10, #0x40\n"
".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
"58:" // Height 2: Multiply loop: No odd multiplies
@@ -654,9 +654,9 @@ void a64_hybrid_u8u32_dot_6x16 (
"cmp x28, x20\n"
"bne 48b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
"cmp x11, #0x10\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "add x24, x9, x20, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
"bge 67f\n"
"tbz x11, #3, 62f\n"
@@ -738,12 +738,12 @@ void a64_hybrid_u8u32_dot_6x16 (
"69:" // Height 3
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"70:" // Height 3: Column loop
"tbz %x[flags], #0, 80f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x23, x24, x20, LSL #2\n"
"bge 79f\n"
"tbz x11, #3, 74f\n"
@@ -864,8 +864,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"mov x28, #0x0\n"
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -905,18 +905,18 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
"ldr q20, [x10, #0x30]\n"
"add x24, x24, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
"ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
"ldr q20, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
@@ -983,14 +983,14 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
"ldr q20, [x10, #0x30]\n"
"sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
"ldr q21, [x10, #0x40]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
"ldr q20, [x10, #0x50]\n"
@@ -1049,12 +1049,12 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr s24, [x26], #0x4\n"
"ldr s23, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s22, [x24], #0x4\n"
"ldr q21, [x10, #0x0]\n"
- "cmp x27, #0x4\n"
- "ldr q20, [x10, #0x10]\n"
".inst 0x6f98e2a8 // udot v8.4s, v21.16b, v24.4b[0]\n"
".inst 0x6f97e2ac // udot v12.4s, v21.16b, v23.4b[0]\n"
+ "ldr q20, [x10, #0x10]\n"
".inst 0x6f96e2b0 // udot v16.4s, v21.16b, v22.4b[0]\n"
"ldr q21, [x10, #0x20]\n"
".inst 0x6f98e289 // udot v9.4s, v20.16b, v24.4b[0]\n"
@@ -1108,11 +1108,11 @@ void a64_hybrid_u8u32_dot_6x16 (
"cmp x28, x20\n"
"bne 82b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"bge 101f\n"
"tbz x11, #3, 96f\n"
@@ -1214,13 +1214,13 @@ void a64_hybrid_u8u32_dot_6x16 (
"103:" // Height 4
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"104:" // Height 4: Column loop
"tbz %x[flags], #0, 114f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x22, x23, x20, LSL #2\n"
"bge 113f\n"
"tbz x11, #3, 108f\n"
@@ -1365,8 +1365,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"mov x28, #0x0\n"
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1506,14 +1506,14 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
"add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
"ldr q24, [x10, #0x30]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
@@ -1591,9 +1591,9 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr s29, [x26], #0x4\n"
"ldr s28, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s27, [x24], #0x4\n"
"ldr s26, [x23], #0x4\n"
- "cmp x27, #0x4\n"
"ldr q25, [x10, #0x0]\n"
"ldr q24, [x10, #0x10]\n"
".inst 0x6f9de328 // udot v8.4s, v25.16b, v29.4b[0]\n"
@@ -1662,13 +1662,13 @@ void a64_hybrid_u8u32_dot_6x16 (
"cmp x28, x20\n"
"bne 116b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
"bge 135f\n"
"tbz x11, #3, 130f\n"
@@ -1790,14 +1790,14 @@ void a64_hybrid_u8u32_dot_6x16 (
"137:" // Height 5
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"138:" // Height 5: Column loop
"tbz %x[flags], #0, 148f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x21, x22, x20, LSL #2\n"
"bge 147f\n"
"tbz x11, #3, 142f\n"
@@ -1966,8 +1966,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"mov x28, #0x0\n"
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2133,12 +2133,12 @@ void a64_hybrid_u8u32_dot_6x16 (
"add x22, x22, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "sub x27, x27, #0x10\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
"ldr q28, [x10, #0x30]\n"
- "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
@@ -2233,14 +2233,14 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr s2, [x26], #0x4\n"
"ldr s1, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s0, [x24], #0x4\n"
"ldr s31, [x23], #0x4\n"
- "cmp x27, #0x4\n"
"ldr s30, [x22], #0x4\n"
"ldr q29, [x10, #0x0]\n"
- "ldr q28, [x10, #0x10]\n"
".inst 0x6f82e3a8 // udot v8.4s, v29.16b, v2.4b[0]\n"
".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
+ "ldr q28, [x10, #0x10]\n"
".inst 0x6f80e3b0 // udot v16.4s, v29.16b, v0.4b[0]\n"
".inst 0x6f9fe3b4 // udot v20.4s, v29.16b, v31.4b[0]\n"
".inst 0x6f9ee3b8 // udot v24.4s, v29.16b, v30.4b[0]\n"
@@ -2316,15 +2316,15 @@ void a64_hybrid_u8u32_dot_6x16 (
"cmp x28, x20\n"
"bne 150b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
"bge 169f\n"
"tbz x11, #3, 164f\n"
@@ -2465,20 +2465,19 @@ void a64_hybrid_u8u32_dot_6x16 (
"b 206f\n"
"171:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"172:" // Height 6: Column loop
"tbz %x[flags], #0, 182f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x20, x21, x20, LSL #2\n"
"bge 181f\n"
"tbz x11, #3, 176f\n"
@@ -2671,8 +2670,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"mov x28, #0x0\n"
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2862,18 +2861,18 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
"add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
"ldr q7, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
@@ -2979,9 +2978,9 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr s7, [x26], #0x4\n"
"ldr s6, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"ldr s5, [x24], #0x4\n"
"ldr s4, [x23], #0x4\n"
- "cmp x27, #0x4\n"
"ldr s3, [x22], #0x4\n"
"ldr s2, [x21], #0x4\n"
"ldr q1, [x10, #0x0]\n"
@@ -3074,16 +3073,16 @@ void a64_hybrid_u8u32_dot_6x16 (
"cmp x28, x20\n"
"bne 184b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"add x24, x9, x20, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"add x23, x24, x20, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"add x22, x23, x20, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"add x20, x21, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
"prfm pstl1keep, [x20, #0x0]\n"
"bge 203f\n"
@@ -3254,8 +3253,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
index baa4e28e88..09fba7e253 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
@@ -70,7 +70,7 @@ public:
return true;
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 8> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 8> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
index 790a350838..8c6fbd4c83 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
@@ -44,18 +44,18 @@ void a64_hybrid_u8u32_mmla_6x16 (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -88,7 +88,7 @@ void a64_hybrid_u8u32_mmla_6x16 (
"beq 38f\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"tbz %x[flags], #0, 13f\n"
"cmp x11, #0x10\n"
@@ -176,8 +176,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
"mov x28, #0x0\n"
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -197,12 +197,7 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 19f\n"
"18:" // Height 1: Multiply loop: Main loop head
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"trn1 v19.2d, v1.2d, v20.2d\n"
- "trn2 v1.2d, v1.2d, v20.2d\n"
".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n"
@@ -215,6 +210,7 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
"ldr q18, [x10, #0x80]\n"
".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
@@ -231,38 +227,39 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "cmp x27, #0x20\n"
".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
- "ldr q7, [x10, #0x0]\n"
".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
"ldr q1, [x26, #0x0]\n"
+ "add x10, x10, #0x100\n"
+ "ldr q7, [x10, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
- "add x26, x26, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "trn1 v19.2d, v1.2d, v17.2d\n"
- "trn2 v1.2d, v1.2d, v17.2d\n"
- ".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n"
+ "trn1 v20.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e87a688 // ummla v8.4s, v20.16b, v7.16b\n"
"ldr q18, [x10, #0x20]\n"
- ".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n"
+ ".inst 0x6e86a68c // ummla v12.4s, v20.16b, v6.16b\n"
"ldr q17, [x10, #0x30]\n"
- ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e92a689 // ummla v9.4s, v20.16b, v18.16b\n"
"ldr q18, [x10, #0x40]\n"
- ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ ".inst 0x6e91a68d // ummla v13.4s, v20.16b, v17.16b\n"
"ldr q17, [x10, #0x50]\n"
- ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
- "ldr q20, [x10, #0x60]\n"
- ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ ".inst 0x6e92a68a // ummla v10.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x60]\n"
+ ".inst 0x6e91a68e // ummla v14.4s, v20.16b, v17.16b\n"
"ldr q18, [x10, #0x70]\n"
- ".inst 0x6e94a66b // ummla v11.4s, v19.16b, v20.16b\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e93a68b // ummla v11.4s, v20.16b, v19.16b\n"
"ldr q17, [x10, #0x80]\n"
- ".inst 0x6e92a66f // ummla v15.4s, v19.16b, v18.16b\n"
- "ldr q20, [x10, #0x90]\n"
+ ".inst 0x6e92a68f // ummla v15.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x90]\n"
".inst 0x6e91a428 // ummla v8.4s, v1.16b, v17.16b\n"
"ldr q18, [x10, #0xa0]\n"
- ".inst 0x6e94a42c // ummla v12.4s, v1.16b, v20.16b\n"
+ ".inst 0x6e93a42c // ummla v12.4s, v1.16b, v19.16b\n"
"ldr q17, [x10, #0xb0]\n"
".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n"
"ldr q18, [x10, #0xc0]\n"
@@ -272,21 +269,22 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
"20:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 27f\n"
"cmp x27, #0x8\n"
"blt 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
"ldr d19, [x26], #0x8\n"
- "ldr q20, [x10, #0x0]\n"
- "sub x27, x27, #0x8\n"
+ "ldr q18, [x10, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
"ldr q17, [x10, #0x10]\n"
- "cmp x27, #0x8\n"
- "trn1 v19.2d, v19.2d, v18.2d\n"
- ".inst 0x6e94a668 // ummla v8.4s, v19.16b, v20.16b\n"
+ ".inst 0x6e92a668 // ummla v8.4s, v19.16b, v18.16b\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e91a66c // ummla v12.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x30]\n"
@@ -298,9 +296,11 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
"bge 21b\n"
"22:" // Height 1: Multiply loop: Skip odd blocks
"cbz x27, 27f\n"
@@ -324,24 +324,24 @@ void a64_hybrid_u8u32_mmla_6x16 (
"25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b1, [x26, #0x0]\n"
"26:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q24, [x10, #0x0]\n"
- "ldr q20, [x10, #0x10]\n"
+ "ldr q23, [x10, #0x0]\n"
+ "ldr q18, [x10, #0x10]\n"
"trn1 v19.2d, v1.2d, v17.2d\n"
- ".inst 0x6e98a668 // ummla v8.4s, v19.16b, v24.16b\n"
+ ".inst 0x6e97a668 // ummla v8.4s, v19.16b, v23.16b\n"
"ldr q17, [x10, #0x20]\n"
- ".inst 0x6e94a66c // ummla v12.4s, v19.16b, v20.16b\n"
- "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e92a66c // ummla v12.4s, v19.16b, v18.16b\n"
+ "ldr q31, [x10, #0x30]\n"
".inst 0x6e91a669 // ummla v9.4s, v19.16b, v17.16b\n"
"ldr q20, [x10, #0x40]\n"
- ".inst 0x6e80a66d // ummla v13.4s, v19.16b, v0.16b\n"
+ ".inst 0x6e9fa66d // ummla v13.4s, v19.16b, v31.16b\n"
"ldr q17, [x10, #0x50]\n"
".inst 0x6e94a66a // ummla v10.4s, v19.16b, v20.16b\n"
"ldr q18, [x10, #0x60]\n"
".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
"27:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -413,7 +413,7 @@ void a64_hybrid_u8u32_mmla_6x16 (
"38:" // Height 2
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"39:" // Height 2: Column loop
"tbz %x[flags], #0, 50f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -523,8 +523,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
"mov x28, #0x0\n"
"52:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 53f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -549,14 +549,6 @@ void a64_hybrid_u8u32_mmla_6x16 (
"blt 56f\n"
"55:" // Height 2: Multiply loop: Main loop head
"trn1 v19.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q2, [x25, #0x0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n"
@@ -569,6 +561,7 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
"ldr q18, [x10, #0x80]\n"
".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
@@ -585,21 +578,22 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
+ "add x10, x10, #0x100\n"
"ldr q7, [x10, #0x0]\n"
".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
"ldr q1, [x26, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"bge 55b\n"
"56:" // Height 2: Multiply loop: Single iteration only
"trn1 v19.2d, v1.2d, v2.2d\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n"
"ldr q18, [x10, #0x20]\n"
".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n"
@@ -612,6 +606,7 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
"ldr q18, [x10, #0x80]\n"
".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
@@ -628,36 +623,41 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q18, [x10, #0xe0]\n"
".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
"ldr q17, [x10, #0xf0]\n"
- "add x10, x10, #0x100\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x10, x10, #0x100\n"
"57:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 64f\n"
"cmp x27, #0x8\n"
"blt 59f\n"
"58:" // Height 2: Multiply loop: Odd block loop
- "ldr d20, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
"sub x27, x27, #0x8\n"
- "ldr q18, [x10, #0x0]\n"
- "ldr q17, [x10, #0x10]\n"
- "cmp x27, #0x8\n"
- "trn1 v22.2d, v20.2d, v19.2d\n"
- ".inst 0x6e92a6c8 // ummla v8.4s, v22.16b, v18.16b\n"
- "ldr q2, [x10, #0x20]\n"
- ".inst 0x6e91a6cc // ummla v12.4s, v22.16b, v17.16b\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x10]\n"
+ ".inst 0x6e91a668 // ummla v8.4s, v19.16b, v17.16b\n"
+ ".inst 0x6e96a66c // ummla v12.4s, v19.16b, v22.16b\n"
+ "ldr q1, [x10, #0x20]\n"
"ldr q17, [x10, #0x30]\n"
- ".inst 0x6e82a6c9 // ummla v9.4s, v22.16b, v2.16b\n"
+ ".inst 0x6e81a669 // ummla v9.4s, v19.16b, v1.16b\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
"ldr q18, [x10, #0x40]\n"
- ".inst 0x6e91a6cd // ummla v13.4s, v22.16b, v17.16b\n"
"ldr q17, [x10, #0x50]\n"
- ".inst 0x6e92a6ca // ummla v10.4s, v22.16b, v18.16b\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
"ldr q18, [x10, #0x60]\n"
- ".inst 0x6e91a6ce // ummla v14.4s, v22.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e92a6cb // ummla v11.4s, v22.16b, v18.16b\n"
- ".inst 0x6e91a6cf // ummla v15.4s, v22.16b, v17.16b\n"
"bge 58b\n"
"59:" // Height 2: Multiply loop: Skip odd blocks
"cbz x27, 64f\n"
@@ -703,27 +703,27 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q18, [x10, #0x60]\n"
".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
"ldr q17, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
"64:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 52b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
"cmp x11, #0x10\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
- "uzp2 v9.2d, v9.2d, v13.2d\n"
"prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"bge 73f\n"
"tbz x11, #3, 68f\n"
"st1 { v7.4s }, [x9], #0x10\n"
@@ -804,12 +804,12 @@ void a64_hybrid_u8u32_mmla_6x16 (
"75:" // Height 3
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"76:" // Height 3: Column loop
"tbz %x[flags], #0, 87f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x23, x24, x20, LSL #2\n"
"bge 85f\n"
"tbz x11, #3, 80f\n"
@@ -951,8 +951,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
"mov x28, #0x0\n"
"89:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 90f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -982,38 +982,35 @@ void a64_hybrid_u8u32_mmla_6x16 (
"92:" // Height 3: Multiply loop: Main loop head
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "trn2 v3.2d, v3.2d, v25.2d\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
@@ -1021,12 +1018,15 @@ void a64_hybrid_u8u32_mmla_6x16 (
".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xc0]\n"
".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xd0]\n"
".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n"
@@ -1048,43 +1048,43 @@ void a64_hybrid_u8u32_mmla_6x16 (
"93:" // Height 3: Multiply loop: Single iteration only
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
- "trn2 v3.2d, v3.2d, v25.2d\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x90]\n"
".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
@@ -1109,25 +1109,25 @@ void a64_hybrid_u8u32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 96f\n"
"95:" // Height 3: Multiply loop: Odd block loop
- "ldr d30, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "sub x27, x27, #0x8\n"
- "ldr d27, [x24], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
"ldr q26, [x10, #0x0]\n"
- "cmp x27, #0x8\n"
- "ldr q25, [x10, #0x10]\n"
- "trn1 v28.2d, v30.2d, v28.2d\n"
- "trn1 v27.2d, v27.2d, v29.2d\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
+ "ldr q25, [x10, #0x10]\n"
".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x20]\n"
".inst 0x6e99a78c // ummla v12.4s, v28.16b, v25.16b\n"
".inst 0x6e99a774 // ummla v20.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ "cmp x27, #0x8\n"
".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
@@ -1136,8 +1136,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ "add x10, x10, #0x80\n"
".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
@@ -1183,9 +1183,9 @@ void a64_hybrid_u8u32_mmla_6x16 (
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn1 v27.2d, v3.2d, v25.2d\n"
".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
- ".inst 0x6e9da78c // ummla v12.4s, v28.16b, v29.16b\n"
".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e9da78c // ummla v12.4s, v28.16b, v29.16b\n"
".inst 0x6e9da774 // ummla v20.4s, v27.16b, v29.16b\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
@@ -1211,20 +1211,20 @@ void a64_hybrid_u8u32_mmla_6x16 (
"cmp x28, x20\n"
"bne 89b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "cmp x11, #0x10\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
- "uzp2 v9.2d, v9.2d, v13.2d\n"
"prfm pstl1keep, [x9, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v16.2d, v16.2d, v20.2d\n"
"uzp1 v17.2d, v17.2d, v21.2d\n"
"uzp1 v18.2d, v18.2d, v22.2d\n"
@@ -1329,13 +1329,13 @@ void a64_hybrid_u8u32_mmla_6x16 (
"112:" // Height 4
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"113:" // Height 4: Column loop
"tbz %x[flags], #0, 124f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x22, x23, x20, LSL #2\n"
"bge 122f\n"
"tbz x11, #3, 117f\n"
@@ -1497,8 +1497,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
"mov x28, #0x0\n"
"126:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 127f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1532,38 +1532,33 @@ void a64_hybrid_u8u32_mmla_6x16 (
"129:" // Height 4: Multiply loop: Main loop head
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
"trn1 v27.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "ldr q4, [x23, #0x0]\n"
- ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
"ldr q25, [x10, #0x30]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ "add x23, x23, #0x10\n"
+ "ldr q4, [x23, #0x0]\n"
".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
@@ -1574,18 +1569,23 @@ void a64_hybrid_u8u32_mmla_6x16 (
".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xc0]\n"
".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xd0]\n"
".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xe0]\n"
".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
@@ -1601,48 +1601,48 @@ void a64_hybrid_u8u32_mmla_6x16 (
"130:" // Height 4: Multiply loop: Single iteration only
"trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"trn1 v27.2d, v3.2d, v4.2d\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x27, x27, #0x10\n"
- ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
- ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
"ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
"ldr q25, [x10, #0x30]\n"
".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x40]\n"
".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x50]\n"
".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x60]\n"
".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x70]\n"
".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x80]\n"
".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
"ldr q25, [x10, #0x90]\n"
".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xa0]\n"
".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
"ldr q25, [x10, #0xb0]\n"
".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
"ldr q26, [x10, #0xc0]\n"
".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
@@ -1664,16 +1664,16 @@ void a64_hybrid_u8u32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 133f\n"
"132:" // Height 4: Multiply loop: Odd block loop
- "ldr d30, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
"cmp x27, #0x8\n"
"ldr q26, [x10, #0x0]\n"
"ldr q25, [x10, #0x10]\n"
- "trn1 v28.2d, v30.2d, v28.2d\n"
- "trn1 v27.2d, v29.2d, v27.2d\n"
".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
"ldr q26, [x10, #0x20]\n"
@@ -1774,24 +1774,24 @@ void a64_hybrid_u8u32_mmla_6x16 (
"cmp x28, x20\n"
"bne 126b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
@@ -1918,14 +1918,14 @@ void a64_hybrid_u8u32_mmla_6x16 (
"149:" // Height 5
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"150:" // Height 5: Column loop
"tbz %x[flags], #0, 161f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x21, x22, x20, LSL #2\n"
"bge 159f\n"
"tbz x11, #3, 154f\n"
@@ -2123,8 +2123,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
"mov x28, #0x0\n"
"163:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 164f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2161,51 +2161,51 @@ void a64_hybrid_u8u32_mmla_6x16 (
"166:" // Height 5: Multiply loop: Main loop head
"trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "sub x27, x27, #0x10\n"
+ ".inst 0x6e87a4c8 // ummla v8.4s, v6.16b, v7.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
"trn1 v4.2d, v5.2d, v0.2d\n"
"trn2 v5.2d, v5.2d, v0.2d\n"
"ldr q0, [x10, #0x10]\n"
- ".inst 0x6e87a4c8 // ummla v8.4s, v6.16b, v7.16b\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- "add x22, x22, #0x10\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x6e80a4cc // ummla v12.4s, v6.16b, v0.16b\n"
".inst 0x6e80a454 // ummla v20.4s, v2.16b, v0.16b\n"
+ "add x26, x26, #0x10\n"
".inst 0x6e80a49c // ummla v28.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x30]\n"
".inst 0x6e87a4c9 // ummla v9.4s, v6.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e80a4cd // ummla v13.4s, v6.16b, v0.16b\n"
".inst 0x6e80a455 // ummla v21.4s, v2.16b, v0.16b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x50]\n"
".inst 0x6e87a4ca // ummla v10.4s, v6.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e80a4ce // ummla v14.4s, v6.16b, v0.16b\n"
".inst 0x6e80a456 // ummla v22.4s, v2.16b, v0.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e80a49e // ummla v30.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x70]\n"
".inst 0x6e87a4cb // ummla v11.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e80a4cf // ummla v15.4s, v6.16b, v0.16b\n"
".inst 0x6e80a457 // ummla v23.4s, v2.16b, v0.16b\n"
"ldr q2, [x25, #0x0]\n"
@@ -2251,47 +2251,47 @@ void a64_hybrid_u8u32_mmla_6x16 (
"167:" // Height 5: Multiply loop: Single iteration only
"trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- "add x26, x26, #0x10\n"
+ ".inst 0x6e87a4c8 // ummla v8.4s, v6.16b, v7.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
"trn1 v4.2d, v5.2d, v0.2d\n"
"trn2 v5.2d, v5.2d, v0.2d\n"
"ldr q0, [x10, #0x10]\n"
- ".inst 0x6e87a4c8 // ummla v8.4s, v6.16b, v7.16b\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x6e80a4cc // ummla v12.4s, v6.16b, v0.16b\n"
".inst 0x6e80a454 // ummla v20.4s, v2.16b, v0.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e80a49c // ummla v28.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x30]\n"
".inst 0x6e87a4c9 // ummla v9.4s, v6.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e80a4cd // ummla v13.4s, v6.16b, v0.16b\n"
".inst 0x6e80a455 // ummla v21.4s, v2.16b, v0.16b\n"
+ "add x22, x22, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x50]\n"
".inst 0x6e87a4ca // ummla v10.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e80a4ce // ummla v14.4s, v6.16b, v0.16b\n"
".inst 0x6e80a456 // ummla v22.4s, v2.16b, v0.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e80a49e // ummla v30.4s, v4.16b, v0.16b\n"
"ldr q0, [x10, #0x70]\n"
".inst 0x6e87a4cb // ummla v11.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
@@ -2335,24 +2335,24 @@ void a64_hybrid_u8u32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 170f\n"
"169:" // Height 5: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "cmp x27, #0x8\n"
"ldr d0, [x22], #0x8\n"
"ldr q1, [x10, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v3.2d, v2.2d\n"
- "trn1 v2.2d, v0.2d, v5.2d\n"
- "ldr q0, [x10, #0x10]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
".inst 0x6e81a488 // ummla v8.4s, v4.16b, v1.16b\n"
+ "ldr q0, [x10, #0x10]\n"
".inst 0x6e81a470 // ummla v16.4s, v3.16b, v1.16b\n"
".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
"ldr q1, [x10, #0x20]\n"
".inst 0x6e80a48c // ummla v12.4s, v4.16b, v0.16b\n"
".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n"
+ "cmp x27, #0x8\n"
".inst 0x6e80a45c // ummla v28.4s, v2.16b, v0.16b\n"
"ldr q0, [x10, #0x30]\n"
".inst 0x6e81a489 // ummla v9.4s, v4.16b, v1.16b\n"
@@ -2371,8 +2371,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n"
".inst 0x6e80a45e // ummla v30.4s, v2.16b, v0.16b\n"
"ldr q0, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e86a48b // ummla v11.4s, v4.16b, v6.16b\n"
+ "add x10, x10, #0x80\n"
".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n"
".inst 0x6e86a45b // ummla v27.4s, v2.16b, v6.16b\n"
".inst 0x6e80a48f // ummla v15.4s, v4.16b, v0.16b\n"
@@ -2471,28 +2471,28 @@ void a64_hybrid_u8u32_mmla_6x16 (
"cmp x28, x20\n"
"bne 163b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "cmp x11, #0x10\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
- "uzp2 v11.2d, v11.2d, v15.2d\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
@@ -2640,20 +2640,19 @@ void a64_hybrid_u8u32_mmla_6x16 (
"b 224f\n"
"186:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"187:" // Height 6: Column loop
"tbz %x[flags], #0, 198f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"add x20, x21, x20, LSL #2\n"
"bge 196f\n"
"tbz x11, #3, 191f\n"
@@ -2871,8 +2870,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
"mov x28, #0x0\n"
"200:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 201f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2913,56 +2912,56 @@ void a64_hybrid_u8u32_mmla_6x16 (
"203:" // Height 6: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "add x26, x26, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
"ldr q6, [x10, #0x10]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- "cmp x27, #0x20\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "add x25, x25, #0x10\n"
".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x30]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x50]\n"
".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x70]\n"
".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
"ldr q2, [x25, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
"ldr q0, [x10, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
@@ -3006,52 +3005,52 @@ void a64_hybrid_u8u32_mmla_6x16 (
"204:" // Height 6: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
"trn1 v4.2d, v5.2d, v6.2d\n"
"trn2 v5.2d, v5.2d, v6.2d\n"
"ldr q6, [x10, #0x10]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "add x21, x21, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "add x24, x24, #0x10\n"
".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x30]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "add x23, x23, #0x10\n"
".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x50]\n"
".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
"ldr q6, [x10, #0x70]\n"
".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
@@ -3092,18 +3091,18 @@ void a64_hybrid_u8u32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 207f\n"
"206:" // Height 6: Multiply loop: Odd block loop
- "ldr d6, [x26], #0x8\n"
- "ldr d4, [x25], #0x8\n"
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d5, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"cmp x27, #0x8\n"
- "ldr d2, [x22], #0x8\n"
+ "ldr d1, [x22], #0x8\n"
"ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
"ldr q1, [x10, #0x0]\n"
- "trn1 v4.2d, v6.2d, v4.2d\n"
- "trn1 v3.2d, v5.2d, v3.2d\n"
- "trn1 v2.2d, v2.2d, v0.2d\n"
"ldr q0, [x10, #0x10]\n"
".inst 0x6e81a488 // ummla v8.4s, v4.16b, v1.16b\n"
".inst 0x6e81a470 // ummla v16.4s, v3.16b, v1.16b\n"
@@ -3197,9 +3196,9 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q0, [x10, #0x0]\n"
"trn1 v7.2d, v1.2d, v2.2d\n"
"trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e80a4e8 // ummla v8.4s, v7.16b, v0.16b\n"
"trn1 v2.2d, v5.2d, v6.2d\n"
"ldr q1, [x10, #0x10]\n"
- ".inst 0x6e80a4e8 // ummla v8.4s, v7.16b, v0.16b\n"
".inst 0x6e80a470 // ummla v16.4s, v3.16b, v0.16b\n"
".inst 0x6e80a458 // ummla v24.4s, v2.16b, v0.16b\n"
"ldr q0, [x10, #0x20]\n"
@@ -3223,8 +3222,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
".inst 0x6e81a476 // ummla v22.4s, v3.16b, v1.16b\n"
".inst 0x6e81a45e // ummla v30.4s, v2.16b, v1.16b\n"
"ldr q6, [x10, #0x70]\n"
- "add x10, x10, #0x80\n"
".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
+ "add x10, x10, #0x80\n"
".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n"
".inst 0x6e80a45b // ummla v27.4s, v2.16b, v0.16b\n"
".inst 0x6e86a4ef // ummla v15.4s, v7.16b, v6.16b\n"
@@ -3236,32 +3235,32 @@ void a64_hybrid_u8u32_mmla_6x16 (
"cmp x28, x20\n"
"bne 200b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x20, x21, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
- "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x21, x22, x20, LSL #2\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
- "uzp2 v16.2d, v16.2d, v20.2d\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x20, x21, x20, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"uzp1 v23.2d, v24.2d, v28.2d\n"
@@ -3440,8 +3439,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"224:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
index 13188f0e4d..0a97c405ac 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
@@ -40,8 +40,7 @@ void a64_interleaved_bf16fp32_dot_8x12( ARGLIST );
class cls_a64_interleaved_bf16fp32_dot_8x12
{
public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
+ typedef bfloat16 operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -63,8 +62,8 @@ public:
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 2> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 2, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 2> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 2, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
index ccfd19db36..7ab854a3fe 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
@@ -55,30 +55,30 @@ void a64_interleaved_bf16fp32_dot_8x12(
"ldr q4, [x22, #0x0]\n"
"ldr q5, [x22, #0x10]\n"
"mov %x[Apanel], x21\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v8.16b, #0x0\n"
"ldr q6, [x22, #0x20]\n"
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
- "movi v8.16b, #0x0\n"
+ "cmp x20, #0x2\n"
"movi v9.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
"movi v10.16b, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
"movi v11.16b, #0x0\n"
- "cmp x20, #0x2\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
"movi v12.16b, #0x0\n"
"movi v13.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
- "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
"movi v16.16b, #0x0\n"
"movi v17.16b, #0x0\n"
- "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"movi v18.16b, #0x0\n"
"movi v19.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x40]\n"
"movi v20.16b, #0x0\n"
"movi v21.16b, #0x0\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"movi v22.16b, #0x0\n"
"movi v23.16b, #0x0\n"
"movi v24.16b, #0x0\n"
@@ -159,9 +159,9 @@ void a64_interleaved_bf16fp32_dot_8x12(
"bge 3b\n"
"4:" // main loop skip
"add %x[Apanel], %x[Apanel], #0x20\n"
- "add x22, x22, #0x30\n"
".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n"
+ "add x22, x22, #0x30\n"
".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
".inst 0x4f60f891 // bfdot v17.4s, v4.8h, v0.h[3]\n"
".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
@@ -190,13 +190,13 @@ void a64_interleaved_bf16fp32_dot_8x12(
"add %x[Apanel], %x[Apanel], #0x20\n"
"ldr q2, [x22, #0x0]\n"
"ldr q1, [x22, #0x10]\n"
- "ldr q0, [x22, #0x20]\n"
- "add x22, x22, #0x30\n"
".inst 0x4f44f048 // bfdot v8.4s, v2.8h, v4.h[0]\n"
+ "ldr q0, [x22, #0x20]\n"
".inst 0x4f64f04b // bfdot v11.4s, v2.8h, v4.h[1]\n"
".inst 0x4f44f84e // bfdot v14.4s, v2.8h, v4.h[2]\n"
".inst 0x4f64f851 // bfdot v17.4s, v2.8h, v4.h[3]\n"
".inst 0x4f43f054 // bfdot v20.4s, v2.8h, v3.h[0]\n"
+ "add x22, x22, #0x30\n"
".inst 0x4f63f057 // bfdot v23.4s, v2.8h, v3.h[1]\n"
".inst 0x4f43f85a // bfdot v26.4s, v2.8h, v3.h[2]\n"
".inst 0x4f63f85d // bfdot v29.4s, v2.8h, v3.h[3]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 7a270b6082..0a46f26c55 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -41,8 +41,7 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_bf16fp32_mmla_8x12
{
public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
+ typedef bfloat16 operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -64,8 +63,8 @@ public:
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 4> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
index 1513d378ca..6eaac71e5f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
@@ -54,18 +54,18 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
"2:" // Width loop
"ldp q4, q5, [x22], #0x20\n"
"mov %x[Apanel], x21\n"
- "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
"movi v8.16b, #0x0\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
- "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
"movi v12.16b, #0x0\n"
"movi v13.16b, #0x0\n"
- "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
- "cmp x20, #0x2\n"
"movi v14.16b, #0x0\n"
- "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
"movi v15.16b, #0x0\n"
"movi v16.16b, #0x0\n"
"movi v17.16b, #0x0\n"
@@ -97,7 +97,7 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
".inst 0x6e44ecda // bfmmla v26.4s, v6.8h, v4.8h\n"
"cmp x20, #0x2\n"
".inst 0x6e45ecdd // bfmmla v29.4s, v6.8h, v5.8h\n"
- "ldp q5, q4, [x22], #0x20\n"
+ "ldp q4, q5, [x22], #0x20\n"
".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e43ec2f // bfmmla v15.4s, v1.8h, v3.8h\n"
@@ -106,28 +106,28 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
".inst 0x6e43ecdb // bfmmla v27.4s, v6.8h, v3.8h\n"
".inst 0x6e47ecde // bfmmla v30.4s, v6.8h, v7.8h\n"
- "ldp q3, q7, [x22], #0x20\n"
- ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
- ".inst 0x6e44ec0d // bfmmla v13.4s, v0.8h, v4.8h\n"
+ "ldp q7, q3, [x22], #0x20\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
"ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e45ec30 // bfmmla v16.4s, v1.8h, v5.8h\n"
- ".inst 0x6e44ec33 // bfmmla v19.4s, v1.8h, v4.8h\n"
- ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
"ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e44ec59 // bfmmla v25.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ecdc // bfmmla v28.4s, v6.8h, v5.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
"ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e44ecdf // bfmmla v31.4s, v6.8h, v4.8h\n"
- ".inst 0x6e43ec08 // bfmmla v8.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e44ecdc // bfmmla v28.4s, v6.8h, v4.8h\n"
+ ".inst 0x6e45ecdf // bfmmla v31.4s, v6.8h, v5.8h\n"
"ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x6e43ec2e // bfmmla v14.4s, v1.8h, v3.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e43ec54 // bfmmla v20.4s, v2.8h, v3.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e43ecda // bfmmla v26.4s, v6.8h, v3.8h\n"
- ".inst 0x6e47ecdd // bfmmla v29.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec31 // bfmmla v17.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecda // bfmmla v26.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e43ecdd // bfmmla v29.4s, v6.8h, v3.8h\n"
"ldp q7, q3, [x22], #0x20\n"
".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
@@ -143,11 +143,11 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
"ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n"
".inst 0x6e43ec33 // bfmmla v19.4s, v1.8h, v3.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
"ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
- ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n"
"ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n"
".inst 0x6e43ecdf // bfmmla v31.4s, v6.8h, v3.8h\n"
"bge 3b\n"
"4:" // main loop skip
@@ -182,9 +182,9 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
"ldp q1, q0, [x22], #0x20\n"
"ld1 { v7.8h }, [%x[Apanel]], #0x10\n"
"ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
"ld1 { v5.8h }, [%x[Apanel]], #0x10\n"
"ld1 { v4.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
"ldp q3, q2, [x22], #0x20\n"
".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
@@ -212,41 +212,41 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v2.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
"uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
"uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q2, [%x[Cpanel], #0x0]\n"
- "uzp1 v3.2d, v14.2d, v17.2d\n"
- "uzp2 v14.2d, v14.2d, v17.2d\n"
"str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
"uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
- "str q0, [%x[Cpanel], #0x20]\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
"uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
- "str q8, [%x[Cpanel], #0x30]\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
"uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q9, [%x[Cpanel], #0x40]\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
"uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q10, [%x[Cpanel], #0x50]\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
"uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q3, [%x[Cpanel], #0x60]\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
"uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
- "str q2, [%x[Cpanel], #0x70]\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
"uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
- "str q17, [%x[Cpanel], #0x80]\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
"uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
- "str q14, [%x[Cpanel], #0x90]\n"
- "str q15, [%x[Cpanel], #0xa0]\n"
"str q16, [%x[Cpanel], #0xb0]\n"
"str q1, [%x[Cpanel], #0xc0]\n"
"str q0, [%x[Cpanel], #0xd0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
index f4493c6855..8360c9691b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -55,21 +55,21 @@ void a64_interleaved_bf16fp32_mmla_8x12(
"ldr q4, [x22, #0x0]\n"
"ldr q5, [x22, #0x10]\n"
"mov %x[Apanel], x21\n"
- "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
"movi v8.16b, #0x0\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
- "movi v11.16b, #0x0\n"
"add x22, x22, #0x20\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v11.16b, #0x0\n"
"movi v12.16b, #0x0\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- "cmp x20, #0x2\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
"movi v13.16b, #0x0\n"
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
- "add %x[Apanel], %x[Apanel], #0x30\n"
"movi v16.16b, #0x0\n"
"movi v17.16b, #0x0\n"
"movi v18.16b, #0x0\n"
@@ -196,19 +196,19 @@ void a64_interleaved_bf16fp32_mmla_8x12(
"cbz x20, 5f\n"
"ldr q1, [x22, #0x0]\n"
"ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
"ldr q6, [%x[Apanel], #0x10]\n"
"ldr q0, [x22, #0x10]\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
"ldr q5, [%x[Apanel], #0x20]\n"
"ldr q4, [%x[Apanel], #0x30]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
"ldr q3, [x22, #0x20]\n"
"ldr q2, [x22, #0x30]\n"
- ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
- ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
- ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n"
".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n"
".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n"
"ldr q1, [x22, #0x40]\n"
".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
@@ -216,8 +216,8 @@ void a64_interleaved_bf16fp32_mmla_8x12(
".inst 0x6e43ece9 // bfmmla v9.4s, v7.8h, v3.8h\n"
".inst 0x6e42ecec // bfmmla v12.4s, v7.8h, v2.8h\n"
".inst 0x6e43eccf // bfmmla v15.4s, v6.8h, v3.8h\n"
- "add x22, x22, #0x60\n"
".inst 0x6e42ecd2 // bfmmla v18.4s, v6.8h, v2.8h\n"
+ "add x22, x22, #0x60\n"
".inst 0x6e43ecb5 // bfmmla v21.4s, v5.8h, v3.8h\n"
".inst 0x6e42ecb8 // bfmmla v24.4s, v5.8h, v2.8h\n"
".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
@@ -232,41 +232,41 @@ void a64_interleaved_bf16fp32_mmla_8x12(
".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v2.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
"uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
"uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q2, [%x[Cpanel], #0x0]\n"
- "uzp1 v3.2d, v14.2d, v17.2d\n"
- "uzp2 v14.2d, v14.2d, v17.2d\n"
"str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
"uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
- "str q0, [%x[Cpanel], #0x20]\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
"uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
- "str q8, [%x[Cpanel], #0x30]\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
"uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q9, [%x[Cpanel], #0x40]\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
"uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q10, [%x[Cpanel], #0x50]\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
"uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q3, [%x[Cpanel], #0x60]\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
"uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
- "str q2, [%x[Cpanel], #0x70]\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
"uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
- "str q17, [%x[Cpanel], #0x80]\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
"uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
- "str q14, [%x[Cpanel], #0x90]\n"
- "str q15, [%x[Cpanel], #0xa0]\n"
"str q16, [%x[Cpanel], #0xb0]\n"
"str q1, [%x[Cpanel], #0xc0]\n"
"str q0, [%x[Cpanel], #0xd0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index 7c26bfa682..94b5bcc0a8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -40,8 +40,7 @@ void a64_interleaved_s8s32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_s8s32_mmla_8x12
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -63,8 +62,8 @@ public:
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 8> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
index ba169540c2..2cd659d033 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
@@ -54,18 +54,18 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
"2:" // Width loop
"ldp q4, q5, [x22], #0x20\n"
"mov %x[Apanel], x21\n"
- "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
"movi v8.4s, #0x0\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
"movi v11.4s, #0x0\n"
- "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
"movi v12.4s, #0x0\n"
"movi v13.4s, #0x0\n"
- "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
- "cmp x20, #0x2\n"
"movi v14.4s, #0x0\n"
- "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
"movi v15.4s, #0x0\n"
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -97,7 +97,7 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
".inst 0x4e84a4da // smmla v26.4s, v6.16b, v4.16b\n"
"cmp x20, #0x2\n"
".inst 0x4e85a4dd // smmla v29.4s, v6.16b, v5.16b\n"
- "ldp q5, q4, [x22], #0x20\n"
+ "ldp q4, q5, [x22], #0x20\n"
".inst 0x4e83a409 // smmla v9.4s, v0.16b, v3.16b\n"
".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
".inst 0x4e83a42f // smmla v15.4s, v1.16b, v3.16b\n"
@@ -106,28 +106,28 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
".inst 0x4e83a4db // smmla v27.4s, v6.16b, v3.16b\n"
".inst 0x4e87a4de // smmla v30.4s, v6.16b, v7.16b\n"
- "ldp q3, q7, [x22], #0x20\n"
- ".inst 0x4e85a40a // smmla v10.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84a40d // smmla v13.4s, v0.16b, v4.16b\n"
+ "ldp q7, q3, [x22], #0x20\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
"ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e85a430 // smmla v16.4s, v1.16b, v5.16b\n"
- ".inst 0x4e84a433 // smmla v19.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85a456 // smmla v22.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
"ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e84a459 // smmla v25.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85a4dc // smmla v28.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
"ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e84a4df // smmla v31.4s, v6.16b, v4.16b\n"
- ".inst 0x4e83a408 // smmla v8.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e84a4dc // smmla v28.4s, v6.16b, v4.16b\n"
+ ".inst 0x4e85a4df // smmla v31.4s, v6.16b, v5.16b\n"
"ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x4e83a42e // smmla v14.4s, v1.16b, v3.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e83a454 // smmla v20.4s, v2.16b, v3.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e83a4da // smmla v26.4s, v6.16b, v3.16b\n"
- ".inst 0x4e87a4dd // smmla v29.4s, v6.16b, v7.16b\n"
+ ".inst 0x4e83a40b // smmla v11.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e87a42e // smmla v14.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e83a431 // smmla v17.4s, v1.16b, v3.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a457 // smmla v23.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e87a4da // smmla v26.4s, v6.16b, v7.16b\n"
+ ".inst 0x4e83a4dd // smmla v29.4s, v6.16b, v3.16b\n"
"ldp q7, q3, [x22], #0x20\n"
".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
@@ -143,11 +143,11 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
"ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
".inst 0x4e87a430 // smmla v16.4s, v1.16b, v7.16b\n"
".inst 0x4e83a433 // smmla v19.4s, v1.16b, v3.16b\n"
- ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
"ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
".inst 0x4e83a459 // smmla v25.4s, v2.16b, v3.16b\n"
- ".inst 0x4e87a4dc // smmla v28.4s, v6.16b, v7.16b\n"
"ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a4dc // smmla v28.4s, v6.16b, v7.16b\n"
".inst 0x4e83a4df // smmla v31.4s, v6.16b, v3.16b\n"
"bge 3b\n"
"4:" // main loop skip
@@ -182,9 +182,9 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
"ldp q1, q0, [x22], #0x20\n"
"ld1 { v7.16b }, [%x[Apanel]], #0x10\n"
"ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e81a4e8 // smmla v8.4s, v7.16b, v1.16b\n"
"ld1 { v5.16b }, [%x[Apanel]], #0x10\n"
"ld1 { v4.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e81a4e8 // smmla v8.4s, v7.16b, v1.16b\n"
".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
"ldp q3, q2, [x22], #0x20\n"
".inst 0x4e81a4ce // smmla v14.4s, v6.16b, v1.16b\n"
@@ -212,41 +212,41 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v2.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
"uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
"uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q2, [%x[Cpanel], #0x0]\n"
- "uzp1 v3.2d, v14.2d, v17.2d\n"
- "uzp2 v14.2d, v14.2d, v17.2d\n"
"str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
"uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
- "str q0, [%x[Cpanel], #0x20]\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
"uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
- "str q8, [%x[Cpanel], #0x30]\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
"uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q9, [%x[Cpanel], #0x40]\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
"uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q10, [%x[Cpanel], #0x50]\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
"uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q3, [%x[Cpanel], #0x60]\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
"uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
- "str q2, [%x[Cpanel], #0x70]\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
"uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
- "str q17, [%x[Cpanel], #0x80]\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
"uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
- "str q14, [%x[Cpanel], #0x90]\n"
- "str q15, [%x[Cpanel], #0xa0]\n"
"str q16, [%x[Cpanel], #0xb0]\n"
"str q1, [%x[Cpanel], #0xc0]\n"
"str q0, [%x[Cpanel], #0xd0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
index 63c6277719..a0ada9f949 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
@@ -55,21 +55,21 @@ void a64_interleaved_s8s32_mmla_8x12(
"ldr q4, [x22, #0x0]\n"
"ldr q5, [x22, #0x10]\n"
"mov %x[Apanel], x21\n"
- "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
"movi v8.4s, #0x0\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
"add x22, x22, #0x20\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- "cmp x20, #0x2\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
"movi v13.4s, #0x0\n"
"movi v14.4s, #0x0\n"
"movi v15.4s, #0x0\n"
- "add %x[Apanel], %x[Apanel], #0x30\n"
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
@@ -196,19 +196,19 @@ void a64_interleaved_s8s32_mmla_8x12(
"cbz x20, 5f\n"
"ldr q1, [x22, #0x0]\n"
"ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x4e81a4e8 // smmla v8.4s, v7.16b, v1.16b\n"
"ldr q6, [%x[Apanel], #0x10]\n"
"ldr q0, [x22, #0x10]\n"
+ ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
"ldr q5, [%x[Apanel], #0x20]\n"
"ldr q4, [%x[Apanel], #0x30]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x4e81a4ce // smmla v14.4s, v6.16b, v1.16b\n"
"ldr q3, [x22, #0x20]\n"
"ldr q2, [x22, #0x30]\n"
- ".inst 0x4e81a4e8 // smmla v8.4s, v7.16b, v1.16b\n"
- ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
- ".inst 0x4e81a4ce // smmla v14.4s, v6.16b, v1.16b\n"
".inst 0x4e80a4d1 // smmla v17.4s, v6.16b, v0.16b\n"
".inst 0x4e81a4b4 // smmla v20.4s, v5.16b, v1.16b\n"
".inst 0x4e80a4b7 // smmla v23.4s, v5.16b, v0.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
".inst 0x4e81a49a // smmla v26.4s, v4.16b, v1.16b\n"
"ldr q1, [x22, #0x40]\n"
".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
@@ -216,8 +216,8 @@ void a64_interleaved_s8s32_mmla_8x12(
".inst 0x4e83a4e9 // smmla v9.4s, v7.16b, v3.16b\n"
".inst 0x4e82a4ec // smmla v12.4s, v7.16b, v2.16b\n"
".inst 0x4e83a4cf // smmla v15.4s, v6.16b, v3.16b\n"
- "add x22, x22, #0x60\n"
".inst 0x4e82a4d2 // smmla v18.4s, v6.16b, v2.16b\n"
+ "add x22, x22, #0x60\n"
".inst 0x4e83a4b5 // smmla v21.4s, v5.16b, v3.16b\n"
".inst 0x4e82a4b8 // smmla v24.4s, v5.16b, v2.16b\n"
".inst 0x4e83a49b // smmla v27.4s, v4.16b, v3.16b\n"
@@ -232,41 +232,41 @@ void a64_interleaved_s8s32_mmla_8x12(
".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v2.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
"uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
"uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q2, [%x[Cpanel], #0x0]\n"
- "uzp1 v3.2d, v14.2d, v17.2d\n"
- "uzp2 v14.2d, v14.2d, v17.2d\n"
"str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
"uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
- "str q0, [%x[Cpanel], #0x20]\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
"uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
- "str q8, [%x[Cpanel], #0x30]\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
"uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q9, [%x[Cpanel], #0x40]\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
"uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q10, [%x[Cpanel], #0x50]\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
"uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q3, [%x[Cpanel], #0x60]\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
"uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
- "str q2, [%x[Cpanel], #0x70]\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
"uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
- "str q17, [%x[Cpanel], #0x80]\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
"uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
- "str q14, [%x[Cpanel], #0x90]\n"
- "str q15, [%x[Cpanel], #0xa0]\n"
"str q16, [%x[Cpanel], #0xb0]\n"
"str q1, [%x[Cpanel], #0xc0]\n"
"str q0, [%x[Cpanel], #0xd0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8s8s32_mmla_8x12.hpp
deleted file mode 100644
index c22303ce06..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8s8s32_mmla_8x12.hpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#ifdef __aarch64__
-
-#include "../std_transforms_fixed.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- const uint8_t *, const int8_t *, \
- int32_t *, int, int, int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void a64_interleaved_u8s8s32_mmla_8x12( ARGLIST );
-
-class cls_a64_interleaved_u8s8s32_mmla_8x12
-{
-public:
- typedef uint8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
- typedef int32_t result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return 12;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 8;
- }
-
-
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 8> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
-
- if (std::is_same<T, uint32_t>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 62.58, 4.06, 8.02 };
- case CPUModel::A510:
- return { 47.83, 3.59, 3.72 };
- case CPUModel::V1:
- return { 111.52, 4.97, 10.80 };
- }
- }
-
-
- if (std::is_same<T, uint8_t>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 62.57, 4.10, 0.51 };
- case CPUModel::A510:
- return { 47.66, 2.47, 0.29 };
- case CPUModel::V1:
- return { 75.54, 8.06, 0.63 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=a64_interleaved_u8s8s32_mmla_8x12;
- cls_a64_interleaved_u8s8s32_mmla_8x12(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8s8s32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8s8s32_mmla_8x12/generic.cpp
deleted file mode 100644
index b7f2e0c04b..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8s8s32_mmla_8x12/generic.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void a64_interleaved_u8s8s32_mmla_8x12(
- const uint8_t *Apanel,
- const int8_t *Bpanel,
- int32_t *Cpanel,
- int ablocks,
- int bblocks,
- int K) {
-
- struct KernelArgs {
- size_t K = {};
- const int8_t *Bpanel = {};
- size_t bblocks = {};
- } ka;
-
- ka.K = (K/8) - 1;
- ka.Bpanel = Bpanel;
- ka.bblocks = bblocks;
-
- __asm__ __volatile__(
- "1:" // Height loop
- "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "mov x21, %x[Apanel]\n"
- "2:" // Width loop
- "ldr q4, [x22, #0x0]\n"
- "ldr q5, [x22, #0x10]\n"
- "mov %x[Apanel], x21\n"
- "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "add x22, x22, #0x20\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "movi v12.4s, #0x0\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- "cmp x20, #0x2\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "add %x[Apanel], %x[Apanel], #0x30\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- "ldr q6, [%x[Apanel], #0x0]\n"
- "ldr q7, [x22, #0x0]\n"
- ".inst 0x4e84ac08 // usmmla v8.4s, v0.16b, v4.16b\n"
- "ldr q3, [x22, #0x10]\n"
- ".inst 0x4e85ac0b // usmmla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84ac2e // usmmla v14.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85ac31 // usmmla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x4e84ac54 // usmmla v20.4s, v2.16b, v4.16b\n"
- "sub x20, x20, #0x2\n"
- ".inst 0x4e85ac57 // usmmla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84acda // usmmla v26.4s, v6.16b, v4.16b\n"
- "ldr q4, [x22, #0x20]\n"
- ".inst 0x4e85acdd // usmmla v29.4s, v6.16b, v5.16b\n"
- "ldr q5, [x22, #0x30]\n"
- ".inst 0x4e87ac09 // usmmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e83ac0c // usmmla v12.4s, v0.16b, v3.16b\n"
- ".inst 0x4e87ac2f // usmmla v15.4s, v1.16b, v7.16b\n"
- "cmp x20, #0x2\n"
- ".inst 0x4e83ac32 // usmmla v18.4s, v1.16b, v3.16b\n"
- ".inst 0x4e87ac55 // usmmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e83ac58 // usmmla v24.4s, v2.16b, v3.16b\n"
- ".inst 0x4e87acdb // usmmla v27.4s, v6.16b, v7.16b\n"
- "ldr q7, [x22, #0x40]\n"
- ".inst 0x4e83acde // usmmla v30.4s, v6.16b, v3.16b\n"
- "ldr q3, [x22, #0x50]\n"
- ".inst 0x4e84ac0a // usmmla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x4e85ac0d // usmmla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%x[Apanel], #0x10]\n"
- ".inst 0x4e84ac30 // usmmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%x[Apanel], #0x20]\n"
- ".inst 0x4e84ac56 // usmmla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85ac59 // usmmla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%x[Apanel], #0x30]\n"
- ".inst 0x4e84acdc // usmmla v28.4s, v6.16b, v4.16b\n"
- "ldr q4, [x22, #0x60]\n"
- ".inst 0x4e85acdf // usmmla v31.4s, v6.16b, v5.16b\n"
- "ldr q6, [%x[Apanel], #0x40]\n"
- "ldr q5, [x22, #0x70]\n"
- ".inst 0x4e87ac08 // usmmla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x4e83ac0b // usmmla v11.4s, v0.16b, v3.16b\n"
- ".inst 0x4e87ac2e // usmmla v14.4s, v1.16b, v7.16b\n"
- ".inst 0x4e83ac31 // usmmla v17.4s, v1.16b, v3.16b\n"
- ".inst 0x4e87ac54 // usmmla v20.4s, v2.16b, v7.16b\n"
- ".inst 0x4e83ac57 // usmmla v23.4s, v2.16b, v3.16b\n"
- ".inst 0x4e87acda // usmmla v26.4s, v6.16b, v7.16b\n"
- "ldr q7, [x22, #0x80]\n"
- ".inst 0x4e83acdd // usmmla v29.4s, v6.16b, v3.16b\n"
- "ldr q3, [x22, #0x90]\n"
- ".inst 0x4e84ac09 // usmmla v9.4s, v0.16b, v4.16b\n"
- ".inst 0x4e85ac0c // usmmla v12.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84ac2f // usmmla v15.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85ac32 // usmmla v18.4s, v1.16b, v5.16b\n"
- ".inst 0x4e84ac55 // usmmla v21.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85ac58 // usmmla v24.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84acdb // usmmla v27.4s, v6.16b, v4.16b\n"
- "ldr q4, [x22, #0xa0]\n"
- ".inst 0x4e85acde // usmmla v30.4s, v6.16b, v5.16b\n"
- "ldr q5, [x22, #0xb0]\n"
- ".inst 0x4e87ac0a // usmmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e83ac0d // usmmla v13.4s, v0.16b, v3.16b\n"
- "ldr q0, [%x[Apanel], #0x50]\n"
- ".inst 0x4e87ac30 // usmmla v16.4s, v1.16b, v7.16b\n"
- ".inst 0x4e83ac33 // usmmla v19.4s, v1.16b, v3.16b\n"
- "ldr q1, [%x[Apanel], #0x60]\n"
- ".inst 0x4e87ac56 // usmmla v22.4s, v2.16b, v7.16b\n"
- ".inst 0x4e83ac59 // usmmla v25.4s, v2.16b, v3.16b\n"
- "ldr q2, [%x[Apanel], #0x70]\n"
- ".inst 0x4e87acdc // usmmla v28.4s, v6.16b, v7.16b\n"
- ".inst 0x4e83acdf // usmmla v31.4s, v6.16b, v3.16b\n"
- "add %x[Apanel], %x[Apanel], #0x80\n"
- "add x22, x22, #0xc0\n"
- "bge 3b\n"
- "4:" // main loop skip
- "ldr q3, [%x[Apanel], #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
- ".inst 0x4e84ac08 // usmmla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [x22, #0x10]\n"
- ".inst 0x4e85ac0b // usmmla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84ac2e // usmmla v14.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85ac31 // usmmla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x4e84ac54 // usmmla v20.4s, v2.16b, v4.16b\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- ".inst 0x4e85ac57 // usmmla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84ac7a // usmmla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [x22, #0x20]\n"
- ".inst 0x4e85ac7d // usmmla v29.4s, v3.16b, v5.16b\n"
- "ldr q5, [x22, #0x30]\n"
- ".inst 0x4e86ac09 // usmmla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87ac0c // usmmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86ac2f // usmmla v15.4s, v1.16b, v6.16b\n"
- "add x22, x22, #0x40\n"
- ".inst 0x4e87ac32 // usmmla v18.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86ac55 // usmmla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87ac58 // usmmla v24.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86ac7b // usmmla v27.4s, v3.16b, v6.16b\n"
- ".inst 0x4e87ac7e // usmmla v30.4s, v3.16b, v7.16b\n"
- ".inst 0x4e84ac0a // usmmla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x4e85ac0d // usmmla v13.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84ac30 // usmmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x4e84ac56 // usmmla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85ac59 // usmmla v25.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84ac7c // usmmla v28.4s, v3.16b, v4.16b\n"
- ".inst 0x4e85ac7f // usmmla v31.4s, v3.16b, v5.16b\n"
- "cbz x20, 5f\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q7, [%x[Apanel], #0x0]\n"
- "ldr q6, [%x[Apanel], #0x10]\n"
- "ldr q0, [x22, #0x10]\n"
- "ldr q5, [%x[Apanel], #0x20]\n"
- "ldr q4, [%x[Apanel], #0x30]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- "ldr q3, [x22, #0x20]\n"
- "ldr q2, [x22, #0x30]\n"
- ".inst 0x4e81ace8 // usmmla v8.4s, v7.16b, v1.16b\n"
- ".inst 0x4e80aceb // usmmla v11.4s, v7.16b, v0.16b\n"
- ".inst 0x4e81acce // usmmla v14.4s, v6.16b, v1.16b\n"
- ".inst 0x4e80acd1 // usmmla v17.4s, v6.16b, v0.16b\n"
- ".inst 0x4e81acb4 // usmmla v20.4s, v5.16b, v1.16b\n"
- ".inst 0x4e80acb7 // usmmla v23.4s, v5.16b, v0.16b\n"
- ".inst 0x4e81ac9a // usmmla v26.4s, v4.16b, v1.16b\n"
- "ldr q1, [x22, #0x40]\n"
- ".inst 0x4e80ac9d // usmmla v29.4s, v4.16b, v0.16b\n"
- "ldr q0, [x22, #0x50]\n"
- ".inst 0x4e83ace9 // usmmla v9.4s, v7.16b, v3.16b\n"
- ".inst 0x4e82acec // usmmla v12.4s, v7.16b, v2.16b\n"
- ".inst 0x4e83accf // usmmla v15.4s, v6.16b, v3.16b\n"
- "add x22, x22, #0x60\n"
- ".inst 0x4e82acd2 // usmmla v18.4s, v6.16b, v2.16b\n"
- ".inst 0x4e83acb5 // usmmla v21.4s, v5.16b, v3.16b\n"
- ".inst 0x4e82acb8 // usmmla v24.4s, v5.16b, v2.16b\n"
- ".inst 0x4e83ac9b // usmmla v27.4s, v4.16b, v3.16b\n"
- ".inst 0x4e82ac9e // usmmla v30.4s, v4.16b, v2.16b\n"
- ".inst 0x4e81acea // usmmla v10.4s, v7.16b, v1.16b\n"
- ".inst 0x4e80aced // usmmla v13.4s, v7.16b, v0.16b\n"
- ".inst 0x4e81acd0 // usmmla v16.4s, v6.16b, v1.16b\n"
- ".inst 0x4e80acd3 // usmmla v19.4s, v6.16b, v0.16b\n"
- ".inst 0x4e81acb6 // usmmla v22.4s, v5.16b, v1.16b\n"
- ".inst 0x4e80acb9 // usmmla v25.4s, v5.16b, v0.16b\n"
- ".inst 0x4e81ac9c // usmmla v28.4s, v4.16b, v1.16b\n"
- ".inst 0x4e80ac9f // usmmla v31.4s, v4.16b, v0.16b\n"
- "5:" // multiply loop done
- "subs x23, x23, #0x1\n"
- "uzp1 v2.2d, v8.2d, v11.2d\n"
- "uzp2 v8.2d, v8.2d, v11.2d\n"
- "uzp1 v1.2d, v9.2d, v12.2d\n"
- "uzp2 v9.2d, v9.2d, v12.2d\n"
- "uzp1 v0.2d, v10.2d, v13.2d\n"
- "uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q2, [%x[Cpanel], #0x0]\n"
- "uzp1 v3.2d, v14.2d, v17.2d\n"
- "uzp2 v14.2d, v14.2d, v17.2d\n"
- "str q1, [%x[Cpanel], #0x10]\n"
- "uzp1 v2.2d, v15.2d, v18.2d\n"
- "uzp2 v15.2d, v15.2d, v18.2d\n"
- "str q0, [%x[Cpanel], #0x20]\n"
- "uzp1 v17.2d, v16.2d, v19.2d\n"
- "uzp2 v16.2d, v16.2d, v19.2d\n"
- "str q8, [%x[Cpanel], #0x30]\n"
- "uzp1 v1.2d, v20.2d, v23.2d\n"
- "uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q9, [%x[Cpanel], #0x40]\n"
- "uzp1 v0.2d, v21.2d, v24.2d\n"
- "uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q10, [%x[Cpanel], #0x50]\n"
- "uzp1 v23.2d, v22.2d, v25.2d\n"
- "uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q3, [%x[Cpanel], #0x60]\n"
- "uzp1 v19.2d, v26.2d, v29.2d\n"
- "uzp2 v26.2d, v26.2d, v29.2d\n"
- "str q2, [%x[Cpanel], #0x70]\n"
- "uzp1 v18.2d, v27.2d, v30.2d\n"
- "uzp2 v27.2d, v27.2d, v30.2d\n"
- "str q17, [%x[Cpanel], #0x80]\n"
- "uzp1 v17.2d, v28.2d, v31.2d\n"
- "uzp2 v28.2d, v28.2d, v31.2d\n"
- "str q14, [%x[Cpanel], #0x90]\n"
- "str q15, [%x[Cpanel], #0xa0]\n"
- "str q16, [%x[Cpanel], #0xb0]\n"
- "str q1, [%x[Cpanel], #0xc0]\n"
- "str q0, [%x[Cpanel], #0xd0]\n"
- "str q23, [%x[Cpanel], #0xe0]\n"
- "str q20, [%x[Cpanel], #0xf0]\n"
- "str q21, [%x[Cpanel], #0x100]\n"
- "str q22, [%x[Cpanel], #0x110]\n"
- "str q19, [%x[Cpanel], #0x120]\n"
- "str q18, [%x[Cpanel], #0x130]\n"
- "str q17, [%x[Cpanel], #0x140]\n"
- "str q26, [%x[Cpanel], #0x150]\n"
- "str q27, [%x[Cpanel], #0x160]\n"
- "str q28, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index 922438a255..1073d15f01 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -40,8 +40,7 @@ void a64_interleaved_u8u32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_u8u32_mmla_8x12
{
public:
- typedef uint8_t lhs_operand_type;
- typedef uint8_t rhs_operand_type;
+ typedef uint8_t operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -63,8 +62,8 @@ public:
}
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 8> transforms = {};
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
index 8affb0ea86..741fa6ac08 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
@@ -54,18 +54,18 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
"2:" // Width loop
"ldp q4, q5, [x22], #0x20\n"
"mov %x[Apanel], x21\n"
- "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
"movi v8.4s, #0x0\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
"movi v11.4s, #0x0\n"
- "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
"movi v12.4s, #0x0\n"
"movi v13.4s, #0x0\n"
- "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
- "cmp x20, #0x2\n"
"movi v14.4s, #0x0\n"
- "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
"movi v15.4s, #0x0\n"
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -97,7 +97,7 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
".inst 0x6e84a4da // ummla v26.4s, v6.16b, v4.16b\n"
"cmp x20, #0x2\n"
".inst 0x6e85a4dd // ummla v29.4s, v6.16b, v5.16b\n"
- "ldp q5, q4, [x22], #0x20\n"
+ "ldp q4, q5, [x22], #0x20\n"
".inst 0x6e83a409 // ummla v9.4s, v0.16b, v3.16b\n"
".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
".inst 0x6e83a42f // ummla v15.4s, v1.16b, v3.16b\n"
@@ -106,28 +106,28 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
".inst 0x6e83a4db // ummla v27.4s, v6.16b, v3.16b\n"
".inst 0x6e87a4de // ummla v30.4s, v6.16b, v7.16b\n"
- "ldp q3, q7, [x22], #0x20\n"
- ".inst 0x6e85a40a // ummla v10.4s, v0.16b, v5.16b\n"
- ".inst 0x6e84a40d // ummla v13.4s, v0.16b, v4.16b\n"
+ "ldp q7, q3, [x22], #0x20\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
"ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e85a430 // ummla v16.4s, v1.16b, v5.16b\n"
- ".inst 0x6e84a433 // ummla v19.4s, v1.16b, v4.16b\n"
- ".inst 0x6e85a456 // ummla v22.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
"ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e84a459 // ummla v25.4s, v2.16b, v4.16b\n"
- ".inst 0x6e85a4dc // ummla v28.4s, v6.16b, v5.16b\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
"ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e84a4df // ummla v31.4s, v6.16b, v4.16b\n"
- ".inst 0x6e83a408 // ummla v8.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e84a4dc // ummla v28.4s, v6.16b, v4.16b\n"
+ ".inst 0x6e85a4df // ummla v31.4s, v6.16b, v5.16b\n"
"ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x6e83a42e // ummla v14.4s, v1.16b, v3.16b\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e83a454 // ummla v20.4s, v2.16b, v3.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e83a4da // ummla v26.4s, v6.16b, v3.16b\n"
- ".inst 0x6e87a4dd // ummla v29.4s, v6.16b, v7.16b\n"
+ ".inst 0x6e83a40b // ummla v11.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e87a42e // ummla v14.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e83a431 // ummla v17.4s, v1.16b, v3.16b\n"
+ ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a457 // ummla v23.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e87a4da // ummla v26.4s, v6.16b, v7.16b\n"
+ ".inst 0x6e83a4dd // ummla v29.4s, v6.16b, v3.16b\n"
"ldp q7, q3, [x22], #0x20\n"
".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
@@ -143,11 +143,11 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
"ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
".inst 0x6e87a430 // ummla v16.4s, v1.16b, v7.16b\n"
".inst 0x6e83a433 // ummla v19.4s, v1.16b, v3.16b\n"
- ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
"ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
".inst 0x6e83a459 // ummla v25.4s, v2.16b, v3.16b\n"
- ".inst 0x6e87a4dc // ummla v28.4s, v6.16b, v7.16b\n"
"ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a4dc // ummla v28.4s, v6.16b, v7.16b\n"
".inst 0x6e83a4df // ummla v31.4s, v6.16b, v3.16b\n"
"bge 3b\n"
"4:" // main loop skip
@@ -182,9 +182,9 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
"ldp q1, q0, [x22], #0x20\n"
"ld1 { v7.16b }, [%x[Apanel]], #0x10\n"
"ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e81a4e8 // ummla v8.4s, v7.16b, v1.16b\n"
"ld1 { v5.16b }, [%x[Apanel]], #0x10\n"
"ld1 { v4.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e81a4e8 // ummla v8.4s, v7.16b, v1.16b\n"
".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
"ldp q3, q2, [x22], #0x20\n"
".inst 0x6e81a4ce // ummla v14.4s, v6.16b, v1.16b\n"
@@ -212,41 +212,41 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v2.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
"uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
"uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q2, [%x[Cpanel], #0x0]\n"
- "uzp1 v3.2d, v14.2d, v17.2d\n"
- "uzp2 v14.2d, v14.2d, v17.2d\n"
"str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
"uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
- "str q0, [%x[Cpanel], #0x20]\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
"uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
- "str q8, [%x[Cpanel], #0x30]\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
"uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q9, [%x[Cpanel], #0x40]\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
"uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q10, [%x[Cpanel], #0x50]\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
"uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q3, [%x[Cpanel], #0x60]\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
"uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
- "str q2, [%x[Cpanel], #0x70]\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
"uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
- "str q17, [%x[Cpanel], #0x80]\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
"uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
- "str q14, [%x[Cpanel], #0x90]\n"
- "str q15, [%x[Cpanel], #0xa0]\n"
"str q16, [%x[Cpanel], #0xb0]\n"
"str q1, [%x[Cpanel], #0xc0]\n"
"str q0, [%x[Cpanel], #0xd0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
index 747572ef84..613c3f09e5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
@@ -55,21 +55,21 @@ void a64_interleaved_u8u32_mmla_8x12(
"ldr q4, [x22, #0x0]\n"
"ldr q5, [x22, #0x10]\n"
"mov %x[Apanel], x21\n"
- "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
"movi v8.4s, #0x0\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
"add x22, x22, #0x20\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- "cmp x20, #0x2\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
"movi v13.4s, #0x0\n"
"movi v14.4s, #0x0\n"
"movi v15.4s, #0x0\n"
- "add %x[Apanel], %x[Apanel], #0x30\n"
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
@@ -196,19 +196,19 @@ void a64_interleaved_u8u32_mmla_8x12(
"cbz x20, 5f\n"
"ldr q1, [x22, #0x0]\n"
"ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e81a4e8 // ummla v8.4s, v7.16b, v1.16b\n"
"ldr q6, [%x[Apanel], #0x10]\n"
"ldr q0, [x22, #0x10]\n"
+ ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
"ldr q5, [%x[Apanel], #0x20]\n"
"ldr q4, [%x[Apanel], #0x30]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6e81a4ce // ummla v14.4s, v6.16b, v1.16b\n"
"ldr q3, [x22, #0x20]\n"
"ldr q2, [x22, #0x30]\n"
- ".inst 0x6e81a4e8 // ummla v8.4s, v7.16b, v1.16b\n"
- ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
- ".inst 0x6e81a4ce // ummla v14.4s, v6.16b, v1.16b\n"
".inst 0x6e80a4d1 // ummla v17.4s, v6.16b, v0.16b\n"
".inst 0x6e81a4b4 // ummla v20.4s, v5.16b, v1.16b\n"
".inst 0x6e80a4b7 // ummla v23.4s, v5.16b, v0.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
".inst 0x6e81a49a // ummla v26.4s, v4.16b, v1.16b\n"
"ldr q1, [x22, #0x40]\n"
".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
@@ -216,8 +216,8 @@ void a64_interleaved_u8u32_mmla_8x12(
".inst 0x6e83a4e9 // ummla v9.4s, v7.16b, v3.16b\n"
".inst 0x6e82a4ec // ummla v12.4s, v7.16b, v2.16b\n"
".inst 0x6e83a4cf // ummla v15.4s, v6.16b, v3.16b\n"
- "add x22, x22, #0x60\n"
".inst 0x6e82a4d2 // ummla v18.4s, v6.16b, v2.16b\n"
+ "add x22, x22, #0x60\n"
".inst 0x6e83a4b5 // ummla v21.4s, v5.16b, v3.16b\n"
".inst 0x6e82a4b8 // ummla v24.4s, v5.16b, v2.16b\n"
".inst 0x6e83a49b // ummla v27.4s, v4.16b, v3.16b\n"
@@ -232,41 +232,41 @@ void a64_interleaved_u8u32_mmla_8x12(
".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v2.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
"uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
"uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q2, [%x[Cpanel], #0x0]\n"
- "uzp1 v3.2d, v14.2d, v17.2d\n"
- "uzp2 v14.2d, v14.2d, v17.2d\n"
"str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
"uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
- "str q0, [%x[Cpanel], #0x20]\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
"uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
- "str q8, [%x[Cpanel], #0x30]\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
"uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q9, [%x[Cpanel], #0x40]\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
"uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q10, [%x[Cpanel], #0x50]\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
"uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q3, [%x[Cpanel], #0x60]\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
"uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
- "str q2, [%x[Cpanel], #0x70]\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
"uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
- "str q17, [%x[Cpanel], #0x80]\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
"uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
- "str q14, [%x[Cpanel], #0x90]\n"
- "str q15, [%x[Cpanel], #0xa0]\n"
"str q16, [%x[Cpanel], #0xb0]\n"
"str q1, [%x[Cpanel], #0xc0]\n"
"str q0, [%x[Cpanel], #0xd0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
index 4707a17adb..19acfe8ae9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
@@ -49,8 +49,7 @@ void a64_sgemm_asimd_8x12_x1(const float *, const float *, float *, int, int, in
// structure.
class cls_a64_sgemm_8x12 {
public:
- typedef float lhs_operand_type;
- typedef float rhs_operand_type;
+ typedef float operand_type;
typedef float result_type;
typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
@@ -69,7 +68,7 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsFixedTRB<lhs_operand_type, rhs_operand_type, result_type, 8, 12> transforms = {};
+ StdTransformsFixedTRB<operand_type, result_type, 8, 12> transforms = {};
template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp
index b9a2a3a3ef..00ec904e51 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp
@@ -54,312 +54,312 @@ void a64_sgemm_asimd_8x12_a53(const float *Apanel, const float *Bpanel, float *C
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
"1:\n"
// Unroll 0
- "ldr %d[b2], [%[b_ptr], #32]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
"nop\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
ASM_PREFETCH("[%[a_ptr], #320]")
- "ins %[b0].d[1], x20\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
ASM_PREFETCH("[%[b_ptr], #448]")
"nop\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
"nop\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
ASM_PREFETCH("[%[b_ptr], #512]")
- "ins %[b1].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
// Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
"nop\n"
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
-
- "ldr %d[a0], [%[a_ptr], #64]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
-
- "ldr %d[a1], [%[a_ptr], #80]\n"
- "ins %[a0].d[1], x20\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
-
- "ldr %d[b0], [%[b_ptr], #96]\n"
- "ins %[a1].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ins %[a0].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ins %[a1].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
"nop\n"
- "ins %[b0].d[1], x20\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
"nop\n"
"nop\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
"nop\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
"nop\n"
- "ins %[b1].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "bne 1b\n"
+ "bne 1b\n"
// Branch here if K=1 or 2. Do the right thing for odd/even at the end.
"4:\n"
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration. (even K)
- "ldr %d[b2], [%[b_ptr], #32]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
"nop\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
-
- "ins %[b0].d[1], x20\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
"nop\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
"nop\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
"nop\n"
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
-
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "b 3f\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
// Detached final iteration. (odd K)
"2:\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
"nop\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
-
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
// Common tail
"3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
[a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp
index 4303d1346c..de85605561 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp
@@ -54,313 +54,313 @@ void a64_sgemm_asimd_8x12_a55(const float *Apanel, const float *Bpanel, float *C
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
"1:\n"
// Unroll 0
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "subs %w[k], %w[k], #1\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "subs %w[k], %w[k], #1\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
ASM_PREFETCH("[%[a_ptr], #320]")
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
-
- "ldr %d[b1], [%[b_ptr], #64]\n"
- "ins %[b0].d[1], x20\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
ASM_PREFETCH("[%[b_ptr], #448]")
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
ASM_PREFETCH("[%[b_ptr], #512]")
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
// Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
- "ins %[b1].d[1], x20\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ins %[b1].d[1], x20\n"
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "ldr %d[a0], [%[a_ptr], #64]\n"
- "ins %[b2].d[1], x20\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ins %[b2].d[1], x20\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "ldr %d[a1], [%[a_ptr], #80]\n"
- "ins %[a0].d[1], x20\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ins %[a0].d[1], x20\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
- "ins %[a1].d[1], x20\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ins %[a1].d[1], x20\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
- "ins %[b0].d[1], x20\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+ "ins %[b0].d[1], x20\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "ins %[b1].d[1], x20\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "ins %[b1].d[1], x20\n"
- "bne 1b\n"
+ "bne 1b\n"
// Branch here if K=1 or 2. Do the right thing for odd/even at the end.
"4:\n"
- "cbnz %w[oddk], 2f\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "cbnz %w[oddk], 2f\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
// Detached final iteration. (even K)
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
-
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
-
- "ldr %d[b1], [%[b_ptr], #64]\n"
- "ins %[b0].d[1], x20\n"
-
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- "ldr %d[b2], [%[b_ptr], #80]\n"
- "ins %[b1].d[1], x20\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
-
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "b 3f\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ins %[b1].d[1], x20\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
// Detached final iteration. (odd K)
"2:\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
-
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
// Common tail
"3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
[a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp
index fdbbe6b749..928b22a190 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp
@@ -100,275 +100,275 @@ void a64_sgemm_asimd_8x12_a55r1(const float *Apanel, const float *Bpanel, float
// The loop is offset by these two instructions which must
// always be executed.
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
"1:\n"
// Unroll 0
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
ASM_PREFETCH("[%[a_ptr], #448]")
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
ASM_PREFETCH("[%[b_ptr], #576]")
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
// Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "ldr %d[a0], [%[a_ptr], #64]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "ldr %d[a1], [%[a_ptr], #80]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "ins %[a0].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
-
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "ins %[a1].d[1], x20\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
-
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
-
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "ins %[a0].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "ins %[a1].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
ASM_PREFETCH("[%[b_ptr], #640]")
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "b.ne 1b\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "b.ne 1b\n"
// Branch here if K=1 or 2. Do the right thing for odd/even at the end.
"4:\n"
- // Start final iteration - branch off to "odd" code before we load a0a.
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "cbnz %w[oddk], 2f\n"
+ // Start final iteration - branch off to "odd" code before we load a0a.
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "cbnz %w[oddk], 2f\n"
// Even K continuation
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
ASM_PREFETCHW("[%[c_ptr]]")
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
ASM_PREFETCHW("[%[c_ptr], #64]")
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
ASM_PREFETCHW("[%[c_ptr], #128]")
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
ASM_PREFETCHW("[%[c_ptr], #192]")
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "ins %[b2].d[1], x20\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
ASM_PREFETCHW("[%[c_ptr], #256]")
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
ASM_PREFETCHW("[%[c_ptr], #320]")
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
ASM_PREFETCHWL2("[%[c_ptr], #384]")
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
ASM_PREFETCHWL2("[%[c_ptr], #448]")
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
ASM_PREFETCHWL2("[%[c_ptr], #512]")
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
ASM_PREFETCHWL2("[%[c_ptr], #576]")
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
ASM_PREFETCHWL2("[%[c_ptr], #640]")
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
ASM_PREFETCHWL2("[%[c_ptr], #704]")
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "b 3f\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
// Odd K continuation
"2:\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
ASM_PREFETCHW("[%[c_ptr]]")
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
ASM_PREFETCHW("[%[c_ptr], #64]")
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
ASM_PREFETCHW("[%[c_ptr], #128]")
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
ASM_PREFETCHW("[%[c_ptr], #192]")
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
ASM_PREFETCHW("[%[c_ptr], #256]")
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
ASM_PREFETCHW("[%[c_ptr], #320]")
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
ASM_PREFETCHWL2("[%[c_ptr], #384]")
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
ASM_PREFETCHWL2("[%[c_ptr], #448]")
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
ASM_PREFETCHWL2("[%[c_ptr], #512]")
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
ASM_PREFETCHWL2("[%[c_ptr], #576]")
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
ASM_PREFETCHWL2("[%[c_ptr], #640]")
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
ASM_PREFETCHWL2("[%[c_ptr], #704]")
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
// Common tail
"3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
[a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp
index 5e1cce3233..711fc77d9f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp
@@ -64,281 +64,281 @@ void a64_sgemm_asimd_8x12(const float *Apanel, const float *Bpanel, float *Cpane
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
// Loop proper
"1:\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
ASM_PREFETCH("[%[a_ptr], #320]")
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
ASM_PREFETCH("[%[b_ptr], #448]")
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr %q[a0], [%[a_ptr], #64]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "ldr %q[a1], [%[a_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
"fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
ASM_PREFETCH("[%[b_ptr], #512]")
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "bne 1b\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "bne 1b\n"
// Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
"4:\n"
// Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration (even K)
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
"fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
"fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
"fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "str q24, [%[c_ptr], #32]\n"
-
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
"fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
// Detached final iteration (odd K)
"2:\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
"fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
"fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "str q15, [%[c_ptr], #336]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
// Common tail
"3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
[a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp
index 1567b05d3e..a348b4b67e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp
@@ -62,281 +62,281 @@ void a64_sgemm_asimd_8x12_x1(const float *Apanel, const float *Bpanel, float *Cp
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
// Loop proper
"1:\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
ASM_PREFETCH("[%[a_ptr], #320]")
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
ASM_PREFETCH("[%[b_ptr], #448]")
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "ldr %q[a0], [%[a_ptr], #32]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %q[a1], [%[a_ptr], #48]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "ldr %q[a0], [%[a_ptr], #32]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[a1], [%[a_ptr], #48]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
"fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
ASM_PREFETCH("[%[b_ptr], #512]")
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "bne 1b\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "bne 1b\n"
// Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
"4:\n"
// Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration (even K)
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
"fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
"fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "ldr %q[a0], [%[a_ptr], #-32]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %q[a1], [%[a_ptr], #-16]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "ldr %q[b2], [%[b_ptr], #-16]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "ldr %q[a0], [%[a_ptr], #-32]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[a1], [%[a_ptr], #-16]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #-16]\n"
"fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "str q24, [%[c_ptr], #32]\n"
-
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
"fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
// Detached final iteration (odd K)
"2:\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
"fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "str q12, [%[c_ptr], #192]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
"fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "str q15, [%[c_ptr], #336]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
// Common tail
"3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
[a0] "+w" (a0), [a1] "+w" (a1),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6.hpp
index 2fed3264ab..b35cc91a5d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6.hpp
@@ -42,8 +42,7 @@ void a64_sgemm_asimd_8x6(const float *, const float *, float *, int, int, int);
// structure.
class cls_a64_sgemm_8x6 {
public:
- typedef float lhs_operand_type;
- typedef float rhs_operand_type;
+ typedef float operand_type;
typedef float result_type;
typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
@@ -62,7 +61,7 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 6, 1> transforms = {};
kern_type kernel=a64_sgemm_asimd_8x6;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp
index fb5044684f..a968105af1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp
@@ -64,305 +64,305 @@ void a64_sgemm_asimd_8x6(const float *Apanel, const float *Bpanel, float *Cpanel
__asm __volatile (
// Initialize result registers, load initial operands, prime prefetches.
- "movi v8.2s, #0x0\n"
- "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
- "movi v9.2s, #0x0\n"
- "movi v10.2s, #0x0\n"
- "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
- "movi v11.2s, #0x0\n"
- "movi v12.2s, #0x0\n"
- "movi v13.2s, #0x0\n"
- "movi v14.2s, #0x0\n"
+ "movi v8.2s, #0x0\n"
+ "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
+ "movi v9.2s, #0x0\n"
+ "movi v10.2s, #0x0\n"
+ "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
+ "movi v11.2s, #0x0\n"
+ "movi v12.2s, #0x0\n"
+ "movi v13.2s, #0x0\n"
+ "movi v14.2s, #0x0\n"
ASM_PREFETCH("[%[b_ptr], #64]")
ASM_PREFETCHU("[%[a_ptr], #52]")
ASM_PREFETCHU("[%[a_ptr], #116]")
ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.2s, #0x0\n"
- "movi v16.2s, #0x0\n"
- "movi v17.2s, #0x0\n"
- "movi v18.2s, #0x0\n"
- "movi v19.2s, #0x0\n"
- "movi v20.2s, #0x0\n"
- "movi v21.2s, #0x0\n"
- "movi v22.2s, #0x0\n"
- "movi v23.2s, #0x0\n"
- "movi v24.2s, #0x0\n"
- "movi v25.2s, #0x0\n"
- "movi v26.2s, #0x0\n"
- "movi v27.2s, #0x0\n"
- "movi v28.2s, #0x0\n"
- "movi v29.2s, #0x0\n"
- "movi v30.2s, #0x0\n"
- "movi v31.2s, #0x0\n"
+ "movi v15.2s, #0x0\n"
+ "movi v16.2s, #0x0\n"
+ "movi v17.2s, #0x0\n"
+ "movi v18.2s, #0x0\n"
+ "movi v19.2s, #0x0\n"
+ "movi v20.2s, #0x0\n"
+ "movi v21.2s, #0x0\n"
+ "movi v22.2s, #0x0\n"
+ "movi v23.2s, #0x0\n"
+ "movi v24.2s, #0x0\n"
+ "movi v25.2s, #0x0\n"
+ "movi v26.2s, #0x0\n"
+ "movi v27.2s, #0x0\n"
+ "movi v28.2s, #0x0\n"
+ "movi v29.2s, #0x0\n"
+ "movi v30.2s, #0x0\n"
+ "movi v31.2s, #0x0\n"
// Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
+ "cbz %w[k], 4f\n"
// Loop proper
"1:\n"
- "ldr %d[b0], [%[b_ptr], #0]\n"
- "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
- "ldr %d[b1], [%[b_ptr], #8]\n"
- "fmla v8.2s , %[b0].2s, %[a0].2s\n"
- "fmla v9.2s , %[b0].2s, %[a1].2s\n"
- "fmla v10.2s, %[b0].2s, %[a2].2s\n"
-
- "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
- "fmla v16.2s, %[b1].2s, %[a0].2s\n"
- "fmla v17.2s, %[b1].2s, %[a1].2s\n"
- "fmla v11.2s, %[b0].2s, %[a3].2s\n"
-
- "ldr %d[b2], [%[b_ptr], #16]\n"
- "fmla v18.2s, %[b1].2s, %[a2].2s\n"
- "fmla v19.2s, %[b1].2s, %[a3].2s\n"
- "fmla v24.2s, %[b2].2s, %[a0].2s\n"
-
- "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
- "fmla v25.2s, %[b2].2s, %[a1].2s\n"
- "fmla v26.2s, %[b2].2s, %[a2].2s\n"
- "fmla v27.2s, %[b2].2s, %[a3].2s\n"
-
- "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
- "fmla v12.2s, %[b0].2s, %[a0].2s\n"
- "fmla v20.2s, %[b1].2s, %[a0].2s\n"
- "fmla v28.2s, %[b2].2s, %[a0].2s\n"
-
- "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
- "fmla v13.2s, %[b0].2s, %[a1].2s\n"
- "fmla v21.2s, %[b1].2s, %[a1].2s\n"
- "fmla v29.2s, %[b2].2s, %[a1].2s\n"
-
- "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
- "fmla v14.2s, %[b0].2s, %[a2].2s\n"
- "fmla v22.2s, %[b1].2s, %[a2].2s\n"
- "fmla v30.2s, %[b2].2s, %[a2].2s\n"
-
- "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
- "fmla v15.2s, %[b0].2s, %[a3].2s\n"
- "fmla v23.2s, %[b1].2s, %[a3].2s\n"
- "fmla v31.2s, %[b2].2s, %[a3].2s\n"
-
- "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
+ "ldr %d[b0], [%[b_ptr], #0]\n"
+ "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
+ "ldr %d[b1], [%[b_ptr], #8]\n"
+ "fmla v8.2s , %[b0].2s, %[a0].2s\n"
+ "fmla v9.2s , %[b0].2s, %[a1].2s\n"
+ "fmla v10.2s, %[b0].2s, %[a2].2s\n"
+
+ "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
+ "fmla v16.2s, %[b1].2s, %[a0].2s\n"
+ "fmla v17.2s, %[b1].2s, %[a1].2s\n"
+ "fmla v11.2s, %[b0].2s, %[a3].2s\n"
+
+ "ldr %d[b2], [%[b_ptr], #16]\n"
+ "fmla v18.2s, %[b1].2s, %[a2].2s\n"
+ "fmla v19.2s, %[b1].2s, %[a3].2s\n"
+ "fmla v24.2s, %[b2].2s, %[a0].2s\n"
+
+ "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
+ "fmla v25.2s, %[b2].2s, %[a1].2s\n"
+ "fmla v26.2s, %[b2].2s, %[a2].2s\n"
+ "fmla v27.2s, %[b2].2s, %[a3].2s\n"
+
+ "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
+ "fmla v12.2s, %[b0].2s, %[a0].2s\n"
+ "fmla v20.2s, %[b1].2s, %[a0].2s\n"
+ "fmla v28.2s, %[b2].2s, %[a0].2s\n"
+
+ "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
+ "fmla v13.2s, %[b0].2s, %[a1].2s\n"
+ "fmla v21.2s, %[b1].2s, %[a1].2s\n"
+ "fmla v29.2s, %[b2].2s, %[a1].2s\n"
+
+ "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
+ "fmla v14.2s, %[b0].2s, %[a2].2s\n"
+ "fmla v22.2s, %[b1].2s, %[a2].2s\n"
+ "fmla v30.2s, %[b2].2s, %[a2].2s\n"
+
+ "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
+ "fmla v15.2s, %[b0].2s, %[a3].2s\n"
+ "fmla v23.2s, %[b1].2s, %[a3].2s\n"
+ "fmla v31.2s, %[b2].2s, %[a3].2s\n"
+
+ "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "subs %w[k], %w[k], #1\n"
+ "subs %w[k], %w[k], #1\n"
ASM_PREFETCHU("[%[a_ptr], #156]")
- "ldr %d[b0], [%[b_ptr], #24]\n"
- "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
-
- "ldr %d[b1], [%[b_ptr], #32]\n"
- "fmla v8.2s , %[b0].2s, %[a0].2s\n"
- "fmla v9.2s , %[b0].2s, %[a1].2s\n"
- "fmla v10.2s, %[b0].2s, %[a2].2s\n"
-
- "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
- "fmla v16.2s, %[b1].2s, %[a0].2s\n"
- "fmla v17.2s, %[b1].2s, %[a1].2s\n"
- "fmla v11.2s, %[b0].2s, %[a3].2s\n"
-
- "ldr %d[b2], [%[b_ptr], #40]\n"
- "fmla v18.2s, %[b1].2s, %[a2].2s\n"
- "fmla v19.2s, %[b1].2s, %[a3].2s\n"
- "fmla v24.2s, %[b2].2s, %[a0].2s\n"
-
- "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
- "fmla v25.2s, %[b2].2s, %[a1].2s\n"
- "fmla v26.2s, %[b2].2s, %[a2].2s\n"
- "fmla v27.2s, %[b2].2s, %[a3].2s\n"
-
- "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
- "fmla v12.2s, %[b0].2s, %[a0].2s\n"
- "fmla v20.2s, %[b1].2s, %[a0].2s\n"
- "fmla v28.2s, %[b2].2s, %[a0].2s\n"
-
- "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
- "fmla v13.2s, %[b0].2s, %[a1].2s\n"
- "fmla v21.2s, %[b1].2s, %[a1].2s\n"
- "fmla v29.2s, %[b2].2s, %[a1].2s\n"
-
- "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
- "fmla v14.2s, %[b0].2s, %[a2].2s\n"
- "fmla v22.2s, %[b1].2s, %[a2].2s\n"
- "fmla v30.2s, %[b2].2s, %[a2].2s\n"
-
- "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
- "fmla v15.2s, %[b0].2s, %[a3].2s\n"
- "fmla v23.2s, %[b1].2s, %[a3].2s\n"
- "fmla v31.2s, %[b2].2s, %[a3].2s\n"
-
- "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
- "add %[b_ptr], %[b_ptr], #48\n"
+ "ldr %d[b0], [%[b_ptr], #24]\n"
+ "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
+
+ "ldr %d[b1], [%[b_ptr], #32]\n"
+ "fmla v8.2s , %[b0].2s, %[a0].2s\n"
+ "fmla v9.2s , %[b0].2s, %[a1].2s\n"
+ "fmla v10.2s, %[b0].2s, %[a2].2s\n"
+
+ "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
+ "fmla v16.2s, %[b1].2s, %[a0].2s\n"
+ "fmla v17.2s, %[b1].2s, %[a1].2s\n"
+ "fmla v11.2s, %[b0].2s, %[a3].2s\n"
+
+ "ldr %d[b2], [%[b_ptr], #40]\n"
+ "fmla v18.2s, %[b1].2s, %[a2].2s\n"
+ "fmla v19.2s, %[b1].2s, %[a3].2s\n"
+ "fmla v24.2s, %[b2].2s, %[a0].2s\n"
+
+ "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
+ "fmla v25.2s, %[b2].2s, %[a1].2s\n"
+ "fmla v26.2s, %[b2].2s, %[a2].2s\n"
+ "fmla v27.2s, %[b2].2s, %[a3].2s\n"
+
+ "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
+ "fmla v12.2s, %[b0].2s, %[a0].2s\n"
+ "fmla v20.2s, %[b1].2s, %[a0].2s\n"
+ "fmla v28.2s, %[b2].2s, %[a0].2s\n"
+
+ "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
+ "fmla v13.2s, %[b0].2s, %[a1].2s\n"
+ "fmla v21.2s, %[b1].2s, %[a1].2s\n"
+ "fmla v29.2s, %[b2].2s, %[a1].2s\n"
+
+ "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
+ "fmla v14.2s, %[b0].2s, %[a2].2s\n"
+ "fmla v22.2s, %[b1].2s, %[a2].2s\n"
+ "fmla v30.2s, %[b2].2s, %[a2].2s\n"
+
+ "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
+ "fmla v15.2s, %[b0].2s, %[a3].2s\n"
+ "fmla v23.2s, %[b1].2s, %[a3].2s\n"
+ "fmla v31.2s, %[b2].2s, %[a3].2s\n"
+
+ "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
ASM_PREFETCHU("[%[a_ptr], #188]")
- "bne 1b\n"
+ "bne 1b\n"
// Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
"4:\n"
ASM_PREFETCH("[%[c_ptr]]")
ASM_PREFETCH("[%[c_ptr], #64]")
- "ldr %d[b0], [%[b_ptr]]\n"
- "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
+ "ldr %d[b0], [%[b_ptr]]\n"
+ "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
// Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration (even K)
- "ldr %d[b1], [%[b_ptr], #8]\n"
- "fmla v8.2s , %[b0].2s, %[a0].2s\n"
- "fmla v9.2s , %[b0].2s, %[a1].2s\n"
- "fmla v10.2s, %[b0].2s, %[a2].2s\n"
-
- "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
- "fmla v16.2s, %[b1].2s, %[a0].2s\n"
- "fmla v17.2s, %[b1].2s, %[a1].2s\n"
- "fmla v11.2s, %[b0].2s, %[a3].2s\n"
-
- "ldr %d[b2], [%[b_ptr], #16]\n"
- "fmla v18.2s, %[b1].2s, %[a2].2s\n"
- "fmla v19.2s, %[b1].2s, %[a3].2s\n"
- "fmla v24.2s, %[b2].2s, %[a0].2s\n"
-
- "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
- "fmla v25.2s, %[b2].2s, %[a1].2s\n"
- "fmla v26.2s, %[b2].2s, %[a2].2s\n"
- "fmla v27.2s, %[b2].2s, %[a3].2s\n"
-
- "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
- "fmla v12.2s, %[b0].2s, %[a0].2s\n"
- "fmla v20.2s, %[b1].2s, %[a0].2s\n"
- "fmla v28.2s, %[b2].2s, %[a0].2s\n"
-
- "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
- "fmla v13.2s, %[b0].2s, %[a1].2s\n"
- "fmla v21.2s, %[b1].2s, %[a1].2s\n"
- "fmla v29.2s, %[b2].2s, %[a1].2s\n"
-
- "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
- "fmla v14.2s, %[b0].2s, %[a2].2s\n"
- "fmla v22.2s, %[b1].2s, %[a2].2s\n"
- "fmla v30.2s, %[b2].2s, %[a2].2s\n"
-
- "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
- "fmla v15.2s, %[b0].2s, %[a3].2s\n"
- "fmla v23.2s, %[b1].2s, %[a3].2s\n"
- "fmla v31.2s, %[b2].2s, %[a3].2s\n"
-
- "ldr %d[b0], [%[b_ptr], #24]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
+ "ldr %d[b1], [%[b_ptr], #8]\n"
+ "fmla v8.2s , %[b0].2s, %[a0].2s\n"
+ "fmla v9.2s , %[b0].2s, %[a1].2s\n"
+ "fmla v10.2s, %[b0].2s, %[a2].2s\n"
+
+ "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
+ "fmla v16.2s, %[b1].2s, %[a0].2s\n"
+ "fmla v17.2s, %[b1].2s, %[a1].2s\n"
+ "fmla v11.2s, %[b0].2s, %[a3].2s\n"
+
+ "ldr %d[b2], [%[b_ptr], #16]\n"
+ "fmla v18.2s, %[b1].2s, %[a2].2s\n"
+ "fmla v19.2s, %[b1].2s, %[a3].2s\n"
+ "fmla v24.2s, %[b2].2s, %[a0].2s\n"
+
+ "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
+ "fmla v25.2s, %[b2].2s, %[a1].2s\n"
+ "fmla v26.2s, %[b2].2s, %[a2].2s\n"
+ "fmla v27.2s, %[b2].2s, %[a3].2s\n"
+
+ "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
+ "fmla v12.2s, %[b0].2s, %[a0].2s\n"
+ "fmla v20.2s, %[b1].2s, %[a0].2s\n"
+ "fmla v28.2s, %[b2].2s, %[a0].2s\n"
+
+ "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
+ "fmla v13.2s, %[b0].2s, %[a1].2s\n"
+ "fmla v21.2s, %[b1].2s, %[a1].2s\n"
+ "fmla v29.2s, %[b2].2s, %[a1].2s\n"
+
+ "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
+ "fmla v14.2s, %[b0].2s, %[a2].2s\n"
+ "fmla v22.2s, %[b1].2s, %[a2].2s\n"
+ "fmla v30.2s, %[b2].2s, %[a2].2s\n"
+
+ "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
+ "fmla v15.2s, %[b0].2s, %[a3].2s\n"
+ "fmla v23.2s, %[b1].2s, %[a3].2s\n"
+ "fmla v31.2s, %[b2].2s, %[a3].2s\n"
+
+ "ldr %d[b0], [%[b_ptr], #24]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
ASM_PREFETCH("[%[b_ptr], #128]")
- "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
- "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
-
- "ldr %d[b1], [%[b_ptr], #-16]\n"
- "fmla v8.2s , %[b0].2s, %[a0].2s\n"
- "fmla v9.2s , %[b0].2s, %[a1].2s\n"
- "fmla v10.2s, %[b0].2s, %[a2].2s\n"
-
- "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
- "fmla v16.2s, %[b1].2s, %[a0].2s\n"
- "fmla v17.2s, %[b1].2s, %[a1].2s\n"
- "fmla v11.2s, %[b0].2s, %[a3].2s\n"
-
- "ldr %d[b2], [%[b_ptr], #-8]\n"
- "fmla v18.2s, %[b1].2s, %[a2].2s\n"
- "fmla v19.2s, %[b1].2s, %[a3].2s\n"
- "fmla v24.2s, %[b2].2s, %[a0].2s\n"
-
- "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
- "fmla v25.2s, %[b2].2s, %[a1].2s\n"
- "fmla v26.2s, %[b2].2s, %[a2].2s\n"
- "fmla v27.2s, %[b2].2s, %[a3].2s\n"
-
- "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
- "fmla v12.2s, %[b0].2s, %[a0].2s\n"
- "fmla v20.2s, %[b1].2s, %[a0].2s\n"
- "fmla v28.2s, %[b2].2s, %[a0].2s\n"
-
- "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
- "fmla v13.2s, %[b0].2s, %[a1].2s\n"
- "fmla v21.2s, %[b1].2s, %[a1].2s\n"
- "fmla v29.2s, %[b2].2s, %[a1].2s\n"
-
- "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
- "fmla v14.2s, %[b0].2s, %[a2].2s\n"
- "fmla v22.2s, %[b1].2s, %[a2].2s\n"
- "fmla v30.2s, %[b2].2s, %[a2].2s\n"
-
- "fmla v15.2s, %[b0].2s, %[a3].2s\n"
- "fmla v23.2s, %[b1].2s, %[a3].2s\n"
- "fmla v31.2s, %[b2].2s, %[a3].2s\n"
-
- "b 3f\n"
+ "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
+ "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
+
+ "ldr %d[b1], [%[b_ptr], #-16]\n"
+ "fmla v8.2s , %[b0].2s, %[a0].2s\n"
+ "fmla v9.2s , %[b0].2s, %[a1].2s\n"
+ "fmla v10.2s, %[b0].2s, %[a2].2s\n"
+
+ "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
+ "fmla v16.2s, %[b1].2s, %[a0].2s\n"
+ "fmla v17.2s, %[b1].2s, %[a1].2s\n"
+ "fmla v11.2s, %[b0].2s, %[a3].2s\n"
+
+ "ldr %d[b2], [%[b_ptr], #-8]\n"
+ "fmla v18.2s, %[b1].2s, %[a2].2s\n"
+ "fmla v19.2s, %[b1].2s, %[a3].2s\n"
+ "fmla v24.2s, %[b2].2s, %[a0].2s\n"
+
+ "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
+ "fmla v25.2s, %[b2].2s, %[a1].2s\n"
+ "fmla v26.2s, %[b2].2s, %[a2].2s\n"
+ "fmla v27.2s, %[b2].2s, %[a3].2s\n"
+
+ "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
+ "fmla v12.2s, %[b0].2s, %[a0].2s\n"
+ "fmla v20.2s, %[b1].2s, %[a0].2s\n"
+ "fmla v28.2s, %[b2].2s, %[a0].2s\n"
+
+ "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
+ "fmla v13.2s, %[b0].2s, %[a1].2s\n"
+ "fmla v21.2s, %[b1].2s, %[a1].2s\n"
+ "fmla v29.2s, %[b2].2s, %[a1].2s\n"
+
+ "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
+ "fmla v14.2s, %[b0].2s, %[a2].2s\n"
+ "fmla v22.2s, %[b1].2s, %[a2].2s\n"
+ "fmla v30.2s, %[b2].2s, %[a2].2s\n"
+
+ "fmla v15.2s, %[b0].2s, %[a3].2s\n"
+ "fmla v23.2s, %[b1].2s, %[a3].2s\n"
+ "fmla v31.2s, %[b2].2s, %[a3].2s\n"
+
+ "b 3f\n"
// Detached final iteration (odd K)
"2:\n"
- "ldr %d[b1], [%[b_ptr], #8]\n"
- "fmla v8.2s , %[b0].2s, %[a0].2s\n"
- "fmla v9.2s , %[b0].2s, %[a1].2s\n"
- "fmla v10.2s, %[b0].2s, %[a2].2s\n"
-
- "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
- "fmla v16.2s, %[b1].2s, %[a0].2s\n"
- "fmla v17.2s, %[b1].2s, %[a1].2s\n"
- "fmla v11.2s, %[b0].2s, %[a3].2s\n"
-
- "ldr %d[b2], [%[b_ptr], #16]\n"
- "fmla v18.2s, %[b1].2s, %[a2].2s\n"
- "fmla v19.2s, %[b1].2s, %[a3].2s\n"
- "fmla v24.2s, %[b2].2s, %[a0].2s\n"
-
- "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
- "fmla v25.2s, %[b2].2s, %[a1].2s\n"
- "fmla v26.2s, %[b2].2s, %[a2].2s\n"
- "fmla v27.2s, %[b2].2s, %[a3].2s\n"
-
- "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
- "fmla v12.2s, %[b0].2s, %[a0].2s\n"
- "fmla v20.2s, %[b1].2s, %[a0].2s\n"
- "fmla v28.2s, %[b2].2s, %[a0].2s\n"
-
- "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
- "fmla v13.2s, %[b0].2s, %[a1].2s\n"
- "fmla v21.2s, %[b1].2s, %[a1].2s\n"
- "fmla v29.2s, %[b2].2s, %[a1].2s\n"
-
- "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
- "fmla v14.2s, %[b0].2s, %[a2].2s\n"
- "fmla v22.2s, %[b1].2s, %[a2].2s\n"
- "fmla v30.2s, %[b2].2s, %[a2].2s\n"
-
- "fmla v15.2s, %[b0].2s, %[a3].2s\n"
- "fmla v23.2s, %[b1].2s, %[a3].2s\n"
- "fmla v31.2s, %[b2].2s, %[a3].2s\n"
-
- "add %[b_ptr], %[b_ptr], #24\n"
+ "ldr %d[b1], [%[b_ptr], #8]\n"
+ "fmla v8.2s , %[b0].2s, %[a0].2s\n"
+ "fmla v9.2s , %[b0].2s, %[a1].2s\n"
+ "fmla v10.2s, %[b0].2s, %[a2].2s\n"
+
+ "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
+ "fmla v16.2s, %[b1].2s, %[a0].2s\n"
+ "fmla v17.2s, %[b1].2s, %[a1].2s\n"
+ "fmla v11.2s, %[b0].2s, %[a3].2s\n"
+
+ "ldr %d[b2], [%[b_ptr], #16]\n"
+ "fmla v18.2s, %[b1].2s, %[a2].2s\n"
+ "fmla v19.2s, %[b1].2s, %[a3].2s\n"
+ "fmla v24.2s, %[b2].2s, %[a0].2s\n"
+
+ "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
+ "fmla v25.2s, %[b2].2s, %[a1].2s\n"
+ "fmla v26.2s, %[b2].2s, %[a2].2s\n"
+ "fmla v27.2s, %[b2].2s, %[a3].2s\n"
+
+ "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
+ "fmla v12.2s, %[b0].2s, %[a0].2s\n"
+ "fmla v20.2s, %[b1].2s, %[a0].2s\n"
+ "fmla v28.2s, %[b2].2s, %[a0].2s\n"
+
+ "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
+ "fmla v13.2s, %[b0].2s, %[a1].2s\n"
+ "fmla v21.2s, %[b1].2s, %[a1].2s\n"
+ "fmla v29.2s, %[b2].2s, %[a1].2s\n"
+
+ "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
+ "fmla v14.2s, %[b0].2s, %[a2].2s\n"
+ "fmla v22.2s, %[b1].2s, %[a2].2s\n"
+ "fmla v30.2s, %[b2].2s, %[a2].2s\n"
+
+ "fmla v15.2s, %[b0].2s, %[a3].2s\n"
+ "fmla v23.2s, %[b1].2s, %[a3].2s\n"
+ "fmla v31.2s, %[b2].2s, %[a3].2s\n"
+
+ "add %[b_ptr], %[b_ptr], #24\n"
// Common tail
"3:\n"
- "str d8, [%[c_ptr], #0]\n"
- "str d16, [%[c_ptr], #8]\n"
- "str d24, [%[c_ptr], #16]\n"
- "str d9, [%[c_ptr], #24]\n"
- "str d17, [%[c_ptr], #32]\n"
- "str d25, [%[c_ptr], #40]\n"
- "str d10, [%[c_ptr], #48]\n"
- "str d18, [%[c_ptr], #56]\n"
- "str d26, [%[c_ptr], #64]\n"
- "str d11, [%[c_ptr], #72]\n"
- "str d19, [%[c_ptr], #80]\n"
- "str d27, [%[c_ptr], #88]\n"
- "str d12, [%[c_ptr], #96]\n"
- "str d20, [%[c_ptr], #104]\n"
- "str d28, [%[c_ptr], #112]\n"
- "str d13, [%[c_ptr], #120]\n"
- "str d21, [%[c_ptr], #128]\n"
- "str d29, [%[c_ptr], #136]\n"
- "str d14, [%[c_ptr], #144]\n"
- "str d22, [%[c_ptr], #152]\n"
- "str d30, [%[c_ptr], #160]\n"
- "str d15, [%[c_ptr], #168]\n"
- "str d23, [%[c_ptr], #176]\n"
- "str d31, [%[c_ptr], #184]\n"
- "add %[c_ptr], %[c_ptr], #192\n"
+ "str d8, [%[c_ptr], #0]\n"
+ "str d16, [%[c_ptr], #8]\n"
+ "str d24, [%[c_ptr], #16]\n"
+ "str d9, [%[c_ptr], #24]\n"
+ "str d17, [%[c_ptr], #32]\n"
+ "str d25, [%[c_ptr], #40]\n"
+ "str d10, [%[c_ptr], #48]\n"
+ "str d18, [%[c_ptr], #56]\n"
+ "str d26, [%[c_ptr], #64]\n"
+ "str d11, [%[c_ptr], #72]\n"
+ "str d19, [%[c_ptr], #80]\n"
+ "str d27, [%[c_ptr], #88]\n"
+ "str d12, [%[c_ptr], #96]\n"
+ "str d20, [%[c_ptr], #104]\n"
+ "str d28, [%[c_ptr], #112]\n"
+ "str d13, [%[c_ptr], #120]\n"
+ "str d21, [%[c_ptr], #128]\n"
+ "str d29, [%[c_ptr], #136]\n"
+ "str d14, [%[c_ptr], #144]\n"
+ "str d22, [%[c_ptr], #152]\n"
+ "str d30, [%[c_ptr], #160]\n"
+ "str d15, [%[c_ptr], #168]\n"
+ "str d23, [%[c_ptr], #176]\n"
+ "str d31, [%[c_ptr], #184]\n"
+ "add %[c_ptr], %[c_ptr], #192\n"
:
[a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
index 3616f39f2a..702b5f69ff 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
@@ -170,28 +170,28 @@ void a64_sgemv_pretransposed(const float *A, int lda, const float *X, float *Y,
x0 = vld1q_f32(x_ptr);
__asm __volatile (
- "ldr q2, [%[a_ptr], #0]\n"
- "ldr q3, [%[a_ptr], #16]\n"
- "ldr q4, [%[a_ptr], #32]\n"
- "ldr q5, [%[a_ptr], #48]\n"
- "ldr q6, [%[a_ptr], #64]\n"
- "ldr q7, [%[a_ptr], #80]\n"
- "ldr q8, [%[a_ptr], #96]\n"
- "ldr q9, [%[a_ptr], #112]\n"
- "ldr q10, [%[a_ptr], #128]\n"
- "ldr q11, [%[a_ptr], #144]\n"
- "ldr q12, [%[a_ptr], #160]\n"
- "ldr q13, [%[a_ptr], #176]\n"
- "ldr q14, [%[a_ptr], #192]\n"
- "ldr q15, [%[a_ptr], #208]\n"
- "ldr q16, [%[a_ptr], #224]\n"
- "ldr q17, [%[a_ptr], #240]\n"
- "ldr q18, [%[a_ptr], #256]\n"
- "ldr q19, [%[a_ptr], #272]\n"
- "ldr q20, [%[a_ptr], #288]\n"
- "ldr q21, [%[a_ptr], #304]\n"
- "ldr q22, [%[a_ptr], #320]\n"
- "ldr q23, [%[a_ptr], #336]\n"
+ "ldr q2, [%[a_ptr], #0]\n"
+ "ldr q3, [%[a_ptr], #16]\n"
+ "ldr q4, [%[a_ptr], #32]\n"
+ "ldr q5, [%[a_ptr], #48]\n"
+ "ldr q6, [%[a_ptr], #64]\n"
+ "ldr q7, [%[a_ptr], #80]\n"
+ "ldr q8, [%[a_ptr], #96]\n"
+ "ldr q9, [%[a_ptr], #112]\n"
+ "ldr q10, [%[a_ptr], #128]\n"
+ "ldr q11, [%[a_ptr], #144]\n"
+ "ldr q12, [%[a_ptr], #160]\n"
+ "ldr q13, [%[a_ptr], #176]\n"
+ "ldr q14, [%[a_ptr], #192]\n"
+ "ldr q15, [%[a_ptr], #208]\n"
+ "ldr q16, [%[a_ptr], #224]\n"
+ "ldr q17, [%[a_ptr], #240]\n"
+ "ldr q18, [%[a_ptr], #256]\n"
+ "ldr q19, [%[a_ptr], #272]\n"
+ "ldr q20, [%[a_ptr], #288]\n"
+ "ldr q21, [%[a_ptr], #304]\n"
+ "ldr q22, [%[a_ptr], #320]\n"
+ "ldr q23, [%[a_ptr], #336]\n"
ASM_PREFETCH("[%[a_ptr], #384]")
ASM_PREFETCH("[%[a_ptr], #448]")
ASM_PREFETCH("[%[a_ptr], #512]")
@@ -218,305 +218,305 @@ void a64_sgemv_pretransposed(const float *A, int lda, const float *X, float *Y,
ASM_PREFETCH("[%[a_ptr], #1856]")
ASM_PREFETCH("[%[a_ptr], #1920]")
ASM_PREFETCH("[%[a_ptr], #1984]")
- "add %[a_ptr], %[a_ptr], #352\n"
+ "add %[a_ptr], %[a_ptr], #352\n"
- "cbz %w[k], 2f\n"
+ "cbz %w[k], 2f\n"
"1:\n"
// Unroll 0
- "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
- "ldr %q[x0a], [%[x_ptr], #16]\n"
- "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
- "ldr q3, [%[a_ptr], #0]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
- "ldr q4, [%[a_ptr], #16]\n"
- "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
- "ldr q5, [%[a_ptr], #32]\n"
- "add %[x_ptr], %[x_ptr], #32\n"
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "ldr %q[x0a], [%[x_ptr], #16]\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "ldr q3, [%[a_ptr], #0]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "ldr q4, [%[a_ptr], #16]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "ldr q5, [%[a_ptr], #32]\n"
+ "add %[x_ptr], %[x_ptr], #32\n"
ASM_PREFETCH("[%[a_ptr], #1664]")
- "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
- "ldr q6, [%[a_ptr], #48]\n"
- "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
- "ldr q7, [%[a_ptr], #64]\n"
- "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
- "ldr q8, [%[a_ptr], #80]\n"
- "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
- "ldr q9, [%[a_ptr], #96]\n"
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "ldr q6, [%[a_ptr], #48]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "ldr q7, [%[a_ptr], #64]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "ldr q8, [%[a_ptr], #80]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ "ldr q9, [%[a_ptr], #96]\n"
ASM_PREFETCH("[%[a_ptr], #1728]")
// Unroll 1
- "fmla %[r0].4s, v10.4s, %[x0].s[1]\n"
- "ldr q10, [%[a_ptr], #112]\n"
- "fmla %[r1].4s, v11.4s, %[x0].s[1]\n"
- "ldr q11, [%[a_ptr], #128]\n"
- "fmla %[r2].4s, v12.4s, %[x0].s[1]\n"
- "ldr q12, [%[a_ptr], #144]\n"
- "fmla %[r3].4s, v13.4s, %[x0].s[1]\n"
- "ldr q13, [%[a_ptr], #160]\n"
+ "fmla %[r0].4s, v10.4s, %[x0].s[1]\n"
+ "ldr q10, [%[a_ptr], #112]\n"
+ "fmla %[r1].4s, v11.4s, %[x0].s[1]\n"
+ "ldr q11, [%[a_ptr], #128]\n"
+ "fmla %[r2].4s, v12.4s, %[x0].s[1]\n"
+ "ldr q12, [%[a_ptr], #144]\n"
+ "fmla %[r3].4s, v13.4s, %[x0].s[1]\n"
+ "ldr q13, [%[a_ptr], #160]\n"
ASM_PREFETCH("[%[a_ptr], #1792]")
- "fmla %[r4].4s, v14.4s, %[x0].s[1]\n"
- "ldr q14, [%[a_ptr], #176]\n"
- "fmla %[r5].4s, v15.4s, %[x0].s[1]\n"
- "ldr q15, [%[a_ptr], #192]\n"
- "fmla %[r6].4s, v16.4s, %[x0].s[1]\n"
- "ldr q16, [%[a_ptr], #208]\n"
- "fmla %[r7].4s, v17.4s, %[x0].s[1]\n"
- "ldr q17, [%[a_ptr], #224]\n"
+ "fmla %[r4].4s, v14.4s, %[x0].s[1]\n"
+ "ldr q14, [%[a_ptr], #176]\n"
+ "fmla %[r5].4s, v15.4s, %[x0].s[1]\n"
+ "ldr q15, [%[a_ptr], #192]\n"
+ "fmla %[r6].4s, v16.4s, %[x0].s[1]\n"
+ "ldr q16, [%[a_ptr], #208]\n"
+ "fmla %[r7].4s, v17.4s, %[x0].s[1]\n"
+ "ldr q17, [%[a_ptr], #224]\n"
ASM_PREFETCH("[%[a_ptr], #1856]")
// Unroll 2
- "fmla %[r0].4s, v18.4s, %[x0].s[2]\n"
- "ldr q18, [%[a_ptr], #240]\n"
- "fmla %[r1].4s, v19.4s, %[x0].s[2]\n"
- "ldr q19, [%[a_ptr], #256]\n"
- "fmla %[r2].4s, v20.4s, %[x0].s[2]\n"
- "ldr q20, [%[a_ptr], #272]\n"
- "fmla %[r3].4s, v21.4s, %[x0].s[2]\n"
- "ldr q21, [%[a_ptr], #288]\n"
+ "fmla %[r0].4s, v18.4s, %[x0].s[2]\n"
+ "ldr q18, [%[a_ptr], #240]\n"
+ "fmla %[r1].4s, v19.4s, %[x0].s[2]\n"
+ "ldr q19, [%[a_ptr], #256]\n"
+ "fmla %[r2].4s, v20.4s, %[x0].s[2]\n"
+ "ldr q20, [%[a_ptr], #272]\n"
+ "fmla %[r3].4s, v21.4s, %[x0].s[2]\n"
+ "ldr q21, [%[a_ptr], #288]\n"
ASM_PREFETCH("[%[a_ptr], #1920]")
- "fmla %[r4].4s, v22.4s, %[x0].s[2]\n"
- "ldr q22, [%[a_ptr], #304]\n"
- "fmla %[r5].4s, v23.4s, %[x0].s[2]\n"
- "ldr q23, [%[a_ptr], #320]\n"
- "fmla %[r6].4s, v3.4s, %[x0].s[2]\n"
- "ldr q2, [%[a_ptr], #336]\n"
- "ldr q3, [%[a_ptr], #352]\n"
- "fmla %[r7].4s, v4.4s, %[x0].s[2]\n"
- "ldr q4, [%[a_ptr], #368]\n"
+ "fmla %[r4].4s, v22.4s, %[x0].s[2]\n"
+ "ldr q22, [%[a_ptr], #304]\n"
+ "fmla %[r5].4s, v23.4s, %[x0].s[2]\n"
+ "ldr q23, [%[a_ptr], #320]\n"
+ "fmla %[r6].4s, v3.4s, %[x0].s[2]\n"
+ "ldr q2, [%[a_ptr], #336]\n"
+ "ldr q3, [%[a_ptr], #352]\n"
+ "fmla %[r7].4s, v4.4s, %[x0].s[2]\n"
+ "ldr q4, [%[a_ptr], #368]\n"
ASM_PREFETCH("[%[a_ptr], #1984]")
// Unroll 3
- "fmla %[r0].4s, v5.4s, %[x0].s[3]\n"
- "ldr q5, [%[a_ptr], #384]\n"
- "fmla %[r1].4s, v6.4s, %[x0].s[3]\n"
- "ldr q6, [%[a_ptr], #400]\n"
- "fmla %[r2].4s, v7.4s, %[x0].s[3]\n"
- "ldr q7, [%[a_ptr], #416]\n"
- "fmla %[r3].4s, v8.4s, %[x0].s[3]\n"
+ "fmla %[r0].4s, v5.4s, %[x0].s[3]\n"
+ "ldr q5, [%[a_ptr], #384]\n"
+ "fmla %[r1].4s, v6.4s, %[x0].s[3]\n"
+ "ldr q6, [%[a_ptr], #400]\n"
+ "fmla %[r2].4s, v7.4s, %[x0].s[3]\n"
+ "ldr q7, [%[a_ptr], #416]\n"
+ "fmla %[r3].4s, v8.4s, %[x0].s[3]\n"
ASM_PREFETCH("[%[a_ptr], #2048]")
- "ldr q8, [%[a_ptr], #432]\n"
- "fmla %[r4].4s, v9.4s, %[x0].s[3]\n"
- "ldr q9, [%[a_ptr], #448]\n"
- "fmla %[r5].4s, v10.4s, %[x0].s[3]\n"
- "ldr q10, [%[a_ptr], #464]\n"
- "fmla %[r6].4s, v11.4s, %[x0].s[3]\n"
- "ldr q11, [%[a_ptr], #480]\n"
- "fmla %[r7].4s, v12.4s, %[x0].s[3]\n"
- "ldr q12, [%[a_ptr], #496]\n"
+ "ldr q8, [%[a_ptr], #432]\n"
+ "fmla %[r4].4s, v9.4s, %[x0].s[3]\n"
+ "ldr q9, [%[a_ptr], #448]\n"
+ "fmla %[r5].4s, v10.4s, %[x0].s[3]\n"
+ "ldr q10, [%[a_ptr], #464]\n"
+ "fmla %[r6].4s, v11.4s, %[x0].s[3]\n"
+ "ldr q11, [%[a_ptr], #480]\n"
+ "fmla %[r7].4s, v12.4s, %[x0].s[3]\n"
+ "ldr q12, [%[a_ptr], #496]\n"
ASM_PREFETCH("[%[a_ptr], #2112]")
// Unroll 4
- "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n"
- "ldr %q[x0], [%[x_ptr]]\n"
- "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n"
- "ldr q14, [%[a_ptr], #512]\n"
- "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n"
- "ldr q15, [%[a_ptr], #528]\n"
- "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n"
+ "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n"
+ "ldr %q[x0], [%[x_ptr]]\n"
+ "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n"
+ "ldr q14, [%[a_ptr], #512]\n"
+ "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n"
+ "ldr q15, [%[a_ptr], #528]\n"
+ "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n"
ASM_PREFETCH("[%[a_ptr], #2176]")
- "ldr q16, [%[a_ptr], #544]\n"
- "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n"
- "ldr q17, [%[a_ptr], #560]\n"
- "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n"
- "ldr q18, [%[a_ptr], #576]\n"
- "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n"
- "ldr q19, [%[a_ptr], #592]\n"
- "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n"
- "ldr q20, [%[a_ptr], #608]\n"
+ "ldr q16, [%[a_ptr], #544]\n"
+ "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n"
+ "ldr q17, [%[a_ptr], #560]\n"
+ "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n"
+ "ldr q18, [%[a_ptr], #576]\n"
+ "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n"
+ "ldr q19, [%[a_ptr], #592]\n"
+ "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n"
+ "ldr q20, [%[a_ptr], #608]\n"
ASM_PREFETCH("[%[a_ptr], #2240]")
// Unroll 5
- "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n"
- "ldr q21, [%[a_ptr], #624]\n"
- "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n"
- "ldr q22, [%[a_ptr], #640]\n"
- "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n"
- "ldr q23, [%[a_ptr], #656]\n"
- "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n"
- "ldr q2, [%[a_ptr], #672]\n"
+ "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n"
+ "ldr q21, [%[a_ptr], #624]\n"
+ "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n"
+ "ldr q22, [%[a_ptr], #640]\n"
+ "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n"
+ "ldr q23, [%[a_ptr], #656]\n"
+ "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n"
+ "ldr q2, [%[a_ptr], #672]\n"
ASM_PREFETCH("[%[a_ptr], #2304]")
- "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n"
- "ldr q3, [%[a_ptr], #688]\n"
- "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n"
- "ldr q4, [%[a_ptr], #704]\n"
- "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n"
- "ldr q5, [%[a_ptr], #720]\n"
- "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n"
- "ldr q6, [%[a_ptr], #736]\n"
+ "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n"
+ "ldr q3, [%[a_ptr], #688]\n"
+ "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n"
+ "ldr q4, [%[a_ptr], #704]\n"
+ "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n"
+ "ldr q5, [%[a_ptr], #720]\n"
+ "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n"
+ "ldr q6, [%[a_ptr], #736]\n"
ASM_PREFETCH("[%[a_ptr], #2368]")
// Unroll 6
- "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n"
- "ldr q7, [%[a_ptr], #752]\n"
- "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n"
- "ldr q8, [%[a_ptr], #768]\n"
- "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n"
- "ldr q9, [%[a_ptr], #784]\n"
- "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n"
- "ldr q10, [%[a_ptr], #800]\n"
+ "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n"
+ "ldr q7, [%[a_ptr], #752]\n"
+ "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n"
+ "ldr q8, [%[a_ptr], #768]\n"
+ "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n"
+ "ldr q9, [%[a_ptr], #784]\n"
+ "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n"
+ "ldr q10, [%[a_ptr], #800]\n"
ASM_PREFETCH("[%[a_ptr], #2432]")
- "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n"
- "ldr q11, [%[a_ptr], #816]\n"
- "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n"
- "ldr q12, [%[a_ptr], #832]\n"
- "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n"
- "ldr q13, [%[a_ptr], #848]\n"
- "ldr q14, [%[a_ptr], #864]\n"
- "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n"
- "ldr q15, [%[a_ptr], #880]\n"
+ "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n"
+ "ldr q11, [%[a_ptr], #816]\n"
+ "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n"
+ "ldr q12, [%[a_ptr], #832]\n"
+ "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n"
+ "ldr q13, [%[a_ptr], #848]\n"
+ "ldr q14, [%[a_ptr], #864]\n"
+ "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n"
+ "ldr q15, [%[a_ptr], #880]\n"
ASM_PREFETCH("[%[a_ptr], #2496]")
// Unroll 7
- "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n"
- "ldr q16, [%[a_ptr], #896]\n"
- "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n"
- "ldr q17, [%[a_ptr], #912]\n"
- "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n"
- "ldr q18, [%[a_ptr], #928]\n"
- "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n"
+ "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n"
+ "ldr q16, [%[a_ptr], #896]\n"
+ "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n"
+ "ldr q17, [%[a_ptr], #912]\n"
+ "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n"
+ "ldr q18, [%[a_ptr], #928]\n"
+ "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n"
ASM_PREFETCH("[%[a_ptr], #2560]")
- "ldr q19, [%[a_ptr], #944]\n"
- "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n"
- "ldr q20, [%[a_ptr], #960]\n"
- "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n"
- "ldr q21, [%[a_ptr], #976]\n"
- "add %[a_ptr], %[a_ptr], #1024\n"
- "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n"
- "ldr q22, [%[a_ptr], #-32]\n"
- "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n"
- "ldr q23, [%[a_ptr], #-16]\n"
+ "ldr q19, [%[a_ptr], #944]\n"
+ "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n"
+ "ldr q20, [%[a_ptr], #960]\n"
+ "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n"
+ "ldr q21, [%[a_ptr], #976]\n"
+ "add %[a_ptr], %[a_ptr], #1024\n"
+ "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n"
+ "ldr q22, [%[a_ptr], #-32]\n"
+ "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n"
+ "ldr q23, [%[a_ptr], #-16]\n"
ASM_PREFETCH("[%[a_ptr], #1600]")
- "bne 1b\n"
+ "bne 1b\n"
// Detached final iteration
"2:\n"
// Unroll 0
- "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
- "ldr %q[x0a], [%[x_ptr], #16]\n"
- "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
- "ldr q3, [%[a_ptr], #0]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
- "ldr q4, [%[a_ptr], #16]\n"
- "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
- "ldr q5, [%[a_ptr], #32]\n"
- "add %[x_ptr], %[x_ptr], #32\n"
- "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
- "ldr q6, [%[a_ptr], #48]\n"
- "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
- "ldr q7, [%[a_ptr], #64]\n"
- "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
- "ldr q8, [%[a_ptr], #80]\n"
- "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
- "ldr q9, [%[a_ptr], #96]\n"
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "ldr %q[x0a], [%[x_ptr], #16]\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "ldr q3, [%[a_ptr], #0]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "ldr q4, [%[a_ptr], #16]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "ldr q5, [%[a_ptr], #32]\n"
+ "add %[x_ptr], %[x_ptr], #32\n"
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "ldr q6, [%[a_ptr], #48]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "ldr q7, [%[a_ptr], #64]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "ldr q8, [%[a_ptr], #80]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ "ldr q9, [%[a_ptr], #96]\n"
// Unroll 1
- "fmla %[r0].4s, v10.4s, %[x0].s[1]\n"
- "ldr q10, [%[a_ptr], #112]\n"
- "fmla %[r1].4s, v11.4s, %[x0].s[1]\n"
- "ldr q11, [%[a_ptr], #128]\n"
- "fmla %[r2].4s, v12.4s, %[x0].s[1]\n"
- "ldr q12, [%[a_ptr], #144]\n"
- "fmla %[r3].4s, v13.4s, %[x0].s[1]\n"
- "ldr q13, [%[a_ptr], #160]\n"
- "fmla %[r4].4s, v14.4s, %[x0].s[1]\n"
- "ldr q14, [%[a_ptr], #176]\n"
- "fmla %[r5].4s, v15.4s, %[x0].s[1]\n"
- "ldr q15, [%[a_ptr], #192]\n"
- "fmla %[r6].4s, v16.4s, %[x0].s[1]\n"
- "ldr q16, [%[a_ptr], #208]\n"
- "fmla %[r7].4s, v17.4s, %[x0].s[1]\n"
- "ldr q17, [%[a_ptr], #224]\n"
+ "fmla %[r0].4s, v10.4s, %[x0].s[1]\n"
+ "ldr q10, [%[a_ptr], #112]\n"
+ "fmla %[r1].4s, v11.4s, %[x0].s[1]\n"
+ "ldr q11, [%[a_ptr], #128]\n"
+ "fmla %[r2].4s, v12.4s, %[x0].s[1]\n"
+ "ldr q12, [%[a_ptr], #144]\n"
+ "fmla %[r3].4s, v13.4s, %[x0].s[1]\n"
+ "ldr q13, [%[a_ptr], #160]\n"
+ "fmla %[r4].4s, v14.4s, %[x0].s[1]\n"
+ "ldr q14, [%[a_ptr], #176]\n"
+ "fmla %[r5].4s, v15.4s, %[x0].s[1]\n"
+ "ldr q15, [%[a_ptr], #192]\n"
+ "fmla %[r6].4s, v16.4s, %[x0].s[1]\n"
+ "ldr q16, [%[a_ptr], #208]\n"
+ "fmla %[r7].4s, v17.4s, %[x0].s[1]\n"
+ "ldr q17, [%[a_ptr], #224]\n"
// Unroll 2
- "fmla %[r0].4s, v18.4s, %[x0].s[2]\n"
- "ldr q18, [%[a_ptr], #240]\n"
- "fmla %[r1].4s, v19.4s, %[x0].s[2]\n"
- "ldr q19, [%[a_ptr], #256]\n"
- "fmla %[r2].4s, v20.4s, %[x0].s[2]\n"
- "ldr q20, [%[a_ptr], #272]\n"
- "fmla %[r3].4s, v21.4s, %[x0].s[2]\n"
- "ldr q21, [%[a_ptr], #288]\n"
- "fmla %[r4].4s, v22.4s, %[x0].s[2]\n"
- "ldr q22, [%[a_ptr], #304]\n"
- "fmla %[r5].4s, v23.4s, %[x0].s[2]\n"
- "ldr q23, [%[a_ptr], #320]\n"
- "fmla %[r6].4s, v3.4s, %[x0].s[2]\n"
- "ldr q2, [%[a_ptr], #336]\n"
- "ldr q3, [%[a_ptr], #352]\n"
- "fmla %[r7].4s, v4.4s, %[x0].s[2]\n"
- "ldr q4, [%[a_ptr], #368]\n"
+ "fmla %[r0].4s, v18.4s, %[x0].s[2]\n"
+ "ldr q18, [%[a_ptr], #240]\n"
+ "fmla %[r1].4s, v19.4s, %[x0].s[2]\n"
+ "ldr q19, [%[a_ptr], #256]\n"
+ "fmla %[r2].4s, v20.4s, %[x0].s[2]\n"
+ "ldr q20, [%[a_ptr], #272]\n"
+ "fmla %[r3].4s, v21.4s, %[x0].s[2]\n"
+ "ldr q21, [%[a_ptr], #288]\n"
+ "fmla %[r4].4s, v22.4s, %[x0].s[2]\n"
+ "ldr q22, [%[a_ptr], #304]\n"
+ "fmla %[r5].4s, v23.4s, %[x0].s[2]\n"
+ "ldr q23, [%[a_ptr], #320]\n"
+ "fmla %[r6].4s, v3.4s, %[x0].s[2]\n"
+ "ldr q2, [%[a_ptr], #336]\n"
+ "ldr q3, [%[a_ptr], #352]\n"
+ "fmla %[r7].4s, v4.4s, %[x0].s[2]\n"
+ "ldr q4, [%[a_ptr], #368]\n"
// Unroll 3
- "fmla %[r0].4s, v5.4s, %[x0].s[3]\n"
- "ldr q5, [%[a_ptr], #384]\n"
- "fmla %[r1].4s, v6.4s, %[x0].s[3]\n"
- "ldr q6, [%[a_ptr], #400]\n"
- "fmla %[r2].4s, v7.4s, %[x0].s[3]\n"
- "ldr q7, [%[a_ptr], #416]\n"
- "fmla %[r3].4s, v8.4s, %[x0].s[3]\n"
- "ldr q8, [%[a_ptr], #432]\n"
- "fmla %[r4].4s, v9.4s, %[x0].s[3]\n"
- "ldr q9, [%[a_ptr], #448]\n"
- "fmla %[r5].4s, v10.4s, %[x0].s[3]\n"
- "ldr q10, [%[a_ptr], #464]\n"
- "fmla %[r6].4s, v11.4s, %[x0].s[3]\n"
- "ldr q11, [%[a_ptr], #480]\n"
- "fmla %[r7].4s, v12.4s, %[x0].s[3]\n"
- "ldr q12, [%[a_ptr], #496]\n"
+ "fmla %[r0].4s, v5.4s, %[x0].s[3]\n"
+ "ldr q5, [%[a_ptr], #384]\n"
+ "fmla %[r1].4s, v6.4s, %[x0].s[3]\n"
+ "ldr q6, [%[a_ptr], #400]\n"
+ "fmla %[r2].4s, v7.4s, %[x0].s[3]\n"
+ "ldr q7, [%[a_ptr], #416]\n"
+ "fmla %[r3].4s, v8.4s, %[x0].s[3]\n"
+ "ldr q8, [%[a_ptr], #432]\n"
+ "fmla %[r4].4s, v9.4s, %[x0].s[3]\n"
+ "ldr q9, [%[a_ptr], #448]\n"
+ "fmla %[r5].4s, v10.4s, %[x0].s[3]\n"
+ "ldr q10, [%[a_ptr], #464]\n"
+ "fmla %[r6].4s, v11.4s, %[x0].s[3]\n"
+ "ldr q11, [%[a_ptr], #480]\n"
+ "fmla %[r7].4s, v12.4s, %[x0].s[3]\n"
+ "ldr q12, [%[a_ptr], #496]\n"
// Unroll 4
- "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n"
- "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n"
- "ldr q14, [%[a_ptr], #512]\n"
- "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n"
- "ldr q15, [%[a_ptr], #528]\n"
- "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n"
- "ldr q16, [%[a_ptr], #544]\n"
- "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n"
- "ldr q17, [%[a_ptr], #560]\n"
- "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n"
- "ldr q18, [%[a_ptr], #576]\n"
- "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n"
- "ldr q19, [%[a_ptr], #592]\n"
- "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n"
- "ldr q20, [%[a_ptr], #608]\n"
+ "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n"
+ "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n"
+ "ldr q14, [%[a_ptr], #512]\n"
+ "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n"
+ "ldr q15, [%[a_ptr], #528]\n"
+ "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n"
+ "ldr q16, [%[a_ptr], #544]\n"
+ "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n"
+ "ldr q17, [%[a_ptr], #560]\n"
+ "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n"
+ "ldr q18, [%[a_ptr], #576]\n"
+ "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n"
+ "ldr q19, [%[a_ptr], #592]\n"
+ "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n"
+ "ldr q20, [%[a_ptr], #608]\n"
// Unroll 5
- "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n"
- "ldr q21, [%[a_ptr], #624]\n"
- "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n"
- "ldr q22, [%[a_ptr], #640]\n"
- "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n"
- "ldr q23, [%[a_ptr], #656]\n"
- "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #672\n"
- "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n"
- "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n"
- "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n"
- "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n"
+ "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n"
+ "ldr q21, [%[a_ptr], #624]\n"
+ "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n"
+ "ldr q22, [%[a_ptr], #640]\n"
+ "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n"
+ "ldr q23, [%[a_ptr], #656]\n"
+ "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #672\n"
+ "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n"
+ "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n"
+ "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n"
+ "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n"
// Unroll 6
- "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n"
- "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n"
- "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n"
- "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n"
- "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n"
- "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n"
- "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n"
- "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n"
+ "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n"
+ "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n"
+ "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n"
+ "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n"
+ "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n"
+ "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n"
+ "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n"
+ "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n"
// Unroll 7
- "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n"
- "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n"
- "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n"
- "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n"
- "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n"
- "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n"
- "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n"
- "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n"
+ "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n"
+ "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n"
+ "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n"
+ "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n"
+ "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n"
+ "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n"
+ "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n"
+ "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n"
:
[a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr),
[x0] "+w" (x0), [x0a] "+w" (x0a), [k] "+r" (k),
@@ -532,53 +532,53 @@ void a64_sgemv_pretransposed(const float *A, int lda, const float *X, float *Y,
int l=(M%8)-1;
__asm __volatile (
- "ldr q2, [%[a_ptr], #0]\n"
- "ldr q3, [%[a_ptr], #16]\n"
- "ldr q4, [%[a_ptr], #32]\n"
- "ldr q5, [%[a_ptr], #48]\n"
- "ldr q6, [%[a_ptr], #64]\n"
- "ldr q7, [%[a_ptr], #80]\n"
- "ldr q8, [%[a_ptr], #96]\n"
- "ldr q9, [%[a_ptr], #112]\n"
- "ldr %s[x0], [%[x_ptr]]\n"
- "add %[a_ptr], %[a_ptr], #128\n"
- "add %[x_ptr], %[x_ptr], #4\n"
-
- "cbz %w[l], 2f\n"
+ "ldr q2, [%[a_ptr], #0]\n"
+ "ldr q3, [%[a_ptr], #16]\n"
+ "ldr q4, [%[a_ptr], #32]\n"
+ "ldr q5, [%[a_ptr], #48]\n"
+ "ldr q6, [%[a_ptr], #64]\n"
+ "ldr q7, [%[a_ptr], #80]\n"
+ "ldr q8, [%[a_ptr], #96]\n"
+ "ldr q9, [%[a_ptr], #112]\n"
+ "ldr %s[x0], [%[x_ptr]]\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "add %[x_ptr], %[x_ptr], #4\n"
+
+ "cbz %w[l], 2f\n"
"1:\n"
- "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
- "ldr q2, [%[a_ptr], #0]\n"
- "subs %w[l], %w[l], #1\n"
- "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
- "ldr q3, [%[a_ptr], #16]\n"
- "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
- "ldr q4, [%[a_ptr], #32]\n"
- "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
- "ldr q5, [%[a_ptr], #48]\n"
- "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
- "ldr q6, [%[a_ptr], #64]\n"
- "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
- "ldr q7, [%[a_ptr], #80]\n"
- "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
- "ldr q8, [%[a_ptr], #96]\n"
- "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
- "ldr q9, [%[a_ptr], #112]\n"
- "ldr %s[x0], [%[x_ptr]]\n"
- "add %[a_ptr], %[a_ptr], #128\n"
- "add %[x_ptr], %[x_ptr], #4\n"
- "bne 1b\n"
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "ldr q2, [%[a_ptr], #0]\n"
+ "subs %w[l], %w[l], #1\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "ldr q3, [%[a_ptr], #16]\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "ldr q4, [%[a_ptr], #32]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "ldr q5, [%[a_ptr], #48]\n"
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "ldr q6, [%[a_ptr], #64]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "ldr q7, [%[a_ptr], #80]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "ldr q8, [%[a_ptr], #96]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ "ldr q9, [%[a_ptr], #112]\n"
+ "ldr %s[x0], [%[x_ptr]]\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "add %[x_ptr], %[x_ptr], #4\n"
+ "bne 1b\n"
"2:\n"
- "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
- "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
- "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
- "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
- "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
- "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
- "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
- "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
:
[a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr),
[x0] "+w" (x0), [l] "+r" (l),
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp
index b8b93bf31f..d072470939 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp
@@ -72,7 +72,7 @@ public:
return true;
}
- StdTransformsFixed<operand_type, operand_type, result_type, 6, 4, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 6, 4, 1> transforms = {};
// Default to the generic kernel
kern_type kernel=a64_smallK_hybrid_fp32_mla_6x4;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
index 72f517fe35..94312be08a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
@@ -72,7 +72,7 @@ public:
return true;
}
- StdTransformsFixed<operand_type, operand_type, result_type, 8, 4, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
// Default to the generic kernel
kern_type kernel=a64_smallK_hybrid_fp32_mla_8x4;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
index 6fdca066b1..d244aecc70 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
@@ -73,7 +73,7 @@ public:
return false;
}
- StdTransformsFixed<operand_type, operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 6, 4, 4> transforms = {};
// Default to the generic kernel
kern_type kernel=a64_smallK_hybrid_s8s32_dot_6x4;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp
index 27d3e2310c..85583c46cf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp
@@ -73,7 +73,7 @@ public:
return false;
}
- StdTransformsFixed<operand_type, operand_type, result_type, 8, 4, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 4, 4> transforms = {};
// Default to the generic kernel
kern_type kernel=a64_smallK_hybrid_s8s32_dot_8x4;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp
index 082ca66e01..c474b9c1f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp
@@ -73,7 +73,7 @@ public:
return false;
}
- StdTransformsFixed<operand_type, operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 6, 4, 4> transforms = {};
// Default to the generic kernel
kern_type kernel=a64_smallK_hybrid_u8u32_dot_6x4;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp
index 866b97a316..65a2281638 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp
@@ -73,7 +73,7 @@ public:
return false;
}
- StdTransformsFixed<operand_type, operand_type, result_type, 8, 4, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 4, 4> transforms = {};
// Default to the generic kernel
kern_type kernel=a64_smallK_hybrid_u8u32_dot_8x4;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
index b102e1dea4..5c9a465817 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
@@ -64,21 +64,21 @@ void sme2_gemv_bf16fp32_dot_16VL (
__asm__ __volatile__(
"ptrue p8.b\n"
".inst 0xd503477f // SMSTART ZA\n"
- "mov x9, #0x0\n"
"cntw x28, ALL, MUL #4\n"
- "mov x27, %x[B_ptr]\n"
- "add x26, %x[N], x28\n"
- "mov x25, %x[output_ptr]\n"
- "sub x26, x26, #0x1\n"
- "ptrue p1.b\n"
- "udiv x26, x26, x28\n"
- ".inst 0x25207811 // ptrue pn9.b\n"
- "add x22, x26, #0x3\n"
- "mov x21, #0x1\n"
+ "add x27, %x[N], x28\n"
+ "sub x27, x27, #0x1\n"
+ "udiv x27, x27, x28\n"
+ "add x22, x27, #0x3\n"
"and x22, x22, #0xfffffffffffffffc\n"
"mul x22, x22, x28\n"
"mul x22, x22, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x26, %x[B_ptr]\n"
+ "mov x25, %x[output_ptr]\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
"lsl x22, x22, #0x1\n"
+ "mov x21, #0x1\n"
"1:" // RHS size check loop
"cmp x22, #0x200000\n"
"blt 2f\n"
@@ -92,13 +92,13 @@ void sme2_gemv_bf16fp32_dot_16VL (
"lsl x21, x21, #0x16\n"
"orr x22, x22, x20\n"
"orr x22, x22, x21\n"
- ".inst 0xf8b64b7a // rprfm pldonce, x22, [x27]\n"
+ ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
"3:" // RHS prefetch exit
"mov x24, %x[bias]\n"
"4:" // Column loop
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"bge 28f\n"
- "cmp x26, #0x2\n"
+ "cmp x27, #0x2\n"
"bgt 20f\n"
"beq 12f\n"
"mov x23, %x[A_ptr]\n"
@@ -108,8 +108,8 @@ void sme2_gemv_bf16fp32_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 5f\n"
- ".inst 0xa040c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
"b 6f\n"
"5:" // Width 1: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -118,59 +118,59 @@ void sme2_gemv_bf16fp32_dot_16VL (
"ble 8f\n"
"7:" // Width 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "ld1rqh { z8.h }, p0/Z, [x23]\n"
"sub x22, x22, #0x8\n"
- "add x23, x23, #0x10\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b298 // bfdot za.s[x9, 0], { z20.h-z23.h }, z8.h[0]\n"
+ "addvl x26, x26, #16\n"
"cmp x22, #0x8\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc152b398 // bfdot za.s[x9, 0], { z28.h-z31.h }, z2.h[0]\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc152b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z2.h[1]\n"
- ".inst 0xc152bb18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z2.h[2]\n"
- ".inst 0xc152bd98 // bfdot za.s[x9, 0], { z12.h-z15.h }, z2.h[3]\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b498 // bfdot za.s[x9, 0], { z4.h-z7.h }, z8.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158bb98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z8.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158bf18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z8.h[3]\n"
+ "addvl x26, x26, #16\n"
"bgt 7b\n"
"8:" // Width 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
"subs x22, x22, #0x2\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z3.h[0]\n"
+ ".inst 0xc15bb398 // bfdot za.s[x9, 0], { z28.h-z31.h }, z11.h[0]\n"
+ "addvl x26, x26, #16\n"
"ble 9f\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b498 // bfdot za.s[x9, 0], { z4.h-z7.h }, z3.h[1]\n"
+ ".inst 0xc15bb598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n"
+ "addvl x26, x26, #16\n"
"ble 9f\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z3.h[2]\n"
+ ".inst 0xc15bba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[2]\n"
+ "addvl x26, x26, #16\n"
"ble 9f\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153bd18 // bfdot za.s[x9, 0], { z8.h-z11.h }, z3.h[3]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bbc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z11.h[3]\n"
+ "addvl x26, x26, #16\n"
"9:" // Width 1: Multiply loop: multiply skip
"tbz %x[flags], #1, 10f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
- "ld1rw { z23.s }, p1/Z, [x21]\n"
- "ld1rw { z22.s }, p1/Z, [x20]\n"
- ".inst 0xc1b6cae0 // fclamp { z0.s-z3.s }, z23.s, z22.s\n"
- ".inst 0xa060c320 // st1w { z0.s-z3.s }, p8, [x25]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z3.s }, p1/Z, [x21]\n"
+ "ld1rw { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc1bdc868 // fclamp { z8.s-z11.s }, z3.s, z29.s\n"
+ ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"b 11f\n"
"10:" // Width 1: No activation
- ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
- ".inst 0xa060c320 // st1w { z0.s-z3.s }, p8, [x25]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c32c // st1w { z12.s-z15.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"11:" // Width 1: Output done
"b 36f\n"
@@ -182,10 +182,10 @@ void sme2_gemv_bf16fp32_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 13f\n"
- ".inst 0xa040c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24]\n"
- ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042f80 // mova za.d[x9, #0], { z28.d-z31.d }\n"
- ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
"b 14f\n"
"13:" // Width 2: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -194,80 +194,80 @@ void sme2_gemv_bf16fp32_dot_16VL (
"ble 16f\n"
"15:" // Width 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
+ "ld1rqh { z9.h }, p0/Z, [x23]\n"
"sub x22, x22, #0x8\n"
- "ld1rqh { z1.h }, p0/Z, [x23]\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z9.h[0]\n"
"cmp x22, #0x8\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc151b398 // bfdot za.s[x9, 0], { z28.h-z31.h }, z1.h[0]\n"
- ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc151b199 // bfdot za.s[x9, 1], { z12.h-z15.h }, z1.h[0]\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc151b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z1.h[1]\n"
- ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc151b719 // bfdot za.s[x9, 1], { z24.h-z27.h }, z1.h[1]\n"
- ".inst 0xc151bb98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z1.h[2]\n"
- ".inst 0xc151b999 // bfdot za.s[x9, 1], { z12.h-z15.h }, z1.h[2]\n"
- ".inst 0xc151bd18 // bfdot za.s[x9, 0], { z8.h-z11.h }, z1.h[3]\n"
- ".inst 0xc151be99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z1.h[3]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159b099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z9.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159b598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z9.h[1]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z9.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159bb18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z9.h[2]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159b819 // bfdot za.s[x9, 1], { z0.h-z3.h }, z9.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159bc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z9.h[3]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159bf99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z9.h[3]\n"
+ "addvl x26, x26, #16\n"
"bgt 15b\n"
"16:" // Width 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
"subs x22, x22, #0x2\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z3.h[0]\n"
- ".inst 0xc153b399 // bfdot za.s[x9, 1], { z28.h-z31.h }, z3.h[0]\n"
+ ".inst 0xc15bb198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[0]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n"
+ "addvl x26, x26, #16\n"
"ble 17f\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z3.h[1]\n"
- ".inst 0xc153b619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z3.h[1]\n"
+ ".inst 0xc15bb718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z11.h[1]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb419 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[1]\n"
+ "addvl x26, x26, #16\n"
"ble 17f\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z3.h[2]\n"
- ".inst 0xc153ba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z3.h[2]\n"
+ ".inst 0xc15bb998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[2]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbb99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z11.h[2]\n"
+ "addvl x26, x26, #16\n"
"ble 17f\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z3.h[3]\n"
- ".inst 0xc153bd99 // bfdot za.s[x9, 1], { z12.h-z15.h }, z3.h[3]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[3]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbe99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[3]\n"
+ "addvl x26, x26, #16\n"
"17:" // Width 2: Multiply loop: multiply skip
"tbz %x[flags], #1, 18f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n"
- "ld1rw { z17.s }, p1/Z, [x21]\n"
- "ld1rw { z23.s }, p1/Z, [x20]\n"
- ".inst 0xc1b7ca24 // fclamp { z4.s-z7.s }, z17.s, z23.s\n"
- ".inst 0xc1b7ca28 // fclamp { z8.s-z11.s }, z17.s, z23.s\n"
- ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
- ".inst 0xa061c328 // st1w { z8.s-z11.s }, p8, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ "ld1rw { z9.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ "ld1rw { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1a8c920 // fclamp { z0.s-z3.s }, z9.s, z8.s\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc1a8c924 // fclamp { z4.s-z7.s }, z9.s, z8.s\n"
+ ".inst 0xa061c324 // st1w { z4.s-z7.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"b 19f\n"
"18:" // Width 2: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xa061c330 // st1w { z16.s-z19.s }, p8, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"19:" // Width 2: Output done
"b 36f\n"
@@ -280,12 +280,12 @@ void sme2_gemv_bf16fp32_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 21f\n"
- ".inst 0xa040c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24]\n"
- ".inst 0xa041c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042d00 // mova za.d[x9, #0], { z8.d-z11.d }\n"
- ".inst 0xc0042c01 // mova za.d[x9, #1], { z0.d-z3.d }\n"
- ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
+ ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
"b 22f\n"
"21:" // Width 3: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -294,101 +294,101 @@ void sme2_gemv_bf16fp32_dot_16VL (
"ble 24f\n"
"23:" // Width 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ "ld1rqh { z15.h }, p0/Z, [x23]\n"
"sub x22, x22, #0x8\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fb018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z15.h[0]\n"
"cmp x22, #0x8\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z3.h[0]\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc153b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z3.h[0]\n"
- ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153b09a // bfdot za.s[x9, 2], { z4.h-z7.h }, z3.h[0]\n"
- ".inst 0xa042a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc153b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z3.h[1]\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153b699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z3.h[1]\n"
- ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z3.h[1]\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153b998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z3.h[2]\n"
- ".inst 0xa042a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153bb99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z3.h[2]\n"
- ".inst 0xc153b91a // bfdot za.s[x9, 2], { z8.h-z11.h }, z3.h[2]\n"
- ".inst 0xc153bc98 // bfdot za.s[x9, 0], { z4.h-z7.h }, z3.h[3]\n"
- ".inst 0xc153be99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z3.h[3]\n"
- ".inst 0xc153bf1a // bfdot za.s[x9, 2], { z24.h-z27.h }, z3.h[3]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[0]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z15.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fb698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb51a // bfdot za.s[x9, 2], { z8.h-z11.h }, z15.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbb18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[2]\n"
+ ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb919 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[2]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fbe19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbd1a // bfdot za.s[x9, 2], { z8.h-z11.h }, z15.h[3]\n"
+ "addvl x26, x26, #16\n"
"bgt 23b\n"
"24:" // Width 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
"subs x22, x22, #0x2\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b298 // bfdot za.s[x9, 0], { z20.h-z23.h }, z3.h[0]\n"
- ".inst 0xc153b199 // bfdot za.s[x9, 1], { z12.h-z15.h }, z3.h[0]\n"
- ".inst 0xc153b09a // bfdot za.s[x9, 2], { z4.h-z7.h }, z3.h[0]\n"
+ ".inst 0xc15bb398 // bfdot za.s[x9, 0], { z28.h-z31.h }, z11.h[0]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n"
+ ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z11.h[0]\n"
+ "addvl x26, x26, #16\n"
"ble 25f\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z3.h[1]\n"
- ".inst 0xc153b519 // bfdot za.s[x9, 1], { z8.h-z11.h }, z3.h[1]\n"
- ".inst 0xc153b61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z3.h[1]\n"
+ ".inst 0xc15bb598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z11.h[1]\n"
+ ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb79a // bfdot za.s[x9, 2], { z28.h-z31.h }, z11.h[1]\n"
+ "addvl x26, x26, #16\n"
"ble 25f\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153bb98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z3.h[2]\n"
- ".inst 0xc153bb19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z3.h[2]\n"
- ".inst 0xc153b99a // bfdot za.s[x9, 2], { z12.h-z15.h }, z3.h[2]\n"
+ ".inst 0xc15bb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z11.h[2]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[2]\n"
+ ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb99a // bfdot za.s[x9, 2], { z12.h-z15.h }, z11.h[2]\n"
+ "addvl x26, x26, #16\n"
"ble 25f\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153bd18 // bfdot za.s[x9, 0], { z8.h-z11.h }, z3.h[3]\n"
- ".inst 0xc153bf99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z3.h[3]\n"
- ".inst 0xc153bd9a // bfdot za.s[x9, 2], { z12.h-z15.h }, z3.h[3]\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bbd98 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[3]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbe99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[3]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[3]\n"
+ "addvl x26, x26, #16\n"
"25:" // Width 3: Multiply loop: multiply skip
"tbz %x[flags], #1, 26f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- "ld1rw { z21.s }, p1/Z, [x21]\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- "ld1rw { z20.s }, p1/Z, [x20]\n"
- ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+ "ld1rw { z17.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
+ ".inst 0xa061c728 // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"b 27f\n"
"26:" // Width 3: No activation
- ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+ ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc0062c5c // mova { z28.d-z31.d }, za.d[x9, #2]\n"
- ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062c33c // st1w { z28.s-z31.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"27:" // Width 3: Output done
"b 36f\n"
@@ -402,14 +402,14 @@ void sme2_gemv_bf16fp32_dot_16VL (
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 29f\n"
".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
- ".inst 0xa041c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xa042c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
- "addvl x24, x24, #16\n"
- ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
- ".inst 0xc0042f82 // mova za.d[x9, #2], { z28.d-z31.d }\n"
+ ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042d82 // mova za.d[x9, #2], { z12.d-z15.d }\n"
+ ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x24, x24, #16\n"
"b 30f\n"
"29:" // Width 4: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -418,125 +418,125 @@ void sme2_gemv_bf16fp32_dot_16VL (
"ble 32f\n"
"31:" // Width 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ "ld1rqh { z8.h }, p0/Z, [x23]\n"
"sub x22, x22, #0x8\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z8.h[0]\n"
"cmp x22, #0x8\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z3.h[0]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b299 // bfdot za.s[x9, 1], { z20.h-z23.h }, z3.h[0]\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc153b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z3.h[0]\n"
- ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153b11b // bfdot za.s[x9, 3], { z8.h-z11.h }, z3.h[0]\n"
- ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153b718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z3.h[1]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b599 // bfdot za.s[x9, 1], { z12.h-z15.h }, z3.h[1]\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc153b51a // bfdot za.s[x9, 2], { z8.h-z11.h }, z3.h[1]\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153b49b // bfdot za.s[x9, 3], { z4.h-z7.h }, z3.h[1]\n"
- ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153bb18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z3.h[2]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b919 // bfdot za.s[x9, 1], { z8.h-z11.h }, z3.h[2]\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc153ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z3.h[2]\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153b89b // bfdot za.s[x9, 3], { z4.h-z7.h }, z3.h[2]\n"
- ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153bd18 // bfdot za.s[x9, 0], { z8.h-z11.h }, z3.h[3]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153bf99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z3.h[3]\n"
- ".inst 0xc153bd9a // bfdot za.s[x9, 2], { z12.h-z15.h }, z3.h[3]\n"
- ".inst 0xc153be9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z3.h[3]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158b199 // bfdot za.s[x9, 1], { z12.h-z15.h }, z8.h[0]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[0]\n"
+ ".inst 0xa043a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158b19b // bfdot za.s[x9, 3], { z12.h-z15.h }, z8.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z8.h[1]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158b699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z8.h[1]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158b61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[1]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158b69b // bfdot za.s[x9, 3], { z20.h-z23.h }, z8.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z8.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158ba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z8.h[2]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[2]\n"
+ ".inst 0xa043a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158b81b // bfdot za.s[x9, 3], { z0.h-z3.h }, z8.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158be98 // bfdot za.s[x9, 0], { z20.h-z23.h }, z8.h[3]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158be19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z8.h[3]\n"
+ ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158bc9a // bfdot za.s[x9, 2], { z4.h-z7.h }, z8.h[3]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158be9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z8.h[3]\n"
+ "addvl x26, x26, #16\n"
"bgt 31b\n"
"32:" // Width 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
"subs x22, x22, #0x2\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153b218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z3.h[0]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b199 // bfdot za.s[x9, 1], { z12.h-z15.h }, z3.h[0]\n"
- ".inst 0xc153b39a // bfdot za.s[x9, 2], { z28.h-z31.h }, z3.h[0]\n"
- ".inst 0xc153b29b // bfdot za.s[x9, 3], { z20.h-z23.h }, z3.h[0]\n"
+ ".inst 0xc15bb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[0]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb299 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[0]\n"
+ ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb39a // bfdot za.s[x9, 2], { z28.h-z31.h }, z11.h[0]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bb21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[0]\n"
+ "addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153b598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z3.h[1]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z3.h[1]\n"
- ".inst 0xc153b71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z3.h[1]\n"
- ".inst 0xc153b69b // bfdot za.s[x9, 3], { z20.h-z23.h }, z3.h[1]\n"
+ ".inst 0xc15bb418 // bfdot za.s[x9, 0], { z0.h-z3.h }, z11.h[1]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z11.h[1]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[1]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bb61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[1]\n"
+ "addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153b998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z3.h[2]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153b919 // bfdot za.s[x9, 1], { z8.h-z11.h }, z3.h[2]\n"
- ".inst 0xc153ba9a // bfdot za.s[x9, 2], { z20.h-z23.h }, z3.h[2]\n"
- ".inst 0xc153ba1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z3.h[2]\n"
+ ".inst 0xc15bba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z11.h[2]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[2]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bba1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[2]\n"
+ "addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153bd98 // bfdot za.s[x9, 0], { z12.h-z15.h }, z3.h[3]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z3.h[3]\n"
- ".inst 0xc153be9a // bfdot za.s[x9, 2], { z20.h-z23.h }, z3.h[3]\n"
- ".inst 0xc153be1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z3.h[3]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[3]\n"
+ ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z11.h[3]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[3]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bbe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[3]\n"
+ "addvl x26, x26, #16\n"
"33:" // Width 4: Multiply loop: multiply skip
"tbz %x[flags], #1, 34f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
"ld1rw { z21.s }, p1/Z, [x21]\n"
- ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
"ld1rw { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n"
- ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
- ".inst 0xa061c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x8, MUL VL]\n"
".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"b 35f\n"
"34:" // Width 4: No activation
".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
- ".inst 0xa061c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
".inst 0xa063c324 // st1w { z4.s-z7.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"35:" // Width 4: Output done
- "subs x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
"sub %x[N], %x[N], x28, LSL #2\n"
"bgt 4b\n"
"36:" // Exit
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16_mla_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16_mla_16VL.hpp
deleted file mode 100644
index a473be77f1..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16_mla_16VL.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-#include "../std_transforms_sme.hpp"
-
-#define ARGLIST \
- const __fp16 *, const __fp16 *, \
- __fp16 *, size_t, size_t, \
- const __fp16 *, Activation, bool
-
-namespace arm_gemm
-{
-void sme2_gemv_fp16_mla_16VL( ARGLIST );
-
-class cls_sme2_gemv_fp16_mla_16VL
-{
-public:
- typedef __fp16 operand_type;
- typedef __fp16 result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- static unsigned int out_width()
- {
- return sme::get_vector_length<__fp16>() * 16;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 1;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
-
- StdTransformsSME<operand_type, result_type, 1, 16, 1> transforms = {};
-
-
- // Default to the generic kernel
- kern_type kernel=sme2_gemv_fp16_mla_16VL;
- cls_sme2_gemv_fp16_mla_16VL(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16_mla_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16_mla_16VL/generic.cpp
deleted file mode 100644
index 4d18cc4670..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16_mla_16VL/generic.cpp
+++ /dev/null
@@ -1,776 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-#include <limits>
-
-namespace arm_gemm {
-
-void sme2_gemv_fp16_mla_16VL (
- const __fp16 *A_ptr, const __fp16 *B_ptr, __fp16 *output_ptr,
- size_t N, size_t K,
- const __fp16 *bias, Activation act, bool
-)
-{
- struct KernelArgs {
- __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
- __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
- const __fp16 *B_ptr = {};
- size_t output_offset = {};
- unsigned int input_initial_col = {};
- } ka;
-
- unsigned long flags=0;
- ka.B_ptr = B_ptr;
- switch(act.type) {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- ka.maxval = static_cast<__fp16>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- ka.minval = 0;
- flags |= 0x2;
- break;
- }
- __asm__ __volatile__(
- "ptrue p8.b\n"
- ".inst 0xd503477f // SMSTART ZA\n"
- "mov x9, #0x0\n"
- "cnth x28, ALL, MUL #4\n"
- "mov x27, %x[B_ptr]\n"
- "add x26, %x[N], x28\n"
- "mov x25, %x[output_ptr]\n"
- "sub x26, x26, #0x1\n"
- "ptrue p1.b\n"
- "udiv x26, x26, x28\n"
- ".inst 0x25207811 // ptrue pn9.b\n"
- "add x22, x26, #0x3\n"
- "mov x21, #0x1\n"
- "and x22, x22, #0xfffffffffffffffc\n"
- "mul x22, x22, x28\n"
- "mul x22, x22, %x[K]\n"
- "lsl x22, x22, #0x1\n"
- "1:" // RHS size check loop
- "cmp x22, #0x200000\n"
- "blt 2f\n"
- "tbnz x22, #0, 3f\n"
- "lsr x22, x22, #0x1\n"
- "lsl x21, x21, #0x1\n"
- "b 1b\n"
- "2:" // RHS do prefetch
- "lsl x20, x22, #0x26\n"
- "sub x21, x21, #0x1\n"
- "lsl x21, x21, #0x16\n"
- "orr x22, x22, x20\n"
- "orr x22, x22, x21\n"
- ".inst 0xf8b64b7a // rprfm pldonce, x22, [x27]\n"
- "3:" // RHS prefetch exit
- "mov x24, %x[bias]\n"
- "4:" // Column loop
- "cmp x26, #0x4\n"
- "bge 28f\n"
- "cmp x26, #0x2\n"
- "bgt 20f\n"
- "beq 12f\n"
- "mov x23, %x[A_ptr]\n"
- "lsl x21, %x[K], #0x1\n"
- "mov x20, %x[N]\n"
- "mov x22, %x[K]\n"
- ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
- ".inst 0x257467f0 // whilelt p8.h, XZR, x20, VLx4\n"
- "cbz x24, 5f\n"
- ".inst 0xa040a708 // ld1h { z8.h-z11.h }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042d00 // mova za.d[x9, #0], { z8.d-z11.d }\n"
- "b 6f\n"
- "5:" // Width 1: no bias
- ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
- "6:" // Width 1: setup done
- "cmp x22, #0x8\n"
- "ble 8f\n"
- "7:" // Width 1: Multiply loop: Main loop head
- "whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
- "sub x22, x22, #0x8\n"
- "add x23, x23, #0x10\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- "cmp x22, #0x8\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc113b280 // fmla za.h[x9, 0], { z20.h-z23.h }, z3.h[0]\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc113b388 // fmla za.h[x9, 0], { z28.h-z31.h }, z3.h[1]\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc113b500 // fmla za.h[x9, 0], { z8.h-z11.h }, z3.h[2]\n"
- ".inst 0xc113b488 // fmla za.h[x9, 0], { z4.h-z7.h }, z3.h[3]\n"
- ".inst 0xc113ba80 // fmla za.h[x9, 0], { z20.h-z23.h }, z3.h[4]\n"
- ".inst 0xc113b988 // fmla za.h[x9, 0], { z12.h-z15.h }, z3.h[5]\n"
- ".inst 0xc113bf80 // fmla za.h[x9, 0], { z28.h-z31.h }, z3.h[6]\n"
- ".inst 0xc113be08 // fmla za.h[x9, 0], { z16.h-z19.h }, z3.h[7]\n"
- "bgt 7b\n"
- "8:" // Width 1: Multiply loop: Single iteration only
- "whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- "ld1rqh { z15.h }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb280 // fmla za.h[x9, 0], { z20.h-z23.h }, z15.h[0]\n"
- "ble 9f\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb208 // fmla za.h[x9, 0], { z16.h-z19.h }, z15.h[1]\n"
- "ble 9f\n"
- ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb400 // fmla za.h[x9, 0], { z0.h-z3.h }, z15.h[2]\n"
- "ble 9f\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb788 // fmla za.h[x9, 0], { z28.h-z31.h }, z15.h[3]\n"
- "ble 9f\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb880 // fmla za.h[x9, 0], { z4.h-z7.h }, z15.h[4]\n"
- "ble 9f\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fba08 // fmla za.h[x9, 0], { z16.h-z19.h }, z15.h[5]\n"
- "ble 9f\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fbc80 // fmla za.h[x9, 0], { z4.h-z7.h }, z15.h[6]\n"
- "ble 9f\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fbf88 // fmla za.h[x9, 0], { z28.h-z31.h }, z15.h[7]\n"
- "9:" // Width 1: Multiply loop: multiply skip
- "tbz %x[flags], #1, 10f\n"
- "add x21, %x[args_ptr], %[offset_min]\n"
- "add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rh { z4.h }, p1/Z, [x21]\n"
- "ld1rh { z21.h }, p1/Z, [x20]\n"
- ".inst 0xc175c888 // fclamp { z8.h-z11.h }, z4.h, z21.h\n"
- ".inst 0xa060a328 // st1h { z8.h-z11.h }, p8, [x25]\n"
- "addvl x25, x25, #4\n"
- "b 11f\n"
- "10:" // Width 1: No activation
- ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
- ".inst 0xa060a320 // st1h { z0.h-z3.h }, p8, [x25]\n"
- "addvl x25, x25, #4\n"
- "11:" // Width 1: Output done
- "b 36f\n"
- "12:" // Width 2
- "mov x23, %x[A_ptr]\n"
- "lsl x21, %x[K], #0x1\n"
- "sub x20, %x[N], x28\n"
- "mov x22, %x[K]\n"
- ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
- ".inst 0x257467f0 // whilelt p8.h, XZR, x20, VLx4\n"
- "cbz x24, 13f\n"
- ".inst 0xa040a71c // ld1h { z28.h-z31.h }, pn9.b/Z, [x24]\n"
- ".inst 0xa041a70c // ld1h { z12.h-z15.h }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042f80 // mova za.d[x9, #0], { z28.d-z31.d }\n"
- ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
- "b 14f\n"
- "13:" // Width 2: no bias
- ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
- "14:" // Width 2: setup done
- "cmp x22, #0x8\n"
- "ble 16f\n"
- "15:" // Width 2: Multiply loop: Main loop head
- "whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- "sub x22, x22, #0x8\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
- "cmp x22, #0x8\n"
- "add x23, x23, #0x10\n"
- ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc113b100 // fmla za.h[x9, 0], { z8.h-z11.h }, z3.h[0]\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc113b301 // fmla za.h[x9, 1], { z24.h-z27.h }, z3.h[0]\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc113b288 // fmla za.h[x9, 0], { z20.h-z23.h }, z3.h[1]\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc113b209 // fmla za.h[x9, 1], { z16.h-z19.h }, z3.h[1]\n"
- ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc113b500 // fmla za.h[x9, 0], { z8.h-z11.h }, z3.h[2]\n"
- ".inst 0xa041a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc113b581 // fmla za.h[x9, 1], { z12.h-z15.h }, z3.h[2]\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc113b688 // fmla za.h[x9, 0], { z20.h-z23.h }, z3.h[3]\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc113b709 // fmla za.h[x9, 1], { z24.h-z27.h }, z3.h[3]\n"
- ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc113bb80 // fmla za.h[x9, 0], { z28.h-z31.h }, z3.h[4]\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc113b881 // fmla za.h[x9, 1], { z4.h-z7.h }, z3.h[4]\n"
- ".inst 0xc113b988 // fmla za.h[x9, 0], { z12.h-z15.h }, z3.h[5]\n"
- ".inst 0xc113ba09 // fmla za.h[x9, 1], { z16.h-z19.h }, z3.h[5]\n"
- ".inst 0xc113bd00 // fmla za.h[x9, 0], { z8.h-z11.h }, z3.h[6]\n"
- ".inst 0xc113be81 // fmla za.h[x9, 1], { z20.h-z23.h }, z3.h[6]\n"
- ".inst 0xc113bf08 // fmla za.h[x9, 0], { z24.h-z27.h }, z3.h[7]\n"
- ".inst 0xc113bf89 // fmla za.h[x9, 1], { z28.h-z31.h }, z3.h[7]\n"
- "bgt 15b\n"
- "16:" // Width 2: Multiply loop: Single iteration only
- "whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- "ld1rqh { z15.h }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb380 // fmla za.h[x9, 0], { z28.h-z31.h }, z15.h[0]\n"
- ".inst 0xc11fb101 // fmla za.h[x9, 1], { z8.h-z11.h }, z15.h[0]\n"
- "ble 17f\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb088 // fmla za.h[x9, 0], { z4.h-z7.h }, z15.h[1]\n"
- ".inst 0xc11fb289 // fmla za.h[x9, 1], { z20.h-z23.h }, z15.h[1]\n"
- "ble 17f\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb600 // fmla za.h[x9, 0], { z16.h-z19.h }, z15.h[2]\n"
- ".inst 0xc11fb501 // fmla za.h[x9, 1], { z8.h-z11.h }, z15.h[2]\n"
- "ble 17f\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb508 // fmla za.h[x9, 0], { z8.h-z11.h }, z15.h[3]\n"
- ".inst 0xc11fb789 // fmla za.h[x9, 1], { z28.h-z31.h }, z15.h[3]\n"
- "ble 17f\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb880 // fmla za.h[x9, 0], { z4.h-z7.h }, z15.h[4]\n"
- ".inst 0xc11fba01 // fmla za.h[x9, 1], { z16.h-z19.h }, z15.h[4]\n"
- "ble 17f\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fbb08 // fmla za.h[x9, 0], { z24.h-z27.h }, z15.h[5]\n"
- ".inst 0xc11fb909 // fmla za.h[x9, 1], { z8.h-z11.h }, z15.h[5]\n"
- "ble 17f\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fbf80 // fmla za.h[x9, 0], { z28.h-z31.h }, z15.h[6]\n"
- ".inst 0xc11fbd01 // fmla za.h[x9, 1], { z8.h-z11.h }, z15.h[6]\n"
- "ble 17f\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fbf08 // fmla za.h[x9, 0], { z24.h-z27.h }, z15.h[7]\n"
- ".inst 0xc11fbf89 // fmla za.h[x9, 1], { z28.h-z31.h }, z15.h[7]\n"
- "17:" // Width 2: Multiply loop: multiply skip
- "tbz %x[flags], #1, 18f\n"
- "add x21, %x[args_ptr], %[offset_min]\n"
- "add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c3c // mova { z28.d-z31.d }, za.d[x9, #1]\n"
- "ld1rh { z15.h }, p1/Z, [x21]\n"
- "ld1rh { z27.h }, p1/Z, [x20]\n"
- ".inst 0xc17bc9f0 // fclamp { z16.h-z19.h }, z15.h, z27.h\n"
- ".inst 0xc17bc9fc // fclamp { z28.h-z31.h }, z15.h, z27.h\n"
- ".inst 0xa060a730 // st1h { z16.h-z19.h }, pn9.b, [x25]\n"
- ".inst 0xa061a33c // st1h { z28.h-z31.h }, p8, [x25, #0x4, MUL VL]\n"
- "addvl x25, x25, #8\n"
- "b 19f\n"
- "18:" // Width 2: No activation
- ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c3c // mova { z28.d-z31.d }, za.d[x9, #1]\n"
- ".inst 0xa060a72c // st1h { z12.h-z15.h }, pn9.b, [x25]\n"
- ".inst 0xa061a33c // st1h { z28.h-z31.h }, p8, [x25, #0x4, MUL VL]\n"
- "addvl x25, x25, #8\n"
- "19:" // Width 2: Output done
- "b 36f\n"
- "20:" // Width 3
- "mov x20, #0x2\n"
- "mov x23, %x[A_ptr]\n"
- "lsl x21, %x[K], #0x1\n"
- "msub x20, x28, x20, %x[N]\n"
- "mov x22, %x[K]\n"
- ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
- ".inst 0x257467f0 // whilelt p8.h, XZR, x20, VLx4\n"
- "cbz x24, 21f\n"
- ".inst 0xa040a700 // ld1h { z0.h-z3.h }, pn9.b/Z, [x24]\n"
- ".inst 0xa041a710 // ld1h { z16.h-z19.h }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xa042a708 // ld1h { z8.h-z11.h }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
- ".inst 0xc0042d02 // mova za.d[x9, #2], { z8.d-z11.d }\n"
- "b 22f\n"
- "21:" // Width 3: no bias
- ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
- "22:" // Width 3: setup done
- "cmp x22, #0x8\n"
- "ble 24f\n"
- "23:" // Width 3: Multiply loop: Main loop head
- "whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
- "sub x22, x22, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x23]\n"
- "cmp x22, #0x8\n"
- "add x23, x23, #0x10\n"
- ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc110b180 // fmla za.h[x9, 0], { z12.h-z15.h }, z0.h[0]\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc110b281 // fmla za.h[x9, 1], { z20.h-z23.h }, z0.h[0]\n"
- ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc110b102 // fmla za.h[x9, 2], { z8.h-z11.h }, z0.h[0]\n"
- ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc110b088 // fmla za.h[x9, 0], { z4.h-z7.h }, z0.h[1]\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc110b309 // fmla za.h[x9, 1], { z24.h-z27.h }, z0.h[1]\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc110b18a // fmla za.h[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
- ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc110b680 // fmla za.h[x9, 0], { z20.h-z23.h }, z0.h[2]\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc110b781 // fmla za.h[x9, 1], { z28.h-z31.h }, z0.h[2]\n"
- ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc110b502 // fmla za.h[x9, 2], { z8.h-z11.h }, z0.h[2]\n"
- ".inst 0xa042a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc110b608 // fmla za.h[x9, 0], { z16.h-z19.h }, z0.h[3]\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc110b709 // fmla za.h[x9, 1], { z24.h-z27.h }, z0.h[3]\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc110b48a // fmla za.h[x9, 2], { z4.h-z7.h }, z0.h[3]\n"
- ".inst 0xa042a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc110bb80 // fmla za.h[x9, 0], { z28.h-z31.h }, z0.h[4]\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc110b901 // fmla za.h[x9, 1], { z8.h-z11.h }, z0.h[4]\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc110b882 // fmla za.h[x9, 2], { z4.h-z7.h }, z0.h[4]\n"
- ".inst 0xa042a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc110b988 // fmla za.h[x9, 0], { z12.h-z15.h }, z0.h[5]\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc110b909 // fmla za.h[x9, 1], { z8.h-z11.h }, z0.h[5]\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc110bb0a // fmla za.h[x9, 2], { z24.h-z27.h }, z0.h[5]\n"
- ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc110be80 // fmla za.h[x9, 0], { z20.h-z23.h }, z0.h[6]\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc110be01 // fmla za.h[x9, 1], { z16.h-z19.h }, z0.h[6]\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc110bd82 // fmla za.h[x9, 2], { z12.h-z15.h }, z0.h[6]\n"
- ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc110bc88 // fmla za.h[x9, 0], { z4.h-z7.h }, z0.h[7]\n"
- ".inst 0xc110be09 // fmla za.h[x9, 1], { z16.h-z19.h }, z0.h[7]\n"
- ".inst 0xc110be8a // fmla za.h[x9, 2], { z20.h-z23.h }, z0.h[7]\n"
- "bgt 23b\n"
- "24:" // Width 3: Multiply loop: Single iteration only
- "whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- "ld1rqh { z15.h }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb080 // fmla za.h[x9, 0], { z4.h-z7.h }, z15.h[0]\n"
- ".inst 0xc11fb301 // fmla za.h[x9, 1], { z24.h-z27.h }, z15.h[0]\n"
- ".inst 0xc11fb382 // fmla za.h[x9, 2], { z28.h-z31.h }, z15.h[0]\n"
- "ble 25f\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb308 // fmla za.h[x9, 0], { z24.h-z27.h }, z15.h[1]\n"
- ".inst 0xc11fb089 // fmla za.h[x9, 1], { z4.h-z7.h }, z15.h[1]\n"
- ".inst 0xc11fb10a // fmla za.h[x9, 2], { z8.h-z11.h }, z15.h[1]\n"
- "ble 25f\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb780 // fmla za.h[x9, 0], { z28.h-z31.h }, z15.h[2]\n"
- ".inst 0xc11fb601 // fmla za.h[x9, 1], { z16.h-z19.h }, z15.h[2]\n"
- ".inst 0xc11fb502 // fmla za.h[x9, 2], { z8.h-z11.h }, z15.h[2]\n"
- "ble 25f\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb488 // fmla za.h[x9, 0], { z4.h-z7.h }, z15.h[3]\n"
- ".inst 0xc11fb709 // fmla za.h[x9, 1], { z24.h-z27.h }, z15.h[3]\n"
- ".inst 0xc11fb60a // fmla za.h[x9, 2], { z16.h-z19.h }, z15.h[3]\n"
- "ble 25f\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb900 // fmla za.h[x9, 0], { z8.h-z11.h }, z15.h[4]\n"
- ".inst 0xc11fbb01 // fmla za.h[x9, 1], { z24.h-z27.h }, z15.h[4]\n"
- ".inst 0xc11fba02 // fmla za.h[x9, 2], { z16.h-z19.h }, z15.h[4]\n"
- "ble 25f\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb908 // fmla za.h[x9, 0], { z8.h-z11.h }, z15.h[5]\n"
- ".inst 0xc11fba09 // fmla za.h[x9, 1], { z16.h-z19.h }, z15.h[5]\n"
- ".inst 0xc11fbb0a // fmla za.h[x9, 2], { z24.h-z27.h }, z15.h[5]\n"
- "ble 25f\n"
- ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fbd00 // fmla za.h[x9, 0], { z8.h-z11.h }, z15.h[6]\n"
- ".inst 0xc11fbe81 // fmla za.h[x9, 1], { z20.h-z23.h }, z15.h[6]\n"
- ".inst 0xc11fbf02 // fmla za.h[x9, 2], { z24.h-z27.h }, z15.h[6]\n"
- "ble 25f\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fbf88 // fmla za.h[x9, 0], { z28.h-z31.h }, z15.h[7]\n"
- ".inst 0xc11fbd09 // fmla za.h[x9, 1], { z8.h-z11.h }, z15.h[7]\n"
- ".inst 0xc11fbe8a // fmla za.h[x9, 2], { z20.h-z23.h }, z15.h[7]\n"
- "25:" // Width 3: Multiply loop: multiply skip
- "tbz %x[flags], #1, 26f\n"
- "add x21, %x[args_ptr], %[offset_min]\n"
- "add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
- "ld1rh { z17.h }, p1/Z, [x21]\n"
- ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
- "ld1rh { z16.h }, p1/Z, [x20]\n"
- ".inst 0xc170ca2c // fclamp { z12.h-z15.h }, z17.h, z16.h\n"
- ".inst 0xc170ca20 // fclamp { z0.h-z3.h }, z17.h, z16.h\n"
- ".inst 0xa060a72c // st1h { z12.h-z15.h }, pn9.b, [x25]\n"
- ".inst 0xc170ca24 // fclamp { z4.h-z7.h }, z17.h, z16.h\n"
- ".inst 0xa061a720 // st1h { z0.h-z3.h }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062a324 // st1h { z4.h-z7.h }, p8, [x25, #0x8, MUL VL]\n"
- "addvl x25, x25, #12\n"
- "b 27f\n"
- "26:" // Width 3: No activation
- ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
- ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n"
- ".inst 0xa060a730 // st1h { z16.h-z19.h }, pn9.b, [x25]\n"
- ".inst 0xa061a738 // st1h { z24.h-z27.h }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062a334 // st1h { z20.h-z23.h }, p8, [x25, #0x8, MUL VL]\n"
- "addvl x25, x25, #12\n"
- "27:" // Width 3: Output done
- "b 36f\n"
- "28:" // Width 4
- "mov x20, #0x3\n"
- "mov x23, %x[A_ptr]\n"
- "lsl x21, %x[K], #0x1\n"
- "msub x20, x28, x20, %x[N]\n"
- "mov x22, %x[K]\n"
- ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
- ".inst 0x257467f0 // whilelt p8.h, XZR, x20, VLx4\n"
- "cbz x24, 29f\n"
- ".inst 0xa040a714 // ld1h { z20.h-z23.h }, pn9.b/Z, [x24]\n"
- ".inst 0xa041a704 // ld1h { z4.h-z7.h }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xa042a708 // ld1h { z8.h-z11.h }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xa043a700 // ld1h { z0.h-z3.h }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
- ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
- "addvl x24, x24, #16\n"
- ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
- ".inst 0xc0042d02 // mova za.d[x9, #2], { z8.d-z11.d }\n"
- ".inst 0xc0042c03 // mova za.d[x9, #3], { z0.d-z3.d }\n"
- "b 30f\n"
- "29:" // Width 4: no bias
- ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
- "30:" // Width 4: setup done
- "cmp x22, #0x8\n"
- "ble 32f\n"
- "31:" // Width 4: Multiply loop: Main loop head
- "whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- "sub x22, x22, #0x8\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "cmp x22, #0x8\n"
- "add x23, x23, #0x10\n"
- ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc114b300 // fmla za.h[x9, 0], { z24.h-z27.h }, z4.h[0]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc114b181 // fmla za.h[x9, 1], { z12.h-z15.h }, z4.h[0]\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc114b382 // fmla za.h[x9, 2], { z28.h-z31.h }, z4.h[0]\n"
- ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc114b103 // fmla za.h[x9, 3], { z8.h-z11.h }, z4.h[0]\n"
- ".inst 0xa042a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc114b208 // fmla za.h[x9, 0], { z16.h-z19.h }, z4.h[1]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc114b189 // fmla za.h[x9, 1], { z12.h-z15.h }, z4.h[1]\n"
- ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc114b30a // fmla za.h[x9, 2], { z24.h-z27.h }, z4.h[1]\n"
- ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc114b10b // fmla za.h[x9, 3], { z8.h-z11.h }, z4.h[1]\n"
- ".inst 0xa042a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc114b580 // fmla za.h[x9, 0], { z12.h-z15.h }, z4.h[2]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc114b401 // fmla za.h[x9, 1], { z0.h-z3.h }, z4.h[2]\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc114b702 // fmla za.h[x9, 2], { z24.h-z27.h }, z4.h[2]\n"
- ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc114b503 // fmla za.h[x9, 3], { z8.h-z11.h }, z4.h[2]\n"
- ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc114b788 // fmla za.h[x9, 0], { z28.h-z31.h }, z4.h[3]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc114b409 // fmla za.h[x9, 1], { z0.h-z3.h }, z4.h[3]\n"
- ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc114b50a // fmla za.h[x9, 2], { z8.h-z11.h }, z4.h[3]\n"
- ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc114b58b // fmla za.h[x9, 3], { z12.h-z15.h }, z4.h[3]\n"
- ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc114b800 // fmla za.h[x9, 0], { z0.h-z3.h }, z4.h[4]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc114bb01 // fmla za.h[x9, 1], { z24.h-z27.h }, z4.h[4]\n"
- ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc114b982 // fmla za.h[x9, 2], { z12.h-z15.h }, z4.h[4]\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc114b903 // fmla za.h[x9, 3], { z8.h-z11.h }, z4.h[4]\n"
- ".inst 0xa042a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc114ba88 // fmla za.h[x9, 0], { z20.h-z23.h }, z4.h[5]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc114ba09 // fmla za.h[x9, 1], { z16.h-z19.h }, z4.h[5]\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc114b80a // fmla za.h[x9, 2], { z0.h-z3.h }, z4.h[5]\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc114b98b // fmla za.h[x9, 3], { z12.h-z15.h }, z4.h[5]\n"
- ".inst 0xa042a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc114bf00 // fmla za.h[x9, 0], { z24.h-z27.h }, z4.h[6]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc114be01 // fmla za.h[x9, 1], { z16.h-z19.h }, z4.h[6]\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- ".inst 0xc114bc02 // fmla za.h[x9, 2], { z0.h-z3.h }, z4.h[6]\n"
- ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc114bd03 // fmla za.h[x9, 3], { z8.h-z11.h }, z4.h[6]\n"
- ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc114be08 // fmla za.h[x9, 0], { z16.h-z19.h }, z4.h[7]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc114bd89 // fmla za.h[x9, 1], { z12.h-z15.h }, z4.h[7]\n"
- ".inst 0xc114be8a // fmla za.h[x9, 2], { z20.h-z23.h }, z4.h[7]\n"
- ".inst 0xc114bd0b // fmla za.h[x9, 3], { z8.h-z11.h }, z4.h[7]\n"
- "bgt 31b\n"
- "32:" // Width 4: Multiply loop: Single iteration only
- "whilelt p0.h, XZR, x22\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- "ld1rqh { z15.h }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc11fb080 // fmla za.h[x9, 0], { z4.h-z7.h }, z15.h[0]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb201 // fmla za.h[x9, 1], { z16.h-z19.h }, z15.h[0]\n"
- ".inst 0xc11fb102 // fmla za.h[x9, 2], { z8.h-z11.h }, z15.h[0]\n"
- ".inst 0xc11fb003 // fmla za.h[x9, 3], { z0.h-z3.h }, z15.h[0]\n"
- "ble 33f\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc11fb088 // fmla za.h[x9, 0], { z4.h-z7.h }, z15.h[1]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb389 // fmla za.h[x9, 1], { z28.h-z31.h }, z15.h[1]\n"
- ".inst 0xc11fb10a // fmla za.h[x9, 2], { z8.h-z11.h }, z15.h[1]\n"
- ".inst 0xc11fb20b // fmla za.h[x9, 3], { z16.h-z19.h }, z15.h[1]\n"
- "ble 33f\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc11fb480 // fmla za.h[x9, 0], { z4.h-z7.h }, z15.h[2]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb501 // fmla za.h[x9, 1], { z8.h-z11.h }, z15.h[2]\n"
- ".inst 0xc11fb682 // fmla za.h[x9, 2], { z20.h-z23.h }, z15.h[2]\n"
- ".inst 0xc11fb603 // fmla za.h[x9, 3], { z16.h-z19.h }, z15.h[2]\n"
- "ble 33f\n"
- ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc11fb788 // fmla za.h[x9, 0], { z28.h-z31.h }, z15.h[3]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fb509 // fmla za.h[x9, 1], { z8.h-z11.h }, z15.h[3]\n"
- ".inst 0xc11fb48a // fmla za.h[x9, 2], { z4.h-z7.h }, z15.h[3]\n"
- ".inst 0xc11fb40b // fmla za.h[x9, 3], { z0.h-z3.h }, z15.h[3]\n"
- "ble 33f\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc11fbb00 // fmla za.h[x9, 0], { z24.h-z27.h }, z15.h[4]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fba81 // fmla za.h[x9, 1], { z20.h-z23.h }, z15.h[4]\n"
- ".inst 0xc11fba02 // fmla za.h[x9, 2], { z16.h-z19.h }, z15.h[4]\n"
- ".inst 0xc11fb883 // fmla za.h[x9, 3], { z4.h-z7.h }, z15.h[4]\n"
- "ble 33f\n"
- ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc11fba08 // fmla za.h[x9, 0], { z16.h-z19.h }, z15.h[5]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fbb89 // fmla za.h[x9, 1], { z28.h-z31.h }, z15.h[5]\n"
- ".inst 0xc11fba8a // fmla za.h[x9, 2], { z20.h-z23.h }, z15.h[5]\n"
- ".inst 0xc11fb88b // fmla za.h[x9, 3], { z4.h-z7.h }, z15.h[5]\n"
- "ble 33f\n"
- ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc11fbc80 // fmla za.h[x9, 0], { z4.h-z7.h }, z15.h[6]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fbd01 // fmla za.h[x9, 1], { z8.h-z11.h }, z15.h[6]\n"
- ".inst 0xc11fbe82 // fmla za.h[x9, 2], { z20.h-z23.h }, z15.h[6]\n"
- ".inst 0xc11fbe03 // fmla za.h[x9, 3], { z16.h-z19.h }, z15.h[6]\n"
- "ble 33f\n"
- ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc11fbf08 // fmla za.h[x9, 0], { z24.h-z27.h }, z15.h[7]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc11fbf89 // fmla za.h[x9, 1], { z28.h-z31.h }, z15.h[7]\n"
- ".inst 0xc11fbc8a // fmla za.h[x9, 2], { z4.h-z7.h }, z15.h[7]\n"
- ".inst 0xc11fbe8b // fmla za.h[x9, 3], { z20.h-z23.h }, z15.h[7]\n"
- "33:" // Width 4: Multiply loop: multiply skip
- "tbz %x[flags], #1, 34f\n"
- "add x21, %x[args_ptr], %[offset_min]\n"
- "add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- "ld1rh { z17.h }, p1/Z, [x21]\n"
- ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
- "ld1rh { z16.h }, p1/Z, [x20]\n"
- ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n"
- ".inst 0xc170ca24 // fclamp { z4.h-z7.h }, z17.h, z16.h\n"
- ".inst 0xc170ca2c // fclamp { z12.h-z15.h }, z17.h, z16.h\n"
- ".inst 0xc170ca20 // fclamp { z0.h-z3.h }, z17.h, z16.h\n"
- ".inst 0xa060a724 // st1h { z4.h-z7.h }, pn9.b, [x25]\n"
- ".inst 0xa061a72c // st1h { z12.h-z15.h }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc170ca28 // fclamp { z8.h-z11.h }, z17.h, z16.h\n"
- ".inst 0xa062a720 // st1h { z0.h-z3.h }, pn9.b, [x25, #0x8, MUL VL]\n"
- ".inst 0xa063a328 // st1h { z8.h-z11.h }, p8, [x25, #0xc, MUL VL]\n"
- "addvl x25, x25, #16\n"
- "b 35f\n"
- "34:" // Width 4: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xc0062c60 // mova { z0.d-z3.d }, za.d[x9, #3]\n"
- ".inst 0xa060a728 // st1h { z8.h-z11.h }, pn9.b, [x25]\n"
- ".inst 0xa061a72c // st1h { z12.h-z15.h }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062a730 // st1h { z16.h-z19.h }, pn9.b, [x25, #0x8, MUL VL]\n"
- ".inst 0xa063a320 // st1h { z0.h-z3.h }, p8, [x25, #0xc, MUL VL]\n"
- "addvl x25, x25, #16\n"
- "35:" // Width 4: Output done
- "subs x26, x26, #0x4\n"
- "sub %x[N], %x[N], x28, LSL #2\n"
- "bgt 4b\n"
- "36:" // Exit
- ".inst 0xd503467f // SMSTOP\n"
- "ptrue p8.b\n"
- : [N] "+&r" (N)
- : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace arm_gemm
-
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
index aec02fa337..1cce355583 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
@@ -63,21 +63,21 @@ void sme2_gemv_fp32_mla_16VL (
__asm__ __volatile__(
"ptrue p8.b\n"
".inst 0xd503477f // SMSTART ZA\n"
- "mov x9, #0x0\n"
"cntw x28, ALL, MUL #4\n"
- "mov x27, %x[B_ptr]\n"
- "add x26, %x[N], x28\n"
- "mov x25, %x[output_ptr]\n"
- "sub x26, x26, #0x1\n"
- "ptrue p1.b\n"
- "udiv x26, x26, x28\n"
- ".inst 0x25207811 // ptrue pn9.b\n"
- "add x22, x26, #0x3\n"
- "mov x21, #0x1\n"
+ "add x27, %x[N], x28\n"
+ "sub x27, x27, #0x1\n"
+ "udiv x27, x27, x28\n"
+ "add x22, x27, #0x3\n"
"and x22, x22, #0xfffffffffffffffc\n"
"mul x22, x22, x28\n"
"mul x22, x22, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x26, %x[B_ptr]\n"
+ "mov x25, %x[output_ptr]\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
"lsl x22, x22, #0x2\n"
+ "mov x21, #0x1\n"
"1:" // RHS size check loop
"cmp x22, #0x200000\n"
"blt 2f\n"
@@ -91,13 +91,13 @@ void sme2_gemv_fp32_mla_16VL (
"lsl x21, x21, #0x16\n"
"orr x22, x22, x20\n"
"orr x22, x22, x21\n"
- ".inst 0xf8b64b7a // rprfm pldonce, x22, [x27]\n"
+ ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
"3:" // RHS prefetch exit
"mov x24, %x[bias]\n"
"4:" // Column loop
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"bge 28f\n"
- "cmp x26, #0x2\n"
+ "cmp x27, #0x2\n"
"bgt 20f\n"
"beq 12f\n"
"mov x23, %x[A_ptr]\n"
@@ -107,8 +107,8 @@ void sme2_gemv_fp32_mla_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 5f\n"
- ".inst 0xa040c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
"b 6f\n"
"5:" // Width 1: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -117,59 +117,59 @@ void sme2_gemv_fp32_mla_16VL (
"ble 8f\n"
"7:" // Width 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x22\n"
- ".inst 0xa040c77d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "ld1rqw { z8.s }, p0/Z, [x23]\n"
"sub x22, x22, #0x4\n"
- "add x23, x23, #0x10\n"
- ".inst 0xa040c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
+ ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a280 // fmla za.s[x9, 0], { z20.s-z23.s }, z8.s[0]\n"
+ "addvl x26, x26, #16\n"
"cmp x22, #0x4\n"
- ".inst 0xa040c779 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc152a380 // fmla za.s[x9, 0], { z28.s-z31.s }, z2.s[0]\n"
- ".inst 0xa040c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc152a600 // fmla za.s[x9, 0], { z16.s-z19.s }, z2.s[1]\n"
- ".inst 0xc152ab00 // fmla za.s[x9, 0], { z24.s-z27.s }, z2.s[2]\n"
- ".inst 0xc152ad80 // fmla za.s[x9, 0], { z12.s-z15.s }, z2.s[3]\n"
+ ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a480 // fmla za.s[x9, 0], { z4.s-z7.s }, z8.s[1]\n"
+ "addvl x26, x26, #16\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158ab80 // fmla za.s[x9, 0], { z28.s-z31.s }, z8.s[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158af00 // fmla za.s[x9, 0], { z24.s-z27.s }, z8.s[3]\n"
+ "addvl x26, x26, #16\n"
"bgt 7b\n"
"8:" // Width 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x22\n"
- ".inst 0xa040c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27]\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
"subs x22, x22, #0x1\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a180 // fmla za.s[x9, 0], { z12.s-z15.s }, z3.s[0]\n"
+ ".inst 0xc15ba380 // fmla za.s[x9, 0], { z28.s-z31.s }, z11.s[0]\n"
+ "addvl x26, x26, #16\n"
"ble 9f\n"
- ".inst 0xa040c765 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x1\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a480 // fmla za.s[x9, 0], { z4.s-z7.s }, z3.s[1]\n"
+ ".inst 0xc15ba580 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[1]\n"
+ "addvl x26, x26, #16\n"
"ble 9f\n"
- ".inst 0xa040c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x1\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a980 // fmla za.s[x9, 0], { z12.s-z15.s }, z3.s[2]\n"
+ ".inst 0xc15baa00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[2]\n"
+ "addvl x26, x26, #16\n"
"ble 9f\n"
- ".inst 0xa040c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153ad00 // fmla za.s[x9, 0], { z8.s-z11.s }, z3.s[3]\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bac00 // fmla za.s[x9, 0], { z0.s-z3.s }, z11.s[3]\n"
+ "addvl x26, x26, #16\n"
"9:" // Width 1: Multiply loop: multiply skip
"tbz %x[flags], #1, 10f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
- "ld1rw { z23.s }, p1/Z, [x21]\n"
- "ld1rw { z22.s }, p1/Z, [x20]\n"
- ".inst 0xc1b6cae0 // fclamp { z0.s-z3.s }, z23.s, z22.s\n"
- ".inst 0xa060c320 // st1w { z0.s-z3.s }, p8, [x25]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z3.s }, p1/Z, [x21]\n"
+ "ld1rw { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc1bdc868 // fclamp { z8.s-z11.s }, z3.s, z29.s\n"
+ ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"b 11f\n"
"10:" // Width 1: No activation
- ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
- ".inst 0xa060c320 // st1w { z0.s-z3.s }, p8, [x25]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c32c // st1w { z12.s-z15.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"11:" // Width 1: Output done
"b 36f\n"
@@ -181,10 +181,10 @@ void sme2_gemv_fp32_mla_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 13f\n"
- ".inst 0xa040c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24]\n"
- ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042f80 // mova za.d[x9, #0], { z28.d-z31.d }\n"
- ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
"b 14f\n"
"13:" // Width 2: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -193,80 +193,80 @@ void sme2_gemv_fp32_mla_16VL (
"ble 16f\n"
"15:" // Width 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x22\n"
- ".inst 0xa040c77d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x27]\n"
+ "ld1rqw { z9.s }, p0/Z, [x23]\n"
"sub x22, x22, #0x4\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159a180 // fmla za.s[x9, 0], { z12.s-z15.s }, z9.s[0]\n"
"cmp x22, #0x4\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27]\n"
- ".inst 0xc151a380 // fmla za.s[x9, 0], { z28.s-z31.s }, z1.s[0]\n"
- ".inst 0xa041c779 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc151a181 // fmla za.s[x9, 1], { z12.s-z15.s }, z1.s[0]\n"
- ".inst 0xa040c77d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x27]\n"
- ".inst 0xa041c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27]\n"
- ".inst 0xc151a600 // fmla za.s[x9, 0], { z16.s-z19.s }, z1.s[1]\n"
- ".inst 0xa041c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc151a701 // fmla za.s[x9, 1], { z24.s-z27.s }, z1.s[1]\n"
- ".inst 0xc151ab80 // fmla za.s[x9, 0], { z28.s-z31.s }, z1.s[2]\n"
- ".inst 0xc151a981 // fmla za.s[x9, 1], { z12.s-z15.s }, z1.s[2]\n"
- ".inst 0xc151ad00 // fmla za.s[x9, 0], { z8.s-z11.s }, z1.s[3]\n"
- ".inst 0xc151ae81 // fmla za.s[x9, 1], { z20.s-z23.s }, z1.s[3]\n"
+ ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159a081 // fmla za.s[x9, 1], { z4.s-z7.s }, z9.s[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159a580 // fmla za.s[x9, 0], { z12.s-z15.s }, z9.s[1]\n"
+ ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159a481 // fmla za.s[x9, 1], { z4.s-z7.s }, z9.s[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159ab00 // fmla za.s[x9, 0], { z24.s-z27.s }, z9.s[2]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159a801 // fmla za.s[x9, 1], { z0.s-z3.s }, z9.s[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159ac00 // fmla za.s[x9, 0], { z0.s-z3.s }, z9.s[3]\n"
+ ".inst 0xa041c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159af81 // fmla za.s[x9, 1], { z28.s-z31.s }, z9.s[3]\n"
+ "addvl x26, x26, #16\n"
"bgt 15b\n"
"16:" // Width 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x22\n"
- ".inst 0xa040c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27]\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
"subs x22, x22, #0x1\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041c77d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a200 // fmla za.s[x9, 0], { z16.s-z19.s }, z3.s[0]\n"
- ".inst 0xc153a381 // fmla za.s[x9, 1], { z28.s-z31.s }, z3.s[0]\n"
+ ".inst 0xc15ba180 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[0]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba001 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[0]\n"
+ "addvl x26, x26, #16\n"
"ble 17f\n"
- ".inst 0xa040c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa041c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a680 // fmla za.s[x9, 0], { z20.s-z23.s }, z3.s[1]\n"
- ".inst 0xc153a601 // fmla za.s[x9, 1], { z16.s-z19.s }, z3.s[1]\n"
+ ".inst 0xc15ba700 // fmla za.s[x9, 0], { z24.s-z27.s }, z11.s[1]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba401 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[1]\n"
+ "addvl x26, x26, #16\n"
"ble 17f\n"
- ".inst 0xa040c765 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa041c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a880 // fmla za.s[x9, 0], { z4.s-z7.s }, z3.s[2]\n"
- ".inst 0xc153aa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z3.s[2]\n"
+ ".inst 0xc15ba980 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[2]\n"
+ ".inst 0xa041c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bab81 // fmla za.s[x9, 1], { z28.s-z31.s }, z11.s[2]\n"
+ "addvl x26, x26, #16\n"
"ble 17f\n"
- ".inst 0xa040c77d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x27]\n"
- ".inst 0xa041c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153af80 // fmla za.s[x9, 0], { z28.s-z31.s }, z3.s[3]\n"
- ".inst 0xc153ad81 // fmla za.s[x9, 1], { z12.s-z15.s }, z3.s[3]\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[3]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bae81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[3]\n"
+ "addvl x26, x26, #16\n"
"17:" // Width 2: Multiply loop: multiply skip
"tbz %x[flags], #1, 18f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n"
- "ld1rw { z17.s }, p1/Z, [x21]\n"
- "ld1rw { z23.s }, p1/Z, [x20]\n"
- ".inst 0xc1b7ca24 // fclamp { z4.s-z7.s }, z17.s, z23.s\n"
- ".inst 0xc1b7ca28 // fclamp { z8.s-z11.s }, z17.s, z23.s\n"
- ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
- ".inst 0xa061c328 // st1w { z8.s-z11.s }, p8, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ "ld1rw { z9.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ "ld1rw { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1a8c920 // fclamp { z0.s-z3.s }, z9.s, z8.s\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc1a8c924 // fclamp { z4.s-z7.s }, z9.s, z8.s\n"
+ ".inst 0xa061c324 // st1w { z4.s-z7.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"b 19f\n"
"18:" // Width 2: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xa061c330 // st1w { z16.s-z19.s }, p8, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"19:" // Width 2: Output done
"b 36f\n"
@@ -279,12 +279,12 @@ void sme2_gemv_fp32_mla_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 21f\n"
- ".inst 0xa040c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24]\n"
- ".inst 0xa041c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042d00 // mova za.d[x9, #0], { z8.d-z11.d }\n"
- ".inst 0xc0042c01 // mova za.d[x9, #1], { z0.d-z3.d }\n"
- ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
+ ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
"b 22f\n"
"21:" // Width 3: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -293,101 +293,101 @@ void sme2_gemv_fp32_mla_16VL (
"ble 24f\n"
"23:" // Width 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x22\n"
- ".inst 0xa040c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27]\n"
+ "ld1rqw { z15.s }, p0/Z, [x23]\n"
"sub x22, x22, #0x4\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z15.s[0]\n"
"cmp x22, #0x4\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042c765 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a180 // fmla za.s[x9, 0], { z12.s-z15.s }, z3.s[0]\n"
- ".inst 0xa040c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27]\n"
- ".inst 0xc153a101 // fmla za.s[x9, 1], { z8.s-z11.s }, z3.s[0]\n"
- ".inst 0xa041c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153a082 // fmla za.s[x9, 2], { z4.s-z7.s }, z3.s[0]\n"
- ".inst 0xa042c779 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27]\n"
- ".inst 0xc153a600 // fmla za.s[x9, 0], { z16.s-z19.s }, z3.s[1]\n"
- ".inst 0xa041c77d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153a681 // fmla za.s[x9, 1], { z20.s-z23.s }, z3.s[1]\n"
- ".inst 0xa042c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a702 // fmla za.s[x9, 2], { z24.s-z27.s }, z3.s[1]\n"
- ".inst 0xa040c765 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x27]\n"
- ".inst 0xa041c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153a980 // fmla za.s[x9, 0], { z12.s-z15.s }, z3.s[2]\n"
- ".inst 0xa042c779 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153ab81 // fmla za.s[x9, 1], { z28.s-z31.s }, z3.s[2]\n"
- ".inst 0xc153a902 // fmla za.s[x9, 2], { z8.s-z11.s }, z3.s[2]\n"
- ".inst 0xc153ac80 // fmla za.s[x9, 0], { z4.s-z7.s }, z3.s[3]\n"
- ".inst 0xc153ae81 // fmla za.s[x9, 1], { z20.s-z23.s }, z3.s[3]\n"
- ".inst 0xc153af02 // fmla za.s[x9, 2], { z24.s-z27.s }, z3.s[3]\n"
+ ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z15.s[0]\n"
+ ".inst 0xa042c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fa002 // fmla za.s[x9, 2], { z0.s-z3.s }, z15.s[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fa680 // fmla za.s[x9, 0], { z20.s-z23.s }, z15.s[1]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fa681 // fmla za.s[x9, 1], { z20.s-z23.s }, z15.s[1]\n"
+ ".inst 0xa042c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fa502 // fmla za.s[x9, 2], { z8.s-z11.s }, z15.s[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fab00 // fmla za.s[x9, 0], { z24.s-z27.s }, z15.s[2]\n"
+ ".inst 0xa041c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fa901 // fmla za.s[x9, 1], { z8.s-z11.s }, z15.s[2]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15faa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z15.s[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z15.s[3]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fae01 // fmla za.s[x9, 1], { z16.s-z19.s }, z15.s[3]\n"
+ ".inst 0xa042c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fad02 // fmla za.s[x9, 2], { z8.s-z11.s }, z15.s[3]\n"
+ "addvl x26, x26, #16\n"
"bgt 23b\n"
"24:" // Width 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x22\n"
- ".inst 0xa040c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27]\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
"subs x22, x22, #0x1\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042c765 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a280 // fmla za.s[x9, 0], { z20.s-z23.s }, z3.s[0]\n"
- ".inst 0xc153a181 // fmla za.s[x9, 1], { z12.s-z15.s }, z3.s[0]\n"
- ".inst 0xc153a082 // fmla za.s[x9, 2], { z4.s-z7.s }, z3.s[0]\n"
+ ".inst 0xc15ba380 // fmla za.s[x9, 0], { z28.s-z31.s }, z11.s[0]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba001 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[0]\n"
+ ".inst 0xa042c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba282 // fmla za.s[x9, 2], { z20.s-z23.s }, z11.s[0]\n"
+ "addvl x26, x26, #16\n"
"ble 25f\n"
- ".inst 0xa040c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa041c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a680 // fmla za.s[x9, 0], { z20.s-z23.s }, z3.s[1]\n"
- ".inst 0xc153a501 // fmla za.s[x9, 1], { z8.s-z11.s }, z3.s[1]\n"
- ".inst 0xc153a602 // fmla za.s[x9, 2], { z16.s-z19.s }, z3.s[1]\n"
+ ".inst 0xc15ba580 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[1]\n"
+ ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba481 // fmla za.s[x9, 1], { z4.s-z7.s }, z11.s[1]\n"
+ ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba782 // fmla za.s[x9, 2], { z28.s-z31.s }, z11.s[1]\n"
+ "addvl x26, x26, #16\n"
"ble 25f\n"
- ".inst 0xa040c77d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa041c779 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153ab80 // fmla za.s[x9, 0], { z28.s-z31.s }, z3.s[2]\n"
- ".inst 0xc153ab01 // fmla za.s[x9, 1], { z24.s-z27.s }, z3.s[2]\n"
- ".inst 0xc153a982 // fmla za.s[x9, 2], { z12.s-z15.s }, z3.s[2]\n"
+ ".inst 0xc15ba880 // fmla za.s[x9, 0], { z4.s-z7.s }, z11.s[2]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15baa81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[2]\n"
+ ".inst 0xa042c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba982 // fmla za.s[x9, 2], { z12.s-z15.s }, z11.s[2]\n"
+ "addvl x26, x26, #16\n"
"ble 25f\n"
- ".inst 0xa040c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27]\n"
- ".inst 0xa041c77d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153ad00 // fmla za.s[x9, 0], { z8.s-z11.s }, z3.s[3]\n"
- ".inst 0xc153af81 // fmla za.s[x9, 1], { z28.s-z31.s }, z3.s[3]\n"
- ".inst 0xc153ad82 // fmla za.s[x9, 2], { z12.s-z15.s }, z3.s[3]\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bad80 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[3]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bae81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[3]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[3]\n"
+ "addvl x26, x26, #16\n"
"25:" // Width 3: Multiply loop: multiply skip
"tbz %x[flags], #1, 26f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- "ld1rw { z21.s }, p1/Z, [x21]\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- "ld1rw { z20.s }, p1/Z, [x20]\n"
- ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+ "ld1rw { z17.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
+ ".inst 0xa061c728 // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"b 27f\n"
"26:" // Width 3: No activation
- ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+ ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc0062c5c // mova { z28.d-z31.d }, za.d[x9, #2]\n"
- ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062c33c // st1w { z28.s-z31.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"27:" // Width 3: Output done
"b 36f\n"
@@ -401,14 +401,14 @@ void sme2_gemv_fp32_mla_16VL (
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 29f\n"
".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
- ".inst 0xa041c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xa042c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
- "addvl x24, x24, #16\n"
- ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
- ".inst 0xc0042f82 // mova za.d[x9, #2], { z28.d-z31.d }\n"
+ ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042d82 // mova za.d[x9, #2], { z12.d-z15.d }\n"
+ ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x24, x24, #16\n"
"b 30f\n"
"29:" // Width 4: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -417,125 +417,125 @@ void sme2_gemv_fp32_mla_16VL (
"ble 32f\n"
"31:" // Width 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x22\n"
- ".inst 0xa040c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27]\n"
+ "ld1rqw { z8.s }, p0/Z, [x23]\n"
"sub x22, x22, #0x4\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a200 // fmla za.s[x9, 0], { z16.s-z19.s }, z8.s[0]\n"
"cmp x22, #0x4\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153a180 // fmla za.s[x9, 0], { z12.s-z15.s }, z3.s[0]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a281 // fmla za.s[x9, 1], { z20.s-z23.s }, z3.s[0]\n"
- ".inst 0xa040c779 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x27]\n"
- ".inst 0xc153a202 // fmla za.s[x9, 2], { z16.s-z19.s }, z3.s[0]\n"
- ".inst 0xa041c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153a103 // fmla za.s[x9, 3], { z8.s-z11.s }, z3.s[0]\n"
- ".inst 0xa042c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043c765 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153a700 // fmla za.s[x9, 0], { z24.s-z27.s }, z3.s[1]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a581 // fmla za.s[x9, 1], { z12.s-z15.s }, z3.s[1]\n"
- ".inst 0xa040c779 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x27]\n"
- ".inst 0xc153a502 // fmla za.s[x9, 2], { z8.s-z11.s }, z3.s[1]\n"
- ".inst 0xa041c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153a483 // fmla za.s[x9, 3], { z4.s-z7.s }, z3.s[1]\n"
- ".inst 0xa042c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043c765 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153ab00 // fmla za.s[x9, 0], { z24.s-z27.s }, z3.s[2]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a901 // fmla za.s[x9, 1], { z8.s-z11.s }, z3.s[2]\n"
- ".inst 0xa040c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27]\n"
- ".inst 0xc153aa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z3.s[2]\n"
- ".inst 0xa041c77d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc153a883 // fmla za.s[x9, 3], { z4.s-z7.s }, z3.s[2]\n"
- ".inst 0xa042c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153ad00 // fmla za.s[x9, 0], { z8.s-z11.s }, z3.s[3]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153af81 // fmla za.s[x9, 1], { z28.s-z31.s }, z3.s[3]\n"
- ".inst 0xc153ad82 // fmla za.s[x9, 2], { z12.s-z15.s }, z3.s[3]\n"
- ".inst 0xc153ae83 // fmla za.s[x9, 3], { z20.s-z23.s }, z3.s[3]\n"
+ ".inst 0xa041c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158a181 // fmla za.s[x9, 1], { z12.s-z15.s }, z8.s[0]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158a202 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[0]\n"
+ ".inst 0xa043c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158a183 // fmla za.s[x9, 3], { z12.s-z15.s }, z8.s[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a580 // fmla za.s[x9, 0], { z12.s-z15.s }, z8.s[1]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158a681 // fmla za.s[x9, 1], { z20.s-z23.s }, z8.s[1]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158a602 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[1]\n"
+ ".inst 0xa043c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158a683 // fmla za.s[x9, 3], { z20.s-z23.s }, z8.s[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a880 // fmla za.s[x9, 0], { z4.s-z7.s }, z8.s[2]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158aa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z8.s[2]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158aa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[2]\n"
+ ".inst 0xa043c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158a803 // fmla za.s[x9, 3], { z0.s-z3.s }, z8.s[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158ae80 // fmla za.s[x9, 0], { z20.s-z23.s }, z8.s[3]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158ae01 // fmla za.s[x9, 1], { z16.s-z19.s }, z8.s[3]\n"
+ ".inst 0xa042c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158ac82 // fmla za.s[x9, 2], { z4.s-z7.s }, z8.s[3]\n"
+ ".inst 0xa043c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158ae83 // fmla za.s[x9, 3], { z20.s-z23.s }, z8.s[3]\n"
+ "addvl x26, x26, #16\n"
"bgt 31b\n"
"32:" // Width 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x22\n"
- ".inst 0xa040c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27]\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
"subs x22, x22, #0x1\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042c77d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153a200 // fmla za.s[x9, 0], { z16.s-z19.s }, z3.s[0]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a181 // fmla za.s[x9, 1], { z12.s-z15.s }, z3.s[0]\n"
- ".inst 0xc153a382 // fmla za.s[x9, 2], { z28.s-z31.s }, z3.s[0]\n"
- ".inst 0xc153a283 // fmla za.s[x9, 3], { z20.s-z23.s }, z3.s[0]\n"
+ ".inst 0xc15ba200 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[0]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba281 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[0]\n"
+ ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba382 // fmla za.s[x9, 2], { z28.s-z31.s }, z11.s[0]\n"
+ ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15ba203 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[0]\n"
+ "addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa041c765 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042c779 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153a580 // fmla za.s[x9, 0], { z12.s-z15.s }, z3.s[1]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a481 // fmla za.s[x9, 1], { z4.s-z7.s }, z3.s[1]\n"
- ".inst 0xc153a702 // fmla za.s[x9, 2], { z24.s-z27.s }, z3.s[1]\n"
- ".inst 0xc153a683 // fmla za.s[x9, 3], { z20.s-z23.s }, z3.s[1]\n"
+ ".inst 0xc15ba400 // fmla za.s[x9, 0], { z0.s-z3.s }, z11.s[1]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba601 // fmla za.s[x9, 1], { z16.s-z19.s }, z11.s[1]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba602 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[1]\n"
+ ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15ba603 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[1]\n"
+ "addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa041c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153a980 // fmla za.s[x9, 0], { z12.s-z15.s }, z3.s[2]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153a901 // fmla za.s[x9, 1], { z8.s-z11.s }, z3.s[2]\n"
- ".inst 0xc153aa82 // fmla za.s[x9, 2], { z20.s-z23.s }, z3.s[2]\n"
- ".inst 0xc153aa03 // fmla za.s[x9, 3], { z16.s-z19.s }, z3.s[2]\n"
+ ".inst 0xc15baa00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[2]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15baa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z11.s[2]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15baa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[2]\n"
+ ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15baa03 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[2]\n"
+ "addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040c76d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x27]\n"
- ".inst 0xa041c769 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042c775 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043c771 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc153ad80 // fmla za.s[x9, 0], { z12.s-z15.s }, z3.s[3]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc153ad01 // fmla za.s[x9, 1], { z8.s-z11.s }, z3.s[3]\n"
- ".inst 0xc153ae82 // fmla za.s[x9, 2], { z20.s-z23.s }, z3.s[3]\n"
- ".inst 0xc153ae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z3.s[3]\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[3]\n"
+ ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15baf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z11.s[3]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[3]\n"
+ ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[3]\n"
+ "addvl x26, x26, #16\n"
"33:" // Width 4: Multiply loop: multiply skip
"tbz %x[flags], #1, 34f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
"ld1rw { z21.s }, p1/Z, [x21]\n"
- ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
"ld1rw { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n"
- ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
- ".inst 0xa061c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x8, MUL VL]\n"
".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"b 35f\n"
"34:" // Width 4: No activation
".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
- ".inst 0xa061c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
".inst 0xa063c324 // st1w { z4.s-z7.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"35:" // Width 4: Output done
- "subs x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
"sub %x[N], %x[N], x28, LSL #2\n"
"bgt 4b\n"
"36:" // Exit
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
index 7ed04da9df..9747587495 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
@@ -64,22 +64,22 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
__asm__ __volatile__(
"ptrue p8.b\n"
".inst 0xd503477f // SMSTART ZA\n"
- "mov x10, #0x0\n"
- "cntw x9, ALL, MUL #4\n"
- "mov x28, #0x4\n"
- "add x27, %x[N], x9\n"
+ "cntw x10, ALL, MUL #4\n"
+ "add x28, %x[N], x10\n"
+ "sub x28, x28, #0x1\n"
+ "udiv x28, x28, x10\n"
+ "add x22, x28, #0x3\n"
+ "and x22, x22, #0xfffffffffffffffc\n"
+ "mul x22, x22, x10\n"
+ "mul x22, x22, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x27, #0x4\n"
"mov x26, %x[B_ptr]\n"
- "sub x27, x27, #0x1\n"
"mov x25, %x[output_ptr]\n"
- "udiv x27, x27, x9\n"
"ptrue p2.b\n"
- "add x22, x27, #0x3\n"
".inst 0x25207811 // ptrue pn9.b\n"
- "and x22, x22, #0xfffffffffffffffc\n"
- "mov x21, #0x1\n"
- "mul x22, x22, x9\n"
- "mul x22, x22, %x[K]\n"
"lsl x22, x22, #0x1\n"
+ "mov x21, #0x1\n"
"1:" // RHS size check loop
"cmp x22, #0x200000\n"
"blt 2f\n"
@@ -97,9 +97,9 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
"3:" // RHS prefetch exit
"mov x24, %x[bias]\n"
"4:" // Column loop
- "cmp x27, #0x4\n"
+ "cmp x28, #0x4\n"
"bge 28f\n"
- "cmp x27, #0x2\n"
+ "cmp x28, #0x2\n"
"bgt 20f\n"
"beq 12f\n"
"mov x23, %x[A_ptr]\n"
@@ -110,7 +110,7 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 5f\n"
".inst 0xa040c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0044e00 // mova za.d[x10, #0], { z16.d-z19.d }\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
"b 6f\n"
"5:" // Width 1: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -119,72 +119,72 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
"ble 8f\n"
"7:" // Width 1: Multiply loop: Main loop head
"whilelt p1.s, XZR, x22\n"
- "whilelt p0.s, x28, x22\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- "ld1rqw { z8.s }, p1/Z, [x23]\n"
- "addvl x26, x26, #16\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z10.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa94a // bfcvt z10.h, p2/M, z10.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z10.h, z10.h, z10.h\n"
"sub x22, x22, #0x8\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- "cmp x22, #0x8\n"
- "add x23, x23, #0x20\n"
- ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
- "addvl x26, x26, #16\n"
- ".inst 0x658aa908 // bfcvt z8.h, p2/M, z8.s\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z10.d, z10.d, z16.d\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
"addvl x26, x26, #16\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ ".inst 0xc15ab198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[0]\n"
".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
"addvl x26, x26, #16\n"
- "uzp1 z8.h, z8.h, z8.h\n"
- "uzp1 z11.h, z11.h, z11.h\n"
- "trn1 z8.d, z8.d, z11.d\n"
- ".inst 0xc158d098 // bfdot za.s[x10, 0], { z4.h-z7.h }, z8.h[0]\n"
- ".inst 0xc158d698 // bfdot za.s[x10, 0], { z20.h-z23.h }, z8.h[1]\n"
- ".inst 0xc158da18 // bfdot za.s[x10, 0], { z16.h-z19.h }, z8.h[2]\n"
- ".inst 0xc158dd98 // bfdot za.s[x10, 0], { z12.h-z15.h }, z8.h[3]\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xc15ab598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[1]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #16\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xc15ab818 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[2]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15abf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z10.h[3]\n"
"bgt 7b\n"
"8:" // Width 1: Multiply loop: Single iteration only
"whilelt p1.s, XZR, x22\n"
- "whilelt p0.s, x28, x22\n"
- ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- "ld1rqw { z3.s }, p1/Z, [x23]\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z17.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa31 // bfcvt z17.h, p2/M, z17.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
"subs x22, x22, #0x2\n"
- "addvl x26, x26, #16\n"
- "ld1rqw { z24.s }, p0/Z, [x23, #16]\n"
+ "uzp1 z17.h, z17.h, z17.h\n"
+ "trn1 z15.d, z15.d, z17.d\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x20\n"
- ".inst 0x658aa863 // bfcvt z3.h, p2/M, z3.s\n"
- ".inst 0x658aab18 // bfcvt z24.h, p2/M, z24.s\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- "uzp1 z24.h, z24.h, z24.h\n"
- "trn1 z3.d, z3.d, z24.d\n"
- ".inst 0xc153d198 // bfdot za.s[x10, 0], { z12.h-z15.h }, z3.h[0]\n"
+ ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
+ "addvl x26, x26, #16\n"
"ble 9f\n"
- ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
+ ".inst 0xc15fb418 // bfdot za.s[x9, 0], { z0.h-z3.h }, z15.h[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153d598 // bfdot za.s[x10, 0], { z12.h-z15.h }, z3.h[1]\n"
"ble 9f\n"
- ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
+ ".inst 0xc15fb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z15.h[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153d998 // bfdot za.s[x10, 0], { z12.h-z15.h }, z3.h[2]\n"
"ble 9f\n"
- ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbd18 // bfdot za.s[x9, 0], { z8.h-z11.h }, z15.h[3]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153df98 // bfdot za.s[x10, 0], { z28.h-z31.h }, z3.h[3]\n"
"9:" // Width 1: Multiply loop: multiply skip
"tbz %x[flags], #1, 10f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0064c1c // mova { z28.d-z31.d }, za.d[x10, #0]\n"
- "ld1rw { z4.s }, p2/Z, [x21]\n"
- "ld1rw { z18.s }, p2/Z, [x20]\n"
- ".inst 0xc1b2c89c // fclamp { z28.s-z31.s }, z4.s, z18.s\n"
- ".inst 0xa060c33c // st1w { z28.s-z31.s }, p8, [x25]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ "ld1rw { z8.s }, p2/Z, [x21]\n"
+ "ld1rw { z26.s }, p2/Z, [x20]\n"
+ ".inst 0xc1bac900 // fclamp { z0.s-z3.s }, z8.s, z26.s\n"
+ ".inst 0xa060c320 // st1w { z0.s-z3.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"b 11f\n"
"10:" // Width 1: No activation
- ".inst 0xc0064c04 // mova { z4.d-z7.d }, za.d[x10, #0]\n"
+ ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
".inst 0xa060c324 // st1w { z4.s-z7.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"11:" // Width 1: Output done
@@ -192,15 +192,15 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
"12:" // Width 2
"mov x23, %x[A_ptr]\n"
"lsl x21, %x[K], #0x2\n"
- "sub x20, %x[N], x9\n"
+ "sub x20, %x[N], x10\n"
"mov x22, %x[K]\n"
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 13f\n"
- ".inst 0xa040c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24]\n"
- ".inst 0xa041c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0044e00 // mova za.d[x10, #0], { z16.d-z19.d }\n"
- ".inst 0xc0044c01 // mova za.d[x10, #1], { z0.d-z3.d }\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
+ ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
"b 14f\n"
"13:" // Width 2: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -209,94 +209,94 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
"ble 16f\n"
"15:" // Width 2: Multiply loop: Main loop head
"whilelt p1.s, XZR, x22\n"
- "whilelt p0.s, x28, x22\n"
- ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
- "ld1rqw { z8.s }, p1/Z, [x23]\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z13.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ad // bfcvt z13.h, p2/M, z13.s\n"
+ "ld1rqw { z27.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aab7b // bfcvt z27.h, p2/M, z27.s\n"
+ "uzp1 z13.h, z13.h, z13.h\n"
"sub x22, x22, #0x8\n"
- "ld1rqw { z9.s }, p0/Z, [x23, #16]\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
+ "trn1 z13.d, z13.d, z27.d\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
"cmp x22, #0x8\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15db298 // bfdot za.s[x9, 0], { z20.h-z23.h }, z13.h[0]\n"
+ "addvl x26, x26, #16\n"
"add x23, x23, #0x20\n"
+ ".inst 0xc15db019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z13.h[0]\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15db698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z13.h[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0x658aa908 // bfcvt z8.h, p2/M, z8.s\n"
- ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
- ".inst 0x658aa929 // bfcvt z9.h, p2/M, z9.s\n"
+ ".inst 0xc15db719 // bfdot za.s[x9, 1], { z24.h-z27.h }, z13.h[1]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15db918 // bfdot za.s[x9, 0], { z8.h-z11.h }, z13.h[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- "addvl x26, x26, #16\n"
- "uzp1 z8.h, z8.h, z8.h\n"
- ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
- "uzp1 z9.h, z9.h, z9.h\n"
- "trn1 z8.d, z8.d, z9.d\n"
- ".inst 0xc158d298 // bfdot za.s[x10, 0], { z20.h-z23.h }, z8.h[0]\n"
- ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15dba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z13.h[2]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15dbc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z13.h[3]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc158d319 // bfdot za.s[x10, 1], { z24.h-z27.h }, z8.h[0]\n"
- ".inst 0xc158d418 // bfdot za.s[x10, 0], { z0.h-z3.h }, z8.h[1]\n"
- ".inst 0xc158d619 // bfdot za.s[x10, 1], { z16.h-z19.h }, z8.h[1]\n"
- ".inst 0xc158d898 // bfdot za.s[x10, 0], { z4.h-z7.h }, z8.h[2]\n"
- ".inst 0xc158d999 // bfdot za.s[x10, 1], { z12.h-z15.h }, z8.h[2]\n"
- ".inst 0xc158df98 // bfdot za.s[x10, 0], { z28.h-z31.h }, z8.h[3]\n"
- ".inst 0xc158de99 // bfdot za.s[x10, 1], { z20.h-z23.h }, z8.h[3]\n"
+ ".inst 0xc15dbc99 // bfdot za.s[x9, 1], { z4.h-z7.h }, z13.h[3]\n"
"bgt 15b\n"
"16:" // Width 2: Multiply loop: Single iteration only
"whilelt p1.s, XZR, x22\n"
- "whilelt p0.s, x28, x22\n"
- ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
- "ld1rqw { z3.s }, p1/Z, [x23]\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aa8a5 // bfcvt z5.h, p2/M, z5.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
"subs x22, x22, #0x2\n"
- "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "trn1 z15.d, z15.d, z5.d\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x20\n"
- ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0x658aa863 // bfcvt z3.h, p2/M, z3.s\n"
- ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- "uzp1 z16.h, z16.h, z16.h\n"
- "trn1 z3.d, z3.d, z16.d\n"
- ".inst 0xc153d318 // bfdot za.s[x10, 0], { z24.h-z27.h }, z3.h[0]\n"
- ".inst 0xc153d099 // bfdot za.s[x10, 1], { z4.h-z7.h }, z3.h[0]\n"
+ ".inst 0xc15fb319 // bfdot za.s[x9, 1], { z24.h-z27.h }, z15.h[0]\n"
"ble 17f\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb798 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[1]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153d498 // bfdot za.s[x10, 0], { z4.h-z7.h }, z3.h[1]\n"
- ".inst 0xc153d719 // bfdot za.s[x10, 1], { z24.h-z27.h }, z3.h[1]\n"
"ble 17f\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153d898 // bfdot za.s[x10, 0], { z4.h-z7.h }, z3.h[2]\n"
- ".inst 0xc153d999 // bfdot za.s[x10, 1], { z12.h-z15.h }, z3.h[2]\n"
"ble 17f\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[3]\n"
+ ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fbd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[3]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153dc98 // bfdot za.s[x10, 0], { z4.h-z7.h }, z3.h[3]\n"
- ".inst 0xc153de99 // bfdot za.s[x10, 1], { z20.h-z23.h }, z3.h[3]\n"
"17:" // Width 2: Multiply loop: multiply skip
"tbz %x[flags], #1, 18f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0064c08 // mova { z8.d-z11.d }, za.d[x10, #0]\n"
- ".inst 0xc0064c2c // mova { z12.d-z15.d }, za.d[x10, #1]\n"
- "ld1rw { z3.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+ "ld1rw { z11.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
"ld1rw { z28.s }, p2/Z, [x20]\n"
- ".inst 0xc1bcc868 // fclamp { z8.s-z11.s }, z3.s, z28.s\n"
- ".inst 0xc1bcc86c // fclamp { z12.s-z15.s }, z3.s, z28.s\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
+ ".inst 0xc1bcc974 // fclamp { z20.s-z23.s }, z11.s, z28.s\n"
+ ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
+ ".inst 0xc1bcc96c // fclamp { z12.s-z15.s }, z11.s, z28.s\n"
".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"b 19f\n"
"18:" // Width 2: No activation
- ".inst 0xc0064c04 // mova { z4.d-z7.d }, za.d[x10, #0]\n"
- ".inst 0xc0064c38 // mova { z24.d-z27.d }, za.d[x10, #1]\n"
- ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
- ".inst 0xa061c338 // st1w { z24.s-z27.s }, p8, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c320 // st1w { z0.s-z3.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"19:" // Width 2: Output done
"b 36f\n"
@@ -304,17 +304,17 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
"mov x20, #0x2\n"
"mov x23, %x[A_ptr]\n"
"lsl x21, %x[K], #0x2\n"
- "msub x20, x9, x20, %x[N]\n"
+ "msub x20, x10, x20, %x[N]\n"
"mov x22, %x[K]\n"
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 21f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xa041c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xa042c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0044c00 // mova za.d[x10, #0], { z0.d-z3.d }\n"
- ".inst 0xc0044f01 // mova za.d[x10, #1], { z24.d-z27.d }\n"
- ".inst 0xc0044f82 // mova za.d[x10, #2], { z28.d-z31.d }\n"
+ ".inst 0xa040c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f80 // mova za.d[x9, #0], { z28.d-z31.d }\n"
+ ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
"b 22f\n"
"21:" // Width 3: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -323,114 +323,114 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
"ble 24f\n"
"23:" // Width 3: Multiply loop: Main loop head
"whilelt p1.s, XZR, x22\n"
- "whilelt p0.s, x28, x22\n"
- ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- "ld1rqw { z3.s }, p1/Z, [x23]\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z14.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ce // bfcvt z14.h, p2/M, z14.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z14.h, z14.h, z14.h\n"
"sub x22, x22, #0x8\n"
- "ld1rqw { z0.s }, p0/Z, [x23, #16]\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z14.d, z14.d, z16.d\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
"cmp x22, #0x8\n"
- "add x23, x23, #0x20\n"
".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0x658aa863 // bfcvt z3.h, p2/M, z3.s\n"
- ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15eb098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z14.h[0]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15eb319 // bfdot za.s[x9, 1], { z24.h-z27.h }, z14.h[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15eb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z14.h[0]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15eb518 // bfdot za.s[x9, 0], { z8.h-z11.h }, z14.h[1]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15eb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z14.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15eb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z14.h[1]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15eb818 // bfdot za.s[x9, 0], { z0.h-z3.h }, z14.h[2]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ebb99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z14.h[2]\n"
"addvl x26, x26, #16\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "trn1 z3.d, z3.d, z0.d\n"
- ".inst 0xc153d198 // bfdot za.s[x10, 0], { z12.h-z15.h }, z3.h[0]\n"
- ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153d319 // bfdot za.s[x10, 1], { z24.h-z27.h }, z3.h[0]\n"
- ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15eb81a // bfdot za.s[x9, 2], { z0.h-z3.h }, z14.h[2]\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ebf18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z14.h[3]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ebf99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z14.h[3]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153d09a // bfdot za.s[x10, 2], { z4.h-z7.h }, z3.h[0]\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc153d618 // bfdot za.s[x10, 0], { z16.h-z19.h }, z3.h[1]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153d799 // bfdot za.s[x10, 1], { z28.h-z31.h }, z3.h[1]\n"
- ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc153d51a // bfdot za.s[x10, 2], { z8.h-z11.h }, z3.h[1]\n"
- ".inst 0xc153da98 // bfdot za.s[x10, 0], { z20.h-z23.h }, z3.h[2]\n"
- ".inst 0xc153d999 // bfdot za.s[x10, 1], { z12.h-z15.h }, z3.h[2]\n"
- ".inst 0xc153db1a // bfdot za.s[x10, 2], { z24.h-z27.h }, z3.h[2]\n"
- ".inst 0xc153dc98 // bfdot za.s[x10, 0], { z4.h-z7.h }, z3.h[3]\n"
- ".inst 0xc153de19 // bfdot za.s[x10, 1], { z16.h-z19.h }, z3.h[3]\n"
- ".inst 0xc153df9a // bfdot za.s[x10, 2], { z28.h-z31.h }, z3.h[3]\n"
+ ".inst 0xc15ebe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z14.h[3]\n"
"bgt 23b\n"
"24:" // Width 3: Multiply loop: Single iteration only
"whilelt p1.s, XZR, x22\n"
- "whilelt p0.s, x28, x22\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- "ld1rqw { z3.s }, p1/Z, [x23]\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z31.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
"subs x22, x22, #0x2\n"
- "ld1rqw { z20.s }, p0/Z, [x23, #16]\n"
+ "uzp1 z31.h, z31.h, z31.h\n"
+ "trn1 z15.d, z15.d, z31.d\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x20\n"
- ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0x658aa863 // bfcvt z3.h, p2/M, z3.s\n"
- ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
+ ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z15.h[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0x658aaa94 // bfcvt z20.h, p2/M, z20.s\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- "uzp1 z20.h, z20.h, z20.h\n"
- "trn1 z3.d, z3.d, z20.d\n"
- ".inst 0xc153d098 // bfdot za.s[x10, 0], { z4.h-z7.h }, z3.h[0]\n"
- ".inst 0xc153d199 // bfdot za.s[x10, 1], { z12.h-z15.h }, z3.h[0]\n"
- ".inst 0xc153d31a // bfdot za.s[x10, 2], { z24.h-z27.h }, z3.h[0]\n"
+ ".inst 0xc15fb09a // bfdot za.s[x9, 2], { z4.h-z7.h }, z15.h[0]\n"
"ble 25f\n"
".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153d698 // bfdot za.s[x10, 0], { z20.h-z23.h }, z3.h[1]\n"
- ".inst 0xc153d619 // bfdot za.s[x10, 1], { z16.h-z19.h }, z3.h[1]\n"
- ".inst 0xc153d79a // bfdot za.s[x10, 2], { z28.h-z31.h }, z3.h[1]\n"
"ble 25f\n"
- ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z15.h[2]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb819 // bfdot za.s[x9, 1], { z0.h-z3.h }, z15.h[2]\n"
+ ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbb1a // bfdot za.s[x9, 2], { z24.h-z27.h }, z15.h[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153db98 // bfdot za.s[x10, 0], { z28.h-z31.h }, z3.h[2]\n"
- ".inst 0xc153d919 // bfdot za.s[x10, 1], { z8.h-z11.h }, z3.h[2]\n"
- ".inst 0xc153d99a // bfdot za.s[x10, 2], { z12.h-z15.h }, z3.h[2]\n"
"ble 25f\n"
- ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[3]\n"
+ ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fbd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[3]\n"
+ ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbc9a // bfdot za.s[x9, 2], { z4.h-z7.h }, z15.h[3]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153dd98 // bfdot za.s[x10, 0], { z12.h-z15.h }, z3.h[3]\n"
- ".inst 0xc153df19 // bfdot za.s[x10, 1], { z24.h-z27.h }, z3.h[3]\n"
- ".inst 0xc153dd1a // bfdot za.s[x10, 2], { z8.h-z11.h }, z3.h[3]\n"
"25:" // Width 3: Multiply loop: multiply skip
"tbz %x[flags], #1, 26f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0064c08 // mova { z8.d-z11.d }, za.d[x10, #0]\n"
- ".inst 0xc0064c20 // mova { z0.d-z3.d }, za.d[x10, #1]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
"ld1rw { z17.s }, p2/Z, [x21]\n"
- ".inst 0xc0064c44 // mova { z4.d-z7.d }, za.d[x10, #2]\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
- ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
- ".inst 0xc1b0ca20 // fclamp { z0.s-z3.s }, z17.s, z16.s\n"
+ ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c73c // st1w { z28.s-z31.s }, pn9.b, [x25]\n"
".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xa061c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062c324 // st1w { z4.s-z7.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xa061c724 // st1w { z4.s-z7.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"b 27f\n"
"26:" // Width 3: No activation
- ".inst 0xc0064c08 // mova { z8.d-z11.d }, za.d[x10, #0]\n"
- ".inst 0xc0064c20 // mova { z0.d-z3.d }, za.d[x10, #1]\n"
- ".inst 0xc0064c50 // mova { z16.d-z19.d }, za.d[x10, #2]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xa061c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"27:" // Width 3: Output done
@@ -439,20 +439,20 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
"mov x20, #0x3\n"
"mov x23, %x[A_ptr]\n"
"lsl x21, %x[K], #0x2\n"
- "msub x20, x9, x20, %x[N]\n"
+ "msub x20, x10, x20, %x[N]\n"
"mov x22, %x[K]\n"
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 29f\n"
".inst 0xa040c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24]\n"
- ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xa042c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
- ".inst 0xc0044d80 // mova za.d[x10, #0], { z12.d-z15.d }\n"
+ ".inst 0xc0042d80 // mova za.d[x9, #0], { z12.d-z15.d }\n"
+ ".inst 0xa041c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
+ ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
+ ".inst 0xa043c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
+ ".inst 0xc0042e83 // mova za.d[x9, #3], { z20.d-z23.d }\n"
"addvl x24, x24, #16\n"
- ".inst 0xc0044c81 // mova za.d[x10, #1], { z4.d-z7.d }\n"
- ".inst 0xc0044e82 // mova za.d[x10, #2], { z20.d-z23.d }\n"
- ".inst 0xc0044e03 // mova za.d[x10, #3], { z16.d-z19.d }\n"
"b 30f\n"
"29:" // Width 4: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -461,140 +461,140 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
"ble 32f\n"
"31:" // Width 4: Multiply loop: Main loop head
"whilelt p1.s, XZR, x22\n"
- "whilelt p0.s, x28, x22\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "ld1rqw { z7.s }, p1/Z, [x23]\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z6.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa8c6 // bfcvt z6.h, p2/M, z6.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
"sub x22, x22, #0x8\n"
- "ld1rqw { z4.s }, p0/Z, [x23, #16]\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z6.d, z6.d, z16.d\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
"cmp x22, #0x8\n"
- "add x23, x23, #0x20\n"
- ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0x658aa8e7 // bfcvt z7.h, p2/M, z7.s\n"
- ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0x658aa884 // bfcvt z4.h, p2/M, z4.s\n"
- ".inst 0xa043a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- "uzp1 z7.h, z7.h, z7.h\n"
- ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- "uzp1 z4.h, z4.h, z4.h\n"
- "trn1 z7.d, z7.d, z4.d\n"
- ".inst 0xc157d218 // bfdot za.s[x10, 0], { z16.h-z19.h }, z7.h[0]\n"
+ ".inst 0xc156b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z6.h[0]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z6.h[0]\n"
".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156b19a // bfdot za.s[x9, 2], { z12.h-z15.h }, z6.h[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc157d199 // bfdot za.s[x10, 1], { z12.h-z15.h }, z7.h[0]\n"
- ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc157d39a // bfdot za.s[x10, 2], { z28.h-z31.h }, z7.h[0]\n"
- ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc157d31b // bfdot za.s[x10, 3], { z24.h-z27.h }, z7.h[0]\n"
- ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc157d698 // bfdot za.s[x10, 0], { z20.h-z23.h }, z7.h[1]\n"
+ ".inst 0xc156b21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z6.h[0]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc156b518 // bfdot za.s[x9, 0], { z8.h-z11.h }, z6.h[1]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156b599 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[1]\n"
".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156b41a // bfdot za.s[x9, 2], { z0.h-z3.h }, z6.h[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc157d519 // bfdot za.s[x10, 1], { z8.h-z11.h }, z7.h[1]\n"
+ ".inst 0xc156b69b // bfdot za.s[x9, 3], { z20.h-z23.h }, z6.h[1]\n"
".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc157d41a // bfdot za.s[x10, 2], { z0.h-z3.h }, z7.h[1]\n"
- ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc157d61b // bfdot za.s[x10, 3], { z16.h-z19.h }, z7.h[1]\n"
- ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc157d998 // bfdot za.s[x10, 0], { z12.h-z15.h }, z7.h[2]\n"
- ".inst 0xa043a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc157db99 // bfdot za.s[x10, 1], { z28.h-z31.h }, z7.h[2]\n"
- ".inst 0xc157db1a // bfdot za.s[x10, 2], { z24.h-z27.h }, z7.h[2]\n"
- ".inst 0xc157da9b // bfdot za.s[x10, 3], { z20.h-z23.h }, z7.h[2]\n"
- ".inst 0xc157dd18 // bfdot za.s[x10, 0], { z8.h-z11.h }, z7.h[3]\n"
- ".inst 0xc157dc19 // bfdot za.s[x10, 1], { z0.h-z3.h }, z7.h[3]\n"
- ".inst 0xc157de1a // bfdot za.s[x10, 2], { z16.h-z19.h }, z7.h[3]\n"
- ".inst 0xc157dd9b // bfdot za.s[x10, 3], { z12.h-z15.h }, z7.h[3]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc156b918 // bfdot za.s[x9, 0], { z8.h-z11.h }, z6.h[2]\n"
+ ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156b999 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[2]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156b91a // bfdot za.s[x9, 2], { z8.h-z11.h }, z6.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc156ba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z6.h[2]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc156bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z6.h[3]\n"
+ ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156bd99 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[3]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156bf1a // bfdot za.s[x9, 2], { z24.h-z27.h }, z6.h[3]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc156be1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z6.h[3]\n"
"bgt 31b\n"
"32:" // Width 4: Multiply loop: Single iteration only
"whilelt p1.s, XZR, x22\n"
- "whilelt p0.s, x28, x22\n"
- ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
- "ld1rqw { z3.s }, p1/Z, [x23]\n"
- "subs x22, x22, #0x2\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
"ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
- "add x23, x23, #0x20\n"
- ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0x658aa863 // bfcvt z3.h, p2/M, z3.s\n"
- ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
- ".inst 0xa043a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- "addvl x26, x26, #16\n"
- "uzp1 z3.h, z3.h, z3.h\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
+ "subs x22, x22, #0x2\n"
"uzp1 z16.h, z16.h, z16.h\n"
- "trn1 z3.d, z3.d, z16.d\n"
- ".inst 0xc153d318 // bfdot za.s[x10, 0], { z24.h-z27.h }, z3.h[0]\n"
- ".inst 0xc153d399 // bfdot za.s[x10, 1], { z28.h-z31.h }, z3.h[0]\n"
- ".inst 0xc153d11a // bfdot za.s[x10, 2], { z8.h-z11.h }, z3.h[0]\n"
- ".inst 0xc153d19b // bfdot za.s[x10, 3], { z12.h-z15.h }, z3.h[0]\n"
+ "trn1 z15.d, z15.d, z16.d\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb318 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[0]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[0]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z15.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15fb21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z15.h[0]\n"
"ble 33f\n"
- ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xa043a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153d598 // bfdot za.s[x10, 0], { z12.h-z15.h }, z3.h[1]\n"
+ ".inst 0xc15fb718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[1]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[1]\n"
+ ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb69a // bfdot za.s[x9, 2], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa043a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fb41b // bfdot za.s[x9, 3], { z0.h-z3.h }, z15.h[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153d799 // bfdot za.s[x10, 1], { z28.h-z31.h }, z3.h[1]\n"
- ".inst 0xc153d51a // bfdot za.s[x10, 2], { z8.h-z11.h }, z3.h[1]\n"
- ".inst 0xc153d71b // bfdot za.s[x10, 3], { z24.h-z27.h }, z3.h[1]\n"
"ble 33f\n"
- ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"subs x22, x22, #0x2\n"
- ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153db18 // bfdot za.s[x10, 0], { z24.h-z27.h }, z3.h[2]\n"
+ ".inst 0xc15fba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z15.h[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153d999 // bfdot za.s[x10, 1], { z12.h-z15.h }, z3.h[2]\n"
- ".inst 0xc153db9a // bfdot za.s[x10, 2], { z28.h-z31.h }, z3.h[2]\n"
- ".inst 0xc153da1b // bfdot za.s[x10, 3], { z16.h-z19.h }, z3.h[2]\n"
"ble 33f\n"
- ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fbe19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[3]\n"
".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153dd18 // bfdot za.s[x10, 0], { z8.h-z11.h }, z3.h[3]\n"
+ ".inst 0xc15fbe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z15.h[3]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc153df19 // bfdot za.s[x10, 1], { z24.h-z27.h }, z3.h[3]\n"
- ".inst 0xc153de9a // bfdot za.s[x10, 2], { z20.h-z23.h }, z3.h[3]\n"
- ".inst 0xc153de1b // bfdot za.s[x10, 3], { z16.h-z19.h }, z3.h[3]\n"
"33:" // Width 4: Multiply loop: multiply skip
"tbz %x[flags], #1, 34f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0064c04 // mova { z4.d-z7.d }, za.d[x10, #0]\n"
- ".inst 0xc0064c28 // mova { z8.d-z11.d }, za.d[x10, #1]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
"ld1rw { z21.s }, p2/Z, [x21]\n"
- ".inst 0xc0064c4c // mova { z12.d-z15.d }, za.d[x10, #2]\n"
+ ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
- ".inst 0xc0064c70 // mova { z16.d-z19.d }, za.d[x10, #3]\n"
- ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
- ".inst 0xa061c728 // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x8, MUL VL]\n"
".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"b 35f\n"
"34:" // Width 4: No activation
- ".inst 0xc0064c00 // mova { z0.d-z3.d }, za.d[x10, #0]\n"
- ".inst 0xc0064c24 // mova { z4.d-z7.d }, za.d[x10, #1]\n"
- ".inst 0xc0064c4c // mova { z12.d-z15.d }, za.d[x10, #2]\n"
- ".inst 0xc0064c78 // mova { z24.d-z27.d }, za.d[x10, #3]\n"
- ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
- ".inst 0xa061c724 // st1w { z4.s-z7.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xa062c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"35:" // Width 4: Output done
- "subs x27, x27, #0x4\n"
- "sub %x[N], %x[N], x9, LSL #2\n"
+ "subs x28, x28, #0x4\n"
+ "sub %x[N], %x[N], x10, LSL #2\n"
"bgt 4b\n"
"36:" // Exit
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
index 1a7cc1e70e..a1c441555e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
@@ -52,20 +52,20 @@ void sme2_gemv_s8qa_dot_16VL (
__asm__ __volatile__(
"ptrue p8.b\n"
".inst 0xd503477f // SMSTART ZA\n"
- "mov x9, #0x0\n"
"cntw x28, ALL, MUL #4\n"
- "mov x27, %x[B_ptr]\n"
- "add x26, %x[N], x28\n"
+ "add x27, %x[N], x28\n"
+ "sub x27, x27, #0x1\n"
+ "udiv x27, x27, x28\n"
+ "add x22, x27, #0x3\n"
+ "and x22, x22, #0xfffffffffffffffc\n"
+ "mul x22, x22, x28\n"
+ "mov x9, #0x0\n"
+ "mov x26, %x[B_ptr]\n"
"mov x25, %x[output_ptr]\n"
- "sub x26, x26, #0x1\n"
"ptrue p2.b\n"
- "udiv x26, x26, x28\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "add x22, x26, #0x3\n"
- "mov x21, #0x1\n"
- "and x22, x22, #0xfffffffffffffffc\n"
- "mul x22, x22, x28\n"
"mul x22, x22, %x[K]\n"
+ "mov x21, #0x1\n"
"1:" // RHS size check loop
"cmp x22, #0x200000\n"
"blt 2f\n"
@@ -79,16 +79,16 @@ void sme2_gemv_s8qa_dot_16VL (
"lsl x21, x21, #0x16\n"
"orr x22, x22, x20\n"
"orr x22, x22, x21\n"
- ".inst 0xf8b64b7a // rprfm pldonce, x22, [x27]\n"
+ ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
"3:" // RHS prefetch exit
"mov x24, %x[col_bias]\n"
- "mov z11.s, #0x0\n"
- "mov z8.b, #0x1\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"4:" // Column loop
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"bge 34f\n"
- "cmp x26, #0x2\n"
+ "cmp x27, #0x2\n"
"bgt 24f\n"
"beq 14f\n"
"mov x23, %x[A_ptr]\n"
@@ -98,8 +98,8 @@ void sme2_gemv_s8qa_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 5f\n"
- ".inst 0xa040c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24]\n"
- ".inst 0xc0042d80 // mova za.d[x9, #0], { z12.d-z15.d }\n"
+ ".inst 0xa040c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
"b 6f\n"
"5:" // Width 1: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -108,82 +108,82 @@ void sme2_gemv_s8qa_dot_16VL (
"ble 9f\n"
"7:" // Width 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa0408375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b020 // sdot za.s[x9, 0], { z0.b-z3.b }, z9.b[0]\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b720 // sdot za.s[x9, 0], { z24.b-z27.b }, z9.b[1]\n"
- ".inst 0xc159baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z9.b[2]\n"
- ".inst 0xc159bfa0 // sdot za.s[x9, 0], { z28.b-z31.b }, z9.b[3]\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b2a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 8f\n"
- "sdot z11.s, z9.b, z8.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"8:" // Width 1: Multiply loop: unique 1: skip row sum
"sub x22, x22, #0x10\n"
"cmp x22, #0x10\n"
"bgt 7b\n"
"9:" // Width 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x22, x22, #0x4\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b320 // sdot za.s[x9, 0], { z24.b-z27.b }, z9.b[0]\n"
+ ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
"ble 10f\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b420 // sdot za.s[x9, 0], { z0.b-z3.b }, z9.b[1]\n"
+ ".inst 0xc151b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
"ble 10f\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bba0 // sdot za.s[x9, 0], { z28.b-z31.b }, z9.b[2]\n"
+ ".inst 0xc151b920 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
"ble 10f\n"
- ".inst 0xa0408371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159be20 // sdot za.s[x9, 0], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bd20 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"10:" // Width 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
- "sdot z11.s, z9.b, z8.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"11:" // Width 1: Multiply loop: unique 2: skip row sum
"tbnz %x[flags], #31, 12f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
+ "ld1rw { z26.s }, p2/Z, [x21]\n"
+ "neg z26.s, p2/M, z26.s\n"
"whilelt p0.s, XZR, x20\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z26.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "saddv d11, p0, z11.s\n"
- "mov z11.s, z11.s[0]\n"
- "neg z17.s, p2/M, z17.s\n"
- "mul z11.s, p2/M, z11.s, z17.s\n"
"12:" // Width 1: skip row sum fixup
- ".inst 0xc0904960 // addha za0.s, p2/M, p2/M, z11.s\n"
- "add x21, %x[qp], %[per_layer_mul]\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904961 // addha za1.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "add x22, %x[qp], %[c_offset]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904962 // addha za2.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904963 // addha za3.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z7.s }, p2/Z, [x22]\n"
- "ld1rw { z30.s }, p2/Z, [x21]\n"
- "ld1rw { z23.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
- ".inst 0xc1aaac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z10.s\n"
- ".inst 0xc1a6aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1a7ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
- ".inst 0xc1b7cfcc // sclamp { z12.s-z15.s }, z30.s, z23.s\n"
+ ".inst 0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ "ld1rw { z30.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a2ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1bece0c // sclamp { z12.s-z15.s }, z16.s, z30.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
"uzp1 z19.h, z14.h, z15.h\n"
"uzp1 z12.b, z12.b, z19.b\n"
@@ -199,10 +199,10 @@ void sme2_gemv_s8qa_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 15f\n"
- ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
- ".inst 0xa041c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
- ".inst 0xc0042c01 // mova za.d[x9, #1], { z0.d-z3.d }\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042f01 // mova za.d[x9, #1], { z24.d-z27.d }\n"
"b 16f\n"
"15:" // Width 2: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -211,111 +211,111 @@ void sme2_gemv_s8qa_dot_16VL (
"ble 19f\n"
"17:" // Width 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27]\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa0408371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b2a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z9.b[0]\n"
- ".inst 0xa0418375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b321 // sdot za.s[x9, 1], { z24.b-z27.b }, z9.b[0]\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z9.b[1]\n"
- ".inst 0xa0418371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b6a1 // sdot za.s[x9, 1], { z20.b-z23.b }, z9.b[1]\n"
- ".inst 0xc159bba0 // sdot za.s[x9, 0], { z28.b-z31.b }, z9.b[2]\n"
- ".inst 0xc159b821 // sdot za.s[x9, 1], { z0.b-z3.b }, z9.b[2]\n"
- ".inst 0xc159bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z9.b[3]\n"
- ".inst 0xc159be21 // sdot za.s[x9, 1], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b321 // sdot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b6a1 // sdot za.s[x9, 1], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bca0 // sdot za.s[x9, 0], { z4.b-z7.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd21 // sdot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 18f\n"
- "sdot z11.s, z9.b, z8.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"18:" // Width 2: Multiply loop: unique 3: skip row sum
"sub x22, x22, #0x10\n"
"cmp x22, #0x10\n"
"bgt 17b\n"
"19:" // Width 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x22, x22, #0x4\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b020 // sdot za.s[x9, 0], { z0.b-z3.b }, z9.b[0]\n"
- ".inst 0xc159b321 // sdot za.s[x9, 1], { z24.b-z27.b }, z9.b[0]\n"
+ ".inst 0xc151b320 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
"ble 20f\n"
- ".inst 0xa0408371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z9.b[1]\n"
- ".inst 0xc159b721 // sdot za.s[x9, 1], { z24.b-z27.b }, z9.b[1]\n"
+ ".inst 0xc151b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
"ble 20f\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bba0 // sdot za.s[x9, 0], { z28.b-z31.b }, z9.b[2]\n"
- ".inst 0xc159b821 // sdot za.s[x9, 1], { z0.b-z3.b }, z9.b[2]\n"
+ ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
"ble 20f\n"
- ".inst 0xa0408375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z9.b[3]\n"
- ".inst 0xc159bf21 // sdot za.s[x9, 1], { z24.b-z27.b }, z9.b[3]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf20 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd21 // sdot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"20:" // Width 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 21f\n"
- "sdot z11.s, z9.b, z8.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"21:" // Width 2: Multiply loop: unique 4: skip row sum
"tbnz %x[flags], #31, 22f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z1.s }, p2/Z, [x21]\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "saddv d11, p0, z11.s\n"
- "mov z11.s, z11.s[0]\n"
- "neg z1.s, p2/M, z1.s\n"
- "mul z11.s, p2/M, z11.s, z1.s\n"
"22:" // Width 2: skip row sum fixup
- ".inst 0xc0904960 // addha za0.s, p2/M, p2/M, z11.s\n"
- "add x21, %x[qp], %[per_layer_mul]\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z6.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904961 // addha za1.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z2.s }, p2/Z, [x21]\n"
- "add x22, %x[qp], %[c_offset]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904962 // addha za2.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z3.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z9.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904963 // addha za3.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z9.s }, p2/Z, [x22]\n"
- "ld1rw { z6.s }, p2/Z, [x21]\n"
- "ld1rw { z29.s }, p2/Z, [x20]\n"
- ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
- ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
- ".inst 0xc1a3aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n"
- ".inst 0xc1a3aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
- ".inst 0xc1a9ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z9.s\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc1bdccd4 // sclamp { z20.s-z23.s }, z6.s, z29.s\n"
- ".inst 0xc1bdcccc // sclamp { z12.s-z15.s }, z6.s, z29.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z16.h, z22.h, z23.h\n"
- "uzp1 z12.h, z12.h, z13.h\n"
- "uzp1 z24.h, z14.h, z15.h\n"
- "uzp1 z20.b, z20.b, z16.b\n"
- "uzp1 z12.b, z12.b, z24.b\n"
- "st1b { z20.b }, p2, [x25]\n"
- "st1b { z12.b }, p1, [x25, #1, MUL VL]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a5aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1a9ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ ".inst 0xc1a9ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ ".inst 0xc1b5ce18 // sclamp { z24.s-z27.s }, z16.s, z21.s\n"
+ ".inst 0xc1b5ce00 // sclamp { z0.s-z3.s }, z16.s, z21.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z9.h, z26.h, z27.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z26.h, z2.h, z3.h\n"
+ "uzp1 z24.b, z24.b, z9.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z0.b, z0.b, z26.b\n"
+ "st1b { z0.b }, p1, [x25, #1, MUL VL]\n"
"addvl x25, x25, #2\n"
"23:" // Width 2: Output done
"b 44f\n"
@@ -328,12 +328,12 @@ void sme2_gemv_s8qa_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 25f\n"
- ".inst 0xa040c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n"
- ".inst 0xa041c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xa042c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
- ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
- ".inst 0xc0042d82 // mova za.d[x9, #2], { z12.d-z15.d }\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
+ ".inst 0xa042c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042f02 // mova za.d[x9, #2], { z24.d-z27.d }\n"
"b 26f\n"
"25:" // Width 3: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -342,136 +342,136 @@ void sme2_gemv_s8qa_dot_16VL (
"ble 29f\n"
"27:" // Width 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b3a0 // sdot za.s[x9, 0], { z28.b-z31.b }, z9.b[0]\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z9.b[0]\n"
- ".inst 0xa041837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z9.b[0]\n"
- ".inst 0xa0428371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b720 // sdot za.s[x9, 0], { z24.b-z27.b }, z9.b[1]\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b7a1 // sdot za.s[x9, 1], { z28.b-z31.b }, z9.b[1]\n"
- ".inst 0xa042837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z9.b[1]\n"
- ".inst 0xa0408375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z9.b[2]\n"
- ".inst 0xa0428371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b821 // sdot za.s[x9, 1], { z0.b-z3.b }, z9.b[2]\n"
- ".inst 0xc159bba2 // sdot za.s[x9, 2], { z28.b-z31.b }, z9.b[2]\n"
- ".inst 0xc159bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z9.b[3]\n"
- ".inst 0xc159bf21 // sdot za.s[x9, 1], { z24.b-z27.b }, z9.b[3]\n"
- ".inst 0xc159be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b6a2 // sdot za.s[x9, 2], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b920 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
+ ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf20 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bca1 // sdot za.s[x9, 1], { z4.b-z7.b }, z1.b[3]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 28f\n"
- "sdot z11.s, z9.b, z8.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"28:" // Width 3: Multiply loop: unique 5: skip row sum
"sub x22, x22, #0x10\n"
"cmp x22, #0x10\n"
"bgt 27b\n"
"29:" // Width 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa040836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x22, x22, #0x4\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z9.b[0]\n"
- ".inst 0xc159b021 // sdot za.s[x9, 1], { z0.b-z3.b }, z9.b[0]\n"
- ".inst 0xc159b3a2 // sdot za.s[x9, 2], { z28.b-z31.b }, z9.b[0]\n"
+ ".inst 0xc151b2a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b222 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa0408365 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0428371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b4a0 // sdot za.s[x9, 0], { z4.b-z7.b }, z9.b[1]\n"
- ".inst 0xc159b721 // sdot za.s[x9, 1], { z24.b-z27.b }, z9.b[1]\n"
- ".inst 0xc159b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z9.b[1]\n"
+ ".inst 0xc151b720 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0428379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bba0 // sdot za.s[x9, 0], { z28.b-z31.b }, z9.b[2]\n"
- ".inst 0xc159b821 // sdot za.s[x9, 1], { z0.b-z3.b }, z9.b[2]\n"
- ".inst 0xc159bb22 // sdot za.s[x9, 2], { z24.b-z27.b }, z9.b[2]\n"
+ ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151baa2 // sdot za.s[x9, 2], { z20.b-z23.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa0418375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0428371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bc20 // sdot za.s[x9, 0], { z0.b-z3.b }, z9.b[3]\n"
- ".inst 0xc159bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z9.b[3]\n"
- ".inst 0xc159be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bda2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"30:" // Width 3: Multiply loop: multiply skip
"tbnz %x[flags], #31, 31f\n"
- "sdot z11.s, z9.b, z8.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"31:" // Width 3: Multiply loop: unique 6: skip row sum
"tbnz %x[flags], #31, 32f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
"ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "saddv d11, p0, z11.s\n"
- "mov z11.s, z11.s[0]\n"
- "neg z16.s, p2/M, z16.s\n"
- "mul z11.s, p2/M, z11.s, z16.s\n"
"32:" // Width 3: skip row sum fixup
- ".inst 0xc0904960 // addha za0.s, p2/M, p2/M, z11.s\n"
- "add x21, %x[qp], %[per_layer_mul]\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904961 // addha za1.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z2.s }, p2/Z, [x21]\n"
- "add x22, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904962 // addha za2.s, p2/M, p2/M, z11.s\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
"ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "add x21, %x[qp], %[minval]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904963 // addha za3.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z0.s }, p2/Z, [x22]\n"
- "ld1rw { z21.s }, p2/Z, [x21]\n"
- "ld1rw { z20.s }, p2/Z, [x20]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
- ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
- ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
- ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
- ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
- ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
- ".inst 0xc1b4ceac // sclamp { z12.s-z15.s }, z21.s, z20.s\n"
- ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z21.h, z30.h, z31.h\n"
+ ".inst 0xc1a3ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1a3ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1a3ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc1a0ce08 // sclamp { z8.s-z11.s }, z16.s, z0.s\n"
+ ".inst 0xc1a0ce04 // sclamp { z4.s-z7.s }, z16.s, z0.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0xc1a0ce0c // sclamp { z12.s-z15.s }, z16.s, z0.s\n"
+ "uzp1 z18.h, z10.h, z11.h\n"
+ "uzp1 z4.h, z4.h, z5.h\n"
+ "uzp1 z17.h, z6.h, z7.h\n"
"uzp1 z12.h, z12.h, z13.h\n"
- "uzp1 z20.h, z14.h, z15.h\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "uzp1 z28.b, z28.b, z21.b\n"
- "uzp1 z12.b, z12.b, z20.b\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z28.b }, p2, [x25]\n"
- "st1b { z12.b }, p2, [x25, #1, MUL VL]\n"
- "st1b { z16.b }, p1, [x25, #2, MUL VL]\n"
+ "uzp1 z16.h, z14.h, z15.h\n"
+ "uzp1 z8.b, z8.b, z18.b\n"
+ "st1b { z8.b }, p2, [x25]\n"
+ "uzp1 z4.b, z4.b, z17.b\n"
+ "st1b { z4.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z12.b, z12.b, z16.b\n"
+ "st1b { z12.b }, p1, [x25, #2, MUL VL]\n"
"addvl x25, x25, #3\n"
"33:" // Width 3: Output done
"b 44f\n"
@@ -484,15 +484,15 @@ void sme2_gemv_s8qa_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 35f\n"
- ".inst 0xa040c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n"
- ".inst 0xa041c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xa040c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
+ ".inst 0xa041c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
".inst 0xa042c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xa043c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- "addvl x24, x24, #16\n"
- ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
- ".inst 0xc0042e83 // mova za.d[x9, #3], { z20.d-z23.d }\n"
+ ".inst 0xa043c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x24, x24, #16\n"
"b 36f\n"
"35:" // Width 4: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -501,164 +501,164 @@ void sme2_gemv_s8qa_dot_16VL (
"ble 39f\n"
"37:" // Width 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa0438379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159b020 // sdot za.s[x9, 0], { z0.b-z3.b }, z9.b[0]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b2a1 // sdot za.s[x9, 1], { z20.b-z23.b }, z9.b[0]\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b3a2 // sdot za.s[x9, 2], { z28.b-z31.b }, z9.b[0]\n"
- ".inst 0xa041837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b323 // sdot za.s[x9, 3], { z24.b-z27.b }, z9.b[0]\n"
- ".inst 0xa0428371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159b420 // sdot za.s[x9, 0], { z0.b-z3.b }, z9.b[1]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b7a1 // sdot za.s[x9, 1], { z28.b-z31.b }, z9.b[1]\n"
- ".inst 0xa0408375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z9.b[1]\n"
- ".inst 0xa041837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b5a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z9.b[1]\n"
- ".inst 0xa0428361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z9.b[2]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bba1 // sdot za.s[x9, 1], { z28.b-z31.b }, z9.b[2]\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b822 // sdot za.s[x9, 2], { z0.b-z3.b }, z9.b[2]\n"
- ".inst 0xa0418375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b9a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z9.b[2]\n"
- ".inst 0xa0428361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa0438371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159bf20 // sdot za.s[x9, 0], { z24.b-z27.b }, z9.b[3]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z9.b[3]\n"
- ".inst 0xc159bc22 // sdot za.s[x9, 2], { z0.b-z3.b }, z9.b[3]\n"
- ".inst 0xc159be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b623 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151ba23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bda1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0428359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bf22 // sdot za.s[x9, 2], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0438345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151bca3 // sdot za.s[x9, 3], { z4.b-z7.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 38f\n"
- "sdot z11.s, z9.b, z8.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"38:" // Width 4: Multiply loop: unique 7: skip row sum
"sub x22, x22, #0x10\n"
"cmp x22, #0x10\n"
"bgt 37b\n"
"39:" // Width 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x22, x22, #0x4\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa0438371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159b020 // sdot za.s[x9, 0], { z0.b-z3.b }, z9.b[0]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b2a1 // sdot za.s[x9, 1], { z20.b-z23.b }, z9.b[0]\n"
- ".inst 0xc159b3a2 // sdot za.s[x9, 2], { z28.b-z31.b }, z9.b[0]\n"
- ".inst 0xc159b223 // sdot za.s[x9, 3], { z16.b-z19.b }, z9.b[0]\n"
+ ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b321 // sdot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0428349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b122 // sdot za.s[x9, 2], { z8.b-z11.b }, z1.b[0]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b223 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa0438371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159b420 // sdot za.s[x9, 0], { z0.b-z3.b }, z9.b[1]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b721 // sdot za.s[x9, 1], { z24.b-z27.b }, z9.b[1]\n"
- ".inst 0xc159b7a2 // sdot za.s[x9, 2], { z28.b-z31.b }, z9.b[1]\n"
- ".inst 0xc159b623 // sdot za.s[x9, 3], { z16.b-z19.b }, z9.b[1]\n"
+ ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b621 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b5a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0438355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b6a3 // sdot za.s[x9, 3], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0428375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159bb20 // sdot za.s[x9, 0], { z24.b-z27.b }, z9.b[2]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b821 // sdot za.s[x9, 1], { z0.b-z3.b }, z9.b[2]\n"
- ".inst 0xc159baa2 // sdot za.s[x9, 2], { z20.b-z23.b }, z9.b[2]\n"
- ".inst 0xc159b9a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z9.b[2]\n"
+ ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151ba23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa041837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0428375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa0438371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159bf20 // sdot za.s[x9, 0], { z24.b-z27.b }, z9.b[3]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bfa1 // sdot za.s[x9, 1], { z28.b-z31.b }, z9.b[3]\n"
- ".inst 0xc159bea2 // sdot za.s[x9, 2], { z20.b-z23.b }, z9.b[3]\n"
- ".inst 0xc159be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151be20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"40:" // Width 4: Multiply loop: multiply skip
"tbnz %x[flags], #31, 41f\n"
- "sdot z11.s, z9.b, z8.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"41:" // Width 4: Multiply loop: unique 8: skip row sum
"tbnz %x[flags], #31, 42f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
"ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "saddv d11, p0, z11.s\n"
- "mov z11.s, z11.s[0]\n"
- "neg z16.s, p2/M, z16.s\n"
- "mul z11.s, p2/M, z11.s, z16.s\n"
"42:" // Width 4: skip row sum fixup
- ".inst 0xc0904960 // addha za0.s, p2/M, p2/M, z11.s\n"
- "add x21, %x[qp], %[per_layer_mul]\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z11.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904961 // addha za1.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "add x22, %x[qp], %[c_offset]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904962 // addha za2.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z12.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z6.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904963 // addha za3.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z13.s }, p2/Z, [x22]\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
- "ld1rw { z18.s }, p2/Z, [x20]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
- ".inst 0xc0062c58 // mova { z24.d-z27.d }, za.d[x9, #2]\n"
- ".inst 0xc0062c74 // mova { z20.d-z23.d }, za.d[x9, #3]\n"
- ".inst 0xc1aaac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
- ".inst 0xc1aaac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z10.s\n"
- ".inst 0xc1aaac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
- ".inst 0xc1aaac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
- ".inst 0xc1acaa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z12.s\n"
- ".inst 0xc1acaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z12.s\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc1acaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
- ".inst 0xc1adab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z13.s\n"
- ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
- ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
- ".inst 0xc1adab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z13.s\n"
- ".inst 0xc1b2ce3c // sclamp { z28.s-z31.s }, z17.s, z18.s\n"
- ".inst 0xc1b2ce20 // sclamp { z0.s-z3.s }, z17.s, z18.s\n"
- ".inst 0xc1b2ce38 // sclamp { z24.s-z27.s }, z17.s, z18.s\n"
- ".inst 0xc1b2ce34 // sclamp { z20.s-z23.s }, z17.s, z18.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z14.h, z30.h, z31.h\n"
- "uzp1 z0.h, z0.h, z1.h\n"
- "uzp1 z18.h, z2.h, z3.h\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1abac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xc1abac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+ ".inst 0xc1abac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc0062c6c // mova { z12.d-z15.d }, za.d[x9, #3]\n"
+ ".inst 0xc1abac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1rw { z31.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1bfcc78 // sclamp { z24.s-z27.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc70 // sclamp { z16.s-z19.s }, z3.s, z31.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z17.h, z26.h, z27.h\n"
+ ".inst 0xc1bfcc74 // sclamp { z20.s-z23.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc6c // sclamp { z12.s-z15.s }, z3.s, z31.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "uzp1 z18.h, z18.h, z19.h\n"
"uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z16.h, z22.h, z23.h\n"
- "uzp1 z28.b, z28.b, z14.b\n"
- "uzp1 z0.b, z0.b, z18.b\n"
- "uzp1 z24.b, z24.b, z17.b\n"
- "uzp1 z20.b, z20.b, z16.b\n"
- "st1b { z28.b }, p2, [x25]\n"
- "st1b { z0.b }, p2, [x25, #1, MUL VL]\n"
- "st1b { z24.b }, p2, [x25, #2, MUL VL]\n"
- "st1b { z20.b }, p1, [x25, #3, MUL VL]\n"
+ "uzp1 z17.h, z22.h, z23.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z30.h, z14.h, z15.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z16.b, z16.b, z18.b\n"
+ "st1b { z16.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z20.b, z20.b, z17.b\n"
+ "uzp1 z12.b, z12.b, z30.b\n"
+ "st1b { z20.b }, p2, [x25, #2, MUL VL]\n"
+ "st1b { z12.b }, p1, [x25, #3, MUL VL]\n"
"addvl x25, x25, #4\n"
"43:" // Width 4: Output done
- "subs x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
"sub %x[N], %x[N], x28, LSL #2\n"
"bgt 4b\n"
"44:" // Exit
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
index 1cbaf00052..9bf699462a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
@@ -52,20 +52,20 @@ void sme2_gemv_u8qa_dot_16VL (
__asm__ __volatile__(
"ptrue p8.b\n"
".inst 0xd503477f // SMSTART ZA\n"
- "mov x9, #0x0\n"
"cntw x28, ALL, MUL #4\n"
- "mov x27, %x[B_ptr]\n"
- "add x26, %x[N], x28\n"
+ "add x27, %x[N], x28\n"
+ "sub x27, x27, #0x1\n"
+ "udiv x27, x27, x28\n"
+ "add x22, x27, #0x3\n"
+ "and x22, x22, #0xfffffffffffffffc\n"
+ "mul x22, x22, x28\n"
+ "mov x9, #0x0\n"
+ "mov x26, %x[B_ptr]\n"
"mov x25, %x[output_ptr]\n"
- "sub x26, x26, #0x1\n"
"ptrue p2.b\n"
- "udiv x26, x26, x28\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "add x22, x26, #0x3\n"
- "mov x21, #0x1\n"
- "and x22, x22, #0xfffffffffffffffc\n"
- "mul x22, x22, x28\n"
"mul x22, x22, %x[K]\n"
+ "mov x21, #0x1\n"
"1:" // RHS size check loop
"cmp x22, #0x200000\n"
"blt 2f\n"
@@ -79,16 +79,16 @@ void sme2_gemv_u8qa_dot_16VL (
"lsl x21, x21, #0x16\n"
"orr x22, x22, x20\n"
"orr x22, x22, x21\n"
- ".inst 0xf8b64b7a // rprfm pldonce, x22, [x27]\n"
+ ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
"3:" // RHS prefetch exit
"mov x24, %x[col_bias]\n"
- "mov z11.s, #0x0\n"
- "mov z8.b, #0x1\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"4:" // Column loop
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"bge 34f\n"
- "cmp x26, #0x2\n"
+ "cmp x27, #0x2\n"
"bgt 24f\n"
"beq 14f\n"
"mov x23, %x[A_ptr]\n"
@@ -98,8 +98,8 @@ void sme2_gemv_u8qa_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 5f\n"
- ".inst 0xa040c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24]\n"
- ".inst 0xc0042d80 // mova za.d[x9, #0], { z12.d-z15.d }\n"
+ ".inst 0xa040c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
"b 6f\n"
"5:" // Width 1: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -108,82 +108,82 @@ void sme2_gemv_u8qa_dot_16VL (
"ble 9f\n"
"7:" // Width 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa0408375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b030 // udot za.s[x9, 0], { z0.b-z3.b }, z9.b[0]\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b730 // udot za.s[x9, 0], { z24.b-z27.b }, z9.b[1]\n"
- ".inst 0xc159bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z9.b[2]\n"
- ".inst 0xc159bfb0 // udot za.s[x9, 0], { z28.b-z31.b }, z9.b[3]\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b2b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 8f\n"
- "udot z11.s, z9.b, z8.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"8:" // Width 1: Multiply loop: unique 1: skip row sum
"sub x22, x22, #0x10\n"
"cmp x22, #0x10\n"
"bgt 7b\n"
"9:" // Width 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x22, x22, #0x4\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b330 // udot za.s[x9, 0], { z24.b-z27.b }, z9.b[0]\n"
+ ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
"ble 10f\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b430 // udot za.s[x9, 0], { z0.b-z3.b }, z9.b[1]\n"
+ ".inst 0xc151b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
"ble 10f\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bbb0 // udot za.s[x9, 0], { z28.b-z31.b }, z9.b[2]\n"
+ ".inst 0xc151b930 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
"ble 10f\n"
- ".inst 0xa0408371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159be30 // udot za.s[x9, 0], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bd30 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"10:" // Width 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
- "udot z11.s, z9.b, z8.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"11:" // Width 1: Multiply loop: unique 2: skip row sum
"tbnz %x[flags], #31, 12f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
+ "ld1rw { z26.s }, p2/Z, [x21]\n"
+ "neg z26.s, p2/M, z26.s\n"
"whilelt p0.s, XZR, x20\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z26.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "uaddv d11, p0, z11.s\n"
- "mov z11.s, z11.s[0]\n"
- "neg z17.s, p2/M, z17.s\n"
- "mul z11.s, p2/M, z11.s, z17.s\n"
"12:" // Width 1: skip row sum fixup
- ".inst 0xc0904960 // addha za0.s, p2/M, p2/M, z11.s\n"
- "add x21, %x[qp], %[per_layer_mul]\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904961 // addha za1.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "add x22, %x[qp], %[c_offset]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904962 // addha za2.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904963 // addha za3.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z7.s }, p2/Z, [x22]\n"
- "ld1rw { z30.s }, p2/Z, [x21]\n"
- "ld1rw { z23.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
- ".inst 0xc1aaac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z10.s\n"
- ".inst 0xc1a6aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1a7ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
- ".inst 0xc1b7cfcc // sclamp { z12.s-z15.s }, z30.s, z23.s\n"
+ ".inst 0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ "ld1rw { z30.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a2ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1bece0c // sclamp { z12.s-z15.s }, z16.s, z30.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
"uzp1 z19.h, z14.h, z15.h\n"
"uzp1 z12.b, z12.b, z19.b\n"
@@ -199,10 +199,10 @@ void sme2_gemv_u8qa_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 15f\n"
- ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
- ".inst 0xa041c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
- ".inst 0xc0042c01 // mova za.d[x9, #1], { z0.d-z3.d }\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042f01 // mova za.d[x9, #1], { z24.d-z27.d }\n"
"b 16f\n"
"15:" // Width 2: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -211,111 +211,111 @@ void sme2_gemv_u8qa_dot_16VL (
"ble 19f\n"
"17:" // Width 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27]\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa0408371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b2b0 // udot za.s[x9, 0], { z20.b-z23.b }, z9.b[0]\n"
- ".inst 0xa0418375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b331 // udot za.s[x9, 1], { z24.b-z27.b }, z9.b[0]\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b630 // udot za.s[x9, 0], { z16.b-z19.b }, z9.b[1]\n"
- ".inst 0xa0418371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b6b1 // udot za.s[x9, 1], { z20.b-z23.b }, z9.b[1]\n"
- ".inst 0xc159bbb0 // udot za.s[x9, 0], { z28.b-z31.b }, z9.b[2]\n"
- ".inst 0xc159b831 // udot za.s[x9, 1], { z0.b-z3.b }, z9.b[2]\n"
- ".inst 0xc159bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z9.b[3]\n"
- ".inst 0xc159be31 // udot za.s[x9, 1], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b331 // udot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b6b1 // udot za.s[x9, 1], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bcb0 // udot za.s[x9, 0], { z4.b-z7.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd31 // udot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 18f\n"
- "udot z11.s, z9.b, z8.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"18:" // Width 2: Multiply loop: unique 3: skip row sum
"sub x22, x22, #0x10\n"
"cmp x22, #0x10\n"
"bgt 17b\n"
"19:" // Width 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x22, x22, #0x4\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b030 // udot za.s[x9, 0], { z0.b-z3.b }, z9.b[0]\n"
- ".inst 0xc159b331 // udot za.s[x9, 1], { z24.b-z27.b }, z9.b[0]\n"
+ ".inst 0xc151b330 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
"ble 20f\n"
- ".inst 0xa0408371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b630 // udot za.s[x9, 0], { z16.b-z19.b }, z9.b[1]\n"
- ".inst 0xc159b731 // udot za.s[x9, 1], { z24.b-z27.b }, z9.b[1]\n"
+ ".inst 0xc151b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
"ble 20f\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bbb0 // udot za.s[x9, 0], { z28.b-z31.b }, z9.b[2]\n"
- ".inst 0xc159b831 // udot za.s[x9, 1], { z0.b-z3.b }, z9.b[2]\n"
+ ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
"ble 20f\n"
- ".inst 0xa0408375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z9.b[3]\n"
- ".inst 0xc159bf31 // udot za.s[x9, 1], { z24.b-z27.b }, z9.b[3]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf30 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd31 // udot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"20:" // Width 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 21f\n"
- "udot z11.s, z9.b, z8.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"21:" // Width 2: Multiply loop: unique 4: skip row sum
"tbnz %x[flags], #31, 22f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z1.s }, p2/Z, [x21]\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "uaddv d11, p0, z11.s\n"
- "mov z11.s, z11.s[0]\n"
- "neg z1.s, p2/M, z1.s\n"
- "mul z11.s, p2/M, z11.s, z1.s\n"
"22:" // Width 2: skip row sum fixup
- ".inst 0xc0904960 // addha za0.s, p2/M, p2/M, z11.s\n"
- "add x21, %x[qp], %[per_layer_mul]\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z6.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904961 // addha za1.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z2.s }, p2/Z, [x21]\n"
- "add x22, %x[qp], %[c_offset]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904962 // addha za2.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z3.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z9.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904963 // addha za3.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z9.s }, p2/Z, [x22]\n"
- "ld1rw { z6.s }, p2/Z, [x21]\n"
- "ld1rw { z29.s }, p2/Z, [x20]\n"
- ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
- ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
- ".inst 0xc1a3aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n"
- ".inst 0xc1a3aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
- ".inst 0xc1a9ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z9.s\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc1bdccd4 // sclamp { z20.s-z23.s }, z6.s, z29.s\n"
- ".inst 0xc1bdcccc // sclamp { z12.s-z15.s }, z6.s, z29.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z16.h, z22.h, z23.h\n"
- "uzp1 z12.h, z12.h, z13.h\n"
- "uzp1 z24.h, z14.h, z15.h\n"
- "uzp1 z20.b, z20.b, z16.b\n"
- "uzp1 z12.b, z12.b, z24.b\n"
- "st1b { z20.b }, p2, [x25]\n"
- "st1b { z12.b }, p1, [x25, #1, MUL VL]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a5aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1a9ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ ".inst 0xc1a9ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ ".inst 0xc1b5ce18 // sclamp { z24.s-z27.s }, z16.s, z21.s\n"
+ ".inst 0xc1b5ce00 // sclamp { z0.s-z3.s }, z16.s, z21.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z9.h, z26.h, z27.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z26.h, z2.h, z3.h\n"
+ "uzp1 z24.b, z24.b, z9.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z0.b, z0.b, z26.b\n"
+ "st1b { z0.b }, p1, [x25, #1, MUL VL]\n"
"addvl x25, x25, #2\n"
"23:" // Width 2: Output done
"b 44f\n"
@@ -328,12 +328,12 @@ void sme2_gemv_u8qa_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 25f\n"
- ".inst 0xa040c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n"
- ".inst 0xa041c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xa042c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
- ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
- ".inst 0xc0042d82 // mova za.d[x9, #2], { z12.d-z15.d }\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
+ ".inst 0xa042c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042f02 // mova za.d[x9, #2], { z24.d-z27.d }\n"
"b 26f\n"
"25:" // Width 3: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -342,136 +342,136 @@ void sme2_gemv_u8qa_dot_16VL (
"ble 29f\n"
"27:" // Width 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b3b0 // udot za.s[x9, 0], { z28.b-z31.b }, z9.b[0]\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b231 // udot za.s[x9, 1], { z16.b-z19.b }, z9.b[0]\n"
- ".inst 0xa041837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z9.b[0]\n"
- ".inst 0xa0428371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xa040836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b730 // udot za.s[x9, 0], { z24.b-z27.b }, z9.b[1]\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b7b1 // udot za.s[x9, 1], { z28.b-z31.b }, z9.b[1]\n"
- ".inst 0xa042837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b632 // udot za.s[x9, 2], { z16.b-z19.b }, z9.b[1]\n"
- ".inst 0xa0408375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z9.b[2]\n"
- ".inst 0xa0428371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b831 // udot za.s[x9, 1], { z0.b-z3.b }, z9.b[2]\n"
- ".inst 0xc159bbb2 // udot za.s[x9, 2], { z28.b-z31.b }, z9.b[2]\n"
- ".inst 0xc159beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z9.b[3]\n"
- ".inst 0xc159bf31 // udot za.s[x9, 1], { z24.b-z27.b }, z9.b[3]\n"
- ".inst 0xc159be32 // udot za.s[x9, 2], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b230 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b6b2 // udot za.s[x9, 2], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b930 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
+ ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf30 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bcb1 // udot za.s[x9, 1], { z4.b-z7.b }, z1.b[3]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151be32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 28f\n"
- "udot z11.s, z9.b, z8.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"28:" // Width 3: Multiply loop: unique 5: skip row sum
"sub x22, x22, #0x10\n"
"cmp x22, #0x10\n"
"bgt 27b\n"
"29:" // Width 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa040836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x22, x22, #0x4\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z9.b[0]\n"
- ".inst 0xc159b031 // udot za.s[x9, 1], { z0.b-z3.b }, z9.b[0]\n"
- ".inst 0xc159b3b2 // udot za.s[x9, 2], { z28.b-z31.b }, z9.b[0]\n"
+ ".inst 0xc151b2b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b232 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa0408365 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0428371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b4b0 // udot za.s[x9, 0], { z4.b-z7.b }, z9.b[1]\n"
- ".inst 0xc159b731 // udot za.s[x9, 1], { z24.b-z27.b }, z9.b[1]\n"
- ".inst 0xc159b632 // udot za.s[x9, 2], { z16.b-z19.b }, z9.b[1]\n"
+ ".inst 0xc151b730 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b632 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa040837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0428379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bbb0 // udot za.s[x9, 0], { z28.b-z31.b }, z9.b[2]\n"
- ".inst 0xc159b831 // udot za.s[x9, 1], { z0.b-z3.b }, z9.b[2]\n"
- ".inst 0xc159bb32 // udot za.s[x9, 2], { z24.b-z27.b }, z9.b[2]\n"
+ ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bab2 // udot za.s[x9, 2], { z20.b-z23.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa0418375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0428371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bc30 // udot za.s[x9, 0], { z0.b-z3.b }, z9.b[3]\n"
- ".inst 0xc159beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z9.b[3]\n"
- ".inst 0xc159be32 // udot za.s[x9, 2], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bdb2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"30:" // Width 3: Multiply loop: multiply skip
"tbnz %x[flags], #31, 31f\n"
- "udot z11.s, z9.b, z8.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"31:" // Width 3: Multiply loop: unique 6: skip row sum
"tbnz %x[flags], #31, 32f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
"ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "uaddv d11, p0, z11.s\n"
- "mov z11.s, z11.s[0]\n"
- "neg z16.s, p2/M, z16.s\n"
- "mul z11.s, p2/M, z11.s, z16.s\n"
"32:" // Width 3: skip row sum fixup
- ".inst 0xc0904960 // addha za0.s, p2/M, p2/M, z11.s\n"
- "add x21, %x[qp], %[per_layer_mul]\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904961 // addha za1.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z2.s }, p2/Z, [x21]\n"
- "add x22, %x[qp], %[c_offset]\n"
- "add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904962 // addha za2.s, p2/M, p2/M, z11.s\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
"ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "add x21, %x[qp], %[minval]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904963 // addha za3.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z0.s }, p2/Z, [x22]\n"
- "ld1rw { z21.s }, p2/Z, [x21]\n"
- "ld1rw { z20.s }, p2/Z, [x20]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
- ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
- ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
- ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
- ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
- ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
- ".inst 0xc1b4ceac // sclamp { z12.s-z15.s }, z21.s, z20.s\n"
- ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z21.h, z30.h, z31.h\n"
+ ".inst 0xc1a3ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1a3ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1a3ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc1a0ce08 // sclamp { z8.s-z11.s }, z16.s, z0.s\n"
+ ".inst 0xc1a0ce04 // sclamp { z4.s-z7.s }, z16.s, z0.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0xc1a0ce0c // sclamp { z12.s-z15.s }, z16.s, z0.s\n"
+ "uzp1 z18.h, z10.h, z11.h\n"
+ "uzp1 z4.h, z4.h, z5.h\n"
+ "uzp1 z17.h, z6.h, z7.h\n"
"uzp1 z12.h, z12.h, z13.h\n"
- "uzp1 z20.h, z14.h, z15.h\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "uzp1 z28.b, z28.b, z21.b\n"
- "uzp1 z12.b, z12.b, z20.b\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z28.b }, p2, [x25]\n"
- "st1b { z12.b }, p2, [x25, #1, MUL VL]\n"
- "st1b { z16.b }, p1, [x25, #2, MUL VL]\n"
+ "uzp1 z16.h, z14.h, z15.h\n"
+ "uzp1 z8.b, z8.b, z18.b\n"
+ "st1b { z8.b }, p2, [x25]\n"
+ "uzp1 z4.b, z4.b, z17.b\n"
+ "st1b { z4.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z12.b, z12.b, z16.b\n"
+ "st1b { z12.b }, p1, [x25, #2, MUL VL]\n"
"addvl x25, x25, #3\n"
"33:" // Width 3: Output done
"b 44f\n"
@@ -484,15 +484,15 @@ void sme2_gemv_u8qa_dot_16VL (
".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 35f\n"
- ".inst 0xa040c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n"
- ".inst 0xa041c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xa040c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
+ ".inst 0xa041c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
".inst 0xa042c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xa043c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- "addvl x24, x24, #16\n"
- ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
- ".inst 0xc0042e83 // mova za.d[x9, #3], { z20.d-z23.d }\n"
+ ".inst 0xa043c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x24, x24, #16\n"
"b 36f\n"
"35:" // Width 4: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
@@ -501,164 +501,164 @@ void sme2_gemv_u8qa_dot_16VL (
"ble 39f\n"
"37:" // Width 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa0438379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159b030 // udot za.s[x9, 0], { z0.b-z3.b }, z9.b[0]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b2b1 // udot za.s[x9, 1], { z20.b-z23.b }, z9.b[0]\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b3b2 // udot za.s[x9, 2], { z28.b-z31.b }, z9.b[0]\n"
- ".inst 0xa041837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b333 // udot za.s[x9, 3], { z24.b-z27.b }, z9.b[0]\n"
- ".inst 0xa0428371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159b430 // udot za.s[x9, 0], { z0.b-z3.b }, z9.b[1]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b7b1 // udot za.s[x9, 1], { z28.b-z31.b }, z9.b[1]\n"
- ".inst 0xa0408375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b632 // udot za.s[x9, 2], { z16.b-z19.b }, z9.b[1]\n"
- ".inst 0xa041837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b5b3 // udot za.s[x9, 3], { z12.b-z15.b }, z9.b[1]\n"
- ".inst 0xa0428361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z9.b[2]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bbb1 // udot za.s[x9, 1], { z28.b-z31.b }, z9.b[2]\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
- ".inst 0xc159b832 // udot za.s[x9, 2], { z0.b-z3.b }, z9.b[2]\n"
- ".inst 0xa0418375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xc159b9b3 // udot za.s[x9, 3], { z12.b-z15.b }, z9.b[2]\n"
- ".inst 0xa0428361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa0438371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159bf30 // udot za.s[x9, 0], { z24.b-z27.b }, z9.b[3]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z9.b[3]\n"
- ".inst 0xc159bc32 // udot za.s[x9, 2], { z0.b-z3.b }, z9.b[3]\n"
- ".inst 0xc159be33 // udot za.s[x9, 3], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b230 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b632 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b633 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151ba33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bdb1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0428359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bf32 // udot za.s[x9, 2], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0438345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151bcb3 // udot za.s[x9, 3], { z4.b-z7.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 38f\n"
- "udot z11.s, z9.b, z8.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"38:" // Width 4: Multiply loop: unique 7: skip row sum
"sub x22, x22, #0x10\n"
"cmp x22, #0x10\n"
"bgt 37b\n"
"39:" // Width 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x22\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x22, x22, #0x4\n"
- "ld1rqb { z9.b }, p0/Z, [x23]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0418375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa0438371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159b030 // udot za.s[x9, 0], { z0.b-z3.b }, z9.b[0]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b2b1 // udot za.s[x9, 1], { z20.b-z23.b }, z9.b[0]\n"
- ".inst 0xc159b3b2 // udot za.s[x9, 2], { z28.b-z31.b }, z9.b[0]\n"
- ".inst 0xc159b233 // udot za.s[x9, 3], { z16.b-z19.b }, z9.b[0]\n"
+ ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b331 // udot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0428349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b132 // udot za.s[x9, 2], { z8.b-z11.b }, z1.b[0]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b233 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa042837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa0438371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159b430 // udot za.s[x9, 0], { z0.b-z3.b }, z9.b[1]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b731 // udot za.s[x9, 1], { z24.b-z27.b }, z9.b[1]\n"
- ".inst 0xc159b7b2 // udot za.s[x9, 2], { z28.b-z31.b }, z9.b[1]\n"
- ".inst 0xc159b633 // udot za.s[x9, 3], { z16.b-z19.b }, z9.b[1]\n"
+ ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b631 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b5b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0438355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b6b3 // udot za.s[x9, 3], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
"subs x22, x22, #0x4\n"
- ".inst 0xa0418361 // ldnt1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0428375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa043836d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159bb30 // udot za.s[x9, 0], { z24.b-z27.b }, z9.b[2]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159b831 // udot za.s[x9, 1], { z0.b-z3.b }, z9.b[2]\n"
- ".inst 0xc159bab2 // udot za.s[x9, 2], { z20.b-z23.b }, z9.b[2]\n"
- ".inst 0xc159b9b3 // udot za.s[x9, 3], { z12.b-z15.b }, z9.b[2]\n"
+ ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151ba33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408379 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa041837d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0428375 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- ".inst 0xa0438371 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
- ".inst 0xc159bf30 // udot za.s[x9, 0], { z24.b-z27.b }, z9.b[3]\n"
- "addvl x27, x27, #16\n"
- ".inst 0xc159bfb1 // udot za.s[x9, 1], { z28.b-z31.b }, z9.b[3]\n"
- ".inst 0xc159beb2 // udot za.s[x9, 2], { z20.b-z23.b }, z9.b[3]\n"
- ".inst 0xc159be33 // udot za.s[x9, 3], { z16.b-z19.b }, z9.b[3]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151be30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151be32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151be33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
"40:" // Width 4: Multiply loop: multiply skip
"tbnz %x[flags], #31, 41f\n"
- "udot z11.s, z9.b, z8.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"41:" // Width 4: Multiply loop: unique 8: skip row sum
"tbnz %x[flags], #31, 42f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
"ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "uaddv d11, p0, z11.s\n"
- "mov z11.s, z11.s[0]\n"
- "neg z16.s, p2/M, z16.s\n"
- "mul z11.s, p2/M, z11.s, z16.s\n"
"42:" // Width 4: skip row sum fixup
- ".inst 0xc0904960 // addha za0.s, p2/M, p2/M, z11.s\n"
- "add x21, %x[qp], %[per_layer_mul]\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z11.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904961 // addha za1.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "add x22, %x[qp], %[c_offset]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904962 // addha za2.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z12.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z6.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904963 // addha za3.s, p2/M, p2/M, z11.s\n"
- "ld1rw { z13.s }, p2/Z, [x22]\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
- "ld1rw { z18.s }, p2/Z, [x20]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
- ".inst 0xc0062c58 // mova { z24.d-z27.d }, za.d[x9, #2]\n"
- ".inst 0xc0062c74 // mova { z20.d-z23.d }, za.d[x9, #3]\n"
- ".inst 0xc1aaac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
- ".inst 0xc1aaac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z10.s\n"
- ".inst 0xc1aaac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
- ".inst 0xc1aaac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
- ".inst 0xc1acaa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z12.s\n"
- ".inst 0xc1acaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z12.s\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc1acaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
- ".inst 0xc1adab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z13.s\n"
- ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
- ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
- ".inst 0xc1adab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z13.s\n"
- ".inst 0xc1b2ce3c // sclamp { z28.s-z31.s }, z17.s, z18.s\n"
- ".inst 0xc1b2ce20 // sclamp { z0.s-z3.s }, z17.s, z18.s\n"
- ".inst 0xc1b2ce38 // sclamp { z24.s-z27.s }, z17.s, z18.s\n"
- ".inst 0xc1b2ce34 // sclamp { z20.s-z23.s }, z17.s, z18.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z14.h, z30.h, z31.h\n"
- "uzp1 z0.h, z0.h, z1.h\n"
- "uzp1 z18.h, z2.h, z3.h\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1abac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xc1abac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+ ".inst 0xc1abac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc0062c6c // mova { z12.d-z15.d }, za.d[x9, #3]\n"
+ ".inst 0xc1abac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1rw { z31.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1bfcc78 // sclamp { z24.s-z27.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc70 // sclamp { z16.s-z19.s }, z3.s, z31.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z17.h, z26.h, z27.h\n"
+ ".inst 0xc1bfcc74 // sclamp { z20.s-z23.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc6c // sclamp { z12.s-z15.s }, z3.s, z31.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "uzp1 z18.h, z18.h, z19.h\n"
"uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z16.h, z22.h, z23.h\n"
- "uzp1 z28.b, z28.b, z14.b\n"
- "uzp1 z0.b, z0.b, z18.b\n"
- "uzp1 z24.b, z24.b, z17.b\n"
- "uzp1 z20.b, z20.b, z16.b\n"
- "st1b { z28.b }, p2, [x25]\n"
- "st1b { z0.b }, p2, [x25, #1, MUL VL]\n"
- "st1b { z24.b }, p2, [x25, #2, MUL VL]\n"
- "st1b { z20.b }, p1, [x25, #3, MUL VL]\n"
+ "uzp1 z17.h, z22.h, z23.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z30.h, z14.h, z15.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z16.b, z16.b, z18.b\n"
+ "st1b { z16.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z20.b, z20.b, z17.b\n"
+ "uzp1 z12.b, z12.b, z30.b\n"
+ "st1b { z20.b }, p2, [x25, #2, MUL VL]\n"
+ "st1b { z12.b }, p1, [x25, #3, MUL VL]\n"
"addvl x25, x25, #4\n"
"43:" // Width 4: Output done
- "subs x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
"sub %x[N], %x[N], x28, LSL #2\n"
"bgt 4b\n"
"44:" // Exit
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
index db4f25bbfa..fcce7a1424 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
class cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL
{
public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
+ typedef bfloat16 operand_type;
typedef float result_type;
typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL;
- StdTransformsSME<lhs_operand_type, result_type, 1, 4, 2> transforms = {};
+ StdTransformsSME<operand_type, result_type, 1, 4, 2> transforms = {};
cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
index 87ba6d4819..36114c5060 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
@@ -48,6 +48,7 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
C(C), ldcb(ldc * sizeof(float)),
M(M), N(N), K(K),
+ n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
min(-std::numeric_limits<float>::infinity()),
max(std::numeric_limits<float>::infinity()),
bias(bias),
@@ -87,13 +88,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
const long kstride_bytes;
float *const C;
const long ldcb;
- const long M, N, K;
+ const long M, N, K, n_loops, n_tail_iters;
float min = -std::numeric_limits<float>::infinity();
float max = std::numeric_limits<float>::infinity();
const float *const bias;
-
float *const accumulator_buffer;
uint64_t flags;
};
@@ -112,17 +112,17 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xa040c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
- "addvl x14, x14, #16\n"
".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x14, x14, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w11, [%x[args], %[offsetof_M]]\n"
@@ -137,103 +137,103 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z15.s, #1.0\n"
- ".inst 0xa109c280 // ld1w { z0.s, z4.s, z8.s, z12.s }, p8/Z, [x20, x9, LSL #2]\n"
- ".inst 0x808001e0 // fmopa za0.s, p0/M, p0/M, z15.s, z0.s\n"
- ".inst 0x808401e1 // fmopa za1.s, p0/M, p0/M, z15.s, z4.s\n"
- ".inst 0x808801e2 // fmopa za2.s, p0/M, p0/M, z15.s, z8.s\n"
- ".inst 0x808c01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z12.s\n"
+ "fmov z6.s, #1.0\n"
+ ".inst 0xa009c29d // ldnt1w { z28.s-z31.s }, p8/Z, [x20, x9, LSL #2]\n"
+ ".inst 0x809c00c0 // fmopa za0.s, p0/M, p0/M, z6.s, z28.s\n"
+ ".inst 0x809d00c1 // fmopa za1.s, p0/M, p0/M, z6.s, z29.s\n"
+ ".inst 0x809e00c2 // fmopa za2.s, p0/M, p0/M, z6.s, z30.s\n"
+ ".inst 0x809f00c3 // fmopa za3.s, p0/M, p0/M, z6.s, z31.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x9\n"
"mov x21, x10\n"
"incw x20, ALL, MUL #4\n"
"incw x21\n"
"cmp x20, x28\n"
- "mov x20, x15\n"
"csel x21, x10, x21, LT\n"
+ "mov x20, x15\n"
"bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
"cmp x21, x11\n"
"csel x15, x20, x15, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x1\n"
"lsr x20, x20, #0x1\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x9, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- "ld1h { z20.h }, p0/Z, [x26]\n"
- ".inst 0xa140a6f3 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n"
- "ld1h { z4.h }, p0/Z, [x26, #1, MUL VL]\n"
- ".inst 0xa041a6ec // ld1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- "ld1h { z29.h }, p0/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa142a6f2 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- "ld1h { z2.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x9, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ "ld1h { z28.h }, p0/Z, [x26]\n"
+ ".inst 0xa040a6e9 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x23]\n"
+ "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0xa041a6ed // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1h { z30.h }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042a6e5 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1h { z20.h }, p0/Z, [x26, #3, MUL VL]\n"
"addvl x26, x26, #4\n"
- ".inst 0xa043a6e8 // ld1h { z8.h-z11.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa143a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"ble 7f\n"
"6:" // K loop
+ ".inst 0x81880380 // bfmopa za0.s, p0/M, p0/M, z28.h, z8.h\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x81890381 // bfmopa za1.s, p0/M, p0/M, z28.h, z9.h\n"
+ ".inst 0x818a0382 // bfmopa za2.s, p0/M, p0/M, z28.h, z10.h\n"
+ ".inst 0x818b0383 // bfmopa za3.s, p0/M, p0/M, z28.h, z11.h\n"
+ "ld1h { z28.h }, p0/Z, [x26]\n"
+ ".inst 0x818c02c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z12.h\n"
+ ".inst 0xa040a6e9 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x23]\n"
+ ".inst 0x818d02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z13.h\n"
+ ".inst 0x818e02c2 // bfmopa za2.s, p0/M, p0/M, z22.h, z14.h\n"
+ ".inst 0x818f02c3 // bfmopa za3.s, p0/M, p0/M, z22.h, z15.h\n"
+ "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x818403c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z4.h\n"
+ ".inst 0xa041a6ed // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x818503c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z5.h\n"
+ ".inst 0x818603c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z6.h\n"
+ ".inst 0x818703c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z7.h\n"
+ "ld1h { z30.h }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042a6e5 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
".inst 0x81930280 // bfmopa za0.s, p0/M, p0/M, z20.h, z19.h\n"
- "subs x21, x21, #0x1\n"
".inst 0x81970281 // bfmopa za1.s, p0/M, p0/M, z20.h, z23.h\n"
".inst 0x819b0282 // bfmopa za2.s, p0/M, p0/M, z20.h, z27.h\n"
".inst 0x819f0283 // bfmopa za3.s, p0/M, p0/M, z20.h, z31.h\n"
- "ld1h { z20.h }, p0/Z, [x26]\n"
- ".inst 0x818c0080 // bfmopa za0.s, p0/M, p0/M, z4.h, z12.h\n"
- ".inst 0xa140a6f3 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n"
- ".inst 0x818d0081 // bfmopa za1.s, p0/M, p0/M, z4.h, z13.h\n"
- ".inst 0x818e0082 // bfmopa za2.s, p0/M, p0/M, z4.h, z14.h\n"
- ".inst 0x818f0083 // bfmopa za3.s, p0/M, p0/M, z4.h, z15.h\n"
- "ld1h { z4.h }, p0/Z, [x26, #1, MUL VL]\n"
- ".inst 0x819203a0 // bfmopa za0.s, p0/M, p0/M, z29.h, z18.h\n"
- ".inst 0xa041a6ec // ld1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0x819603a1 // bfmopa za1.s, p0/M, p0/M, z29.h, z22.h\n"
- ".inst 0x819a03a2 // bfmopa za2.s, p0/M, p0/M, z29.h, z26.h\n"
- ".inst 0x819e03a3 // bfmopa za3.s, p0/M, p0/M, z29.h, z30.h\n"
- "ld1h { z29.h }, p0/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa142a6f2 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- ".inst 0x81880040 // bfmopa za0.s, p0/M, p0/M, z2.h, z8.h\n"
- ".inst 0x81890041 // bfmopa za1.s, p0/M, p0/M, z2.h, z9.h\n"
- ".inst 0x818a0042 // bfmopa za2.s, p0/M, p0/M, z2.h, z10.h\n"
- ".inst 0x818b0043 // bfmopa za3.s, p0/M, p0/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "ld1h { z20.h }, p0/Z, [x26, #3, MUL VL]\n"
"addvl x26, x26, #4\n"
- ".inst 0xa043a6e8 // ld1h { z8.h-z11.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa143a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"bgt 6b\n"
"7:" // K loop tail
+ ".inst 0x81880380 // bfmopa za0.s, p0/M, p0/M, z28.h, z8.h\n"
+ ".inst 0x81890381 // bfmopa za1.s, p0/M, p0/M, z28.h, z9.h\n"
+ ".inst 0x818a0382 // bfmopa za2.s, p0/M, p0/M, z28.h, z10.h\n"
+ ".inst 0x818b0383 // bfmopa za3.s, p0/M, p0/M, z28.h, z11.h\n"
+ ".inst 0x818c02c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z12.h\n"
+ ".inst 0x818d02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z13.h\n"
+ ".inst 0x818e02c2 // bfmopa za2.s, p0/M, p0/M, z22.h, z14.h\n"
+ ".inst 0x818f02c3 // bfmopa za3.s, p0/M, p0/M, z22.h, z15.h\n"
+ ".inst 0x818403c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z4.h\n"
+ ".inst 0x818503c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z5.h\n"
+ ".inst 0x818603c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z6.h\n"
+ ".inst 0x818703c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z7.h\n"
".inst 0x81930280 // bfmopa za0.s, p0/M, p0/M, z20.h, z19.h\n"
".inst 0x81970281 // bfmopa za1.s, p0/M, p0/M, z20.h, z23.h\n"
".inst 0x819b0282 // bfmopa za2.s, p0/M, p0/M, z20.h, z27.h\n"
".inst 0x819f0283 // bfmopa za3.s, p0/M, p0/M, z20.h, z31.h\n"
- ".inst 0x818c0080 // bfmopa za0.s, p0/M, p0/M, z4.h, z12.h\n"
- ".inst 0x818d0081 // bfmopa za1.s, p0/M, p0/M, z4.h, z13.h\n"
- ".inst 0x818e0082 // bfmopa za2.s, p0/M, p0/M, z4.h, z14.h\n"
- ".inst 0x818f0083 // bfmopa za3.s, p0/M, p0/M, z4.h, z15.h\n"
- ".inst 0x819203a0 // bfmopa za0.s, p0/M, p0/M, z29.h, z18.h\n"
- ".inst 0x819603a1 // bfmopa za1.s, p0/M, p0/M, z29.h, z22.h\n"
- ".inst 0x819a03a2 // bfmopa za2.s, p0/M, p0/M, z29.h, z26.h\n"
- ".inst 0x819e03a3 // bfmopa za3.s, p0/M, p0/M, z29.h, z30.h\n"
- ".inst 0x81880040 // bfmopa za0.s, p0/M, p0/M, z2.h, z8.h\n"
- ".inst 0x81890041 // bfmopa za1.s, p0/M, p0/M, z2.h, z9.h\n"
- ".inst 0x818a0042 // bfmopa za2.s, p0/M, p0/M, z2.h, z10.h\n"
- ".inst 0x818b0043 // bfmopa za3.s, p0/M, p0/M, z2.h, z11.h\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
- "ld1h { z26.h }, p0/Z, [x26]\n"
- "subs x20, x20, #0x1\n"
+ "ld1h { z8.h }, p0/Z, [x26]\n"
+ "subs x21, x21, #0x1\n"
"addvl x26, x26, #1\n"
".inst 0xa140a6e3 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #4\n"
- ".inst 0x81830340 // bfmopa za0.s, p0/M, p0/M, z26.h, z3.h\n"
- ".inst 0x81870341 // bfmopa za1.s, p0/M, p0/M, z26.h, z7.h\n"
- ".inst 0x818b0342 // bfmopa za2.s, p0/M, p0/M, z26.h, z11.h\n"
- ".inst 0x818f0343 // bfmopa za3.s, p0/M, p0/M, z26.h, z15.h\n"
+ ".inst 0x81830100 // bfmopa za0.s, p0/M, p0/M, z8.h, z3.h\n"
+ ".inst 0x81870101 // bfmopa za1.s, p0/M, p0/M, z8.h, z7.h\n"
+ ".inst 0x818b0102 // bfmopa za2.s, p0/M, p0/M, z8.h, z11.h\n"
+ ".inst 0x818f0103 // bfmopa za3.s, p0/M, p0/M, z8.h, z15.h\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x15, #1, 14f\n"
@@ -241,25 +241,25 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xa040c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14]\n"
".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xa041c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
- ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
- ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xa061c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
"addvl x13, x13, #16\n"
"blt 11b\n"
"b 24f\n"
@@ -267,31 +267,31 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c5ac // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
"cmp x12, x20\n"
".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
- ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ ".inst 0xa063c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n"
"addvl x13, x13, #16\n"
"blt 13b\n"
"b 24f\n"
"14:" // Store to output array
"ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
"sub x24, x11, x10\n"
"ldr x23, [%x[args], %[offsetof_ldcb]]\n"
- "add x25, x25, x9, LSL #2\n" // C += n
"madd x25, x10, x23, x25\n" // C += m * ldc
"tbz x15, #2, 18f\n"
"cntw x20\n"
- "mov x12, #0x0\n"
"cmp x24, x20\n"
"csel x22, x24, x20, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Skip activation: Accumulator row 0 loop
@@ -301,30 +301,30 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
"add x25, x25, x23\n"
- "add x12, x12, #0x4\n"
".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
"add x25, x25, x23\n"
- "cmp x12, x21, LSL #2\n"
+ "add x12, x12, #0x4\n"
".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
"add x25, x25, x23\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xa160c323 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"blt 15b\n"
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 17f\n"
- ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
"subs x24, x24, x22\n"
@@ -332,29 +332,29 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"b 22f\n"
"18:" // Store to output array: Skip activation: End
"cntw x20\n"
- "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
- "mov x12, #0x0\n"
"cmp x24, x20\n"
- "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x20, x24, x20, LT\n"
"lsr x21, x20, #0x2\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 0 loop
".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
- "add x12, x12, #0x4\n"
- "cmp x12, x21, LSL #2\n"
".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
"add x25, x25, x23\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
"add x25, x25, x23\n"
".inst 0xa160c333 // st1w { z19.s, z23.s, z27.s, z31.s }, p8, [x25]\n"
@@ -364,13 +364,13 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"cbz x20, 21f\n"
".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
- "subs x20, x20, #0x1\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ "subs x20, x20, #0x1\n"
".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 21f\n"
@@ -385,25 +385,25 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"23:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14]\n"
- ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xa042c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x14, x14, #16\n"
"blt 23b\n"
"24:" // End block
"incw x9, ALL, MUL #4\n"
"cmp x9, x28\n"
"blt 3b\n"
"incw x10\n"
- "mov x9, #0x0\n"
"cmp x10, x11\n"
+ "mov x9, #0x0\n"
"mov x27, x26\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
index 7f681b2734..8b0f5b013f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
class cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL
{
public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
+ typedef bfloat16 operand_type;
typedef float result_type;
typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL;
- StdTransformsSME<lhs_operand_type, result_type, 2, 2, 2> transforms = {};
+ StdTransformsSME<operand_type, result_type, 2, 2, 2> transforms = {};
cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
index 8b3c6d7fec..cc44c9a537 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
@@ -48,6 +48,7 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
C(C), ldcb(ldc * sizeof(float)),
M(M), N(N), K(K),
+ n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
min(-std::numeric_limits<float>::infinity()),
max(std::numeric_limits<float>::infinity()),
bias(bias),
@@ -87,13 +88,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
const long kstride_bytes;
float *const C;
const long ldcb;
- const long M, N, K;
+ const long M, N, K, n_loops, n_tail_iters;
float min = -std::numeric_limits<float>::infinity();
float max = std::numeric_limits<float>::infinity();
const float *const bias;
-
float *const accumulator_buffer;
uint64_t flags;
};
@@ -112,17 +112,17 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa040c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w13, [%x[args], %[offsetof_M]]\n"
@@ -137,103 +137,103 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z17.s, #1.0\n"
- ".inst 0xa00a428a // ld1w { z10.s-z11.s }, p8/Z, [x20, x10, LSL #2]\n"
- ".inst 0x808a0220 // fmopa za0.s, p0/M, p0/M, z17.s, z10.s\n"
- ".inst 0x808b0221 // fmopa za1.s, p0/M, p0/M, z17.s, z11.s\n"
- ".inst 0x808a0222 // fmopa za2.s, p0/M, p0/M, z17.s, z10.s\n"
- ".inst 0x808b0223 // fmopa za3.s, p0/M, p0/M, z17.s, z11.s\n"
+ "fmov z12.s, #1.0\n"
+ ".inst 0xa10a4289 // ldnt1w { z1.s, z9.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x80810180 // fmopa za0.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890181 // fmopa za1.s, p0/M, p0/M, z12.s, z9.s\n"
+ ".inst 0x80810182 // fmopa za2.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890183 // fmopa za3.s, p0/M, p0/M, z12.s, z9.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
"incw x20, ALL, MUL #2\n"
"incw x21, ALL, MUL #2\n"
"cmp x20, x9\n"
- "mov x20, x16\n"
"csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
"bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
"cmp x21, x13\n"
"csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x1\n"
"lsr x20, x20, #0x1\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa0402776 // ld1h { z22.h-z23.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa14026e7 // ld1h { z7.h, z15.h }, pn9.b/Z, [x23]\n"
- ".inst 0xa1412766 // ld1h { z6.h, z14.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa04126f4 // ld1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa1422762 // ld1h { z2.h, z10.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14226e3 // ld1h { z3.h, z11.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa1432761 // ld1h { z1.h, z9.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402772 // ld1h { z18.h-z19.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa04026e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0412764 // ld1h { z4.h-z5.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04126fb // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa042276a // ld1h { z10.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04226f5 // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0432766 // ld1h { z6.h-z7.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa04326e4 // ld1h { z4.h-z5.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa04326e9 // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x818702c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z7.h\n"
- "subs x21, x21, #0x1\n"
- ".inst 0x818f02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z15.h\n"
- ".inst 0x818702e2 // bfmopa za2.s, p0/M, p0/M, z23.h, z7.h\n"
- ".inst 0x818f02e3 // bfmopa za3.s, p0/M, p0/M, z23.h, z15.h\n"
- ".inst 0xa0402776 // ld1h { z22.h-z23.h }, pn9.b/Z, [x27]\n"
- ".inst 0x819400c0 // bfmopa za0.s, p0/M, p0/M, z6.h, z20.h\n"
- ".inst 0xa14026e7 // ld1h { z7.h, z15.h }, pn9.b/Z, [x23]\n"
- ".inst 0x819500c1 // bfmopa za1.s, p0/M, p0/M, z6.h, z21.h\n"
- ".inst 0x819401c2 // bfmopa za2.s, p0/M, p0/M, z14.h, z20.h\n"
- ".inst 0x819501c3 // bfmopa za3.s, p0/M, p0/M, z14.h, z21.h\n"
- ".inst 0xa1412766 // ld1h { z6.h, z14.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0x81830040 // bfmopa za0.s, p0/M, p0/M, z2.h, z3.h\n"
- ".inst 0xa04126f4 // ld1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0x818b0041 // bfmopa za1.s, p0/M, p0/M, z2.h, z11.h\n"
- ".inst 0x81830142 // bfmopa za2.s, p0/M, p0/M, z10.h, z3.h\n"
- ".inst 0x818b0143 // bfmopa za3.s, p0/M, p0/M, z10.h, z11.h\n"
- ".inst 0xa1422762 // ld1h { z2.h, z10.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14226e3 // ld1h { z3.h, z11.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0x81840020 // bfmopa za0.s, p0/M, p0/M, z1.h, z4.h\n"
- ".inst 0x81850021 // bfmopa za1.s, p0/M, p0/M, z1.h, z5.h\n"
- ".inst 0x81840122 // bfmopa za2.s, p0/M, p0/M, z9.h, z4.h\n"
- ".inst 0x81850123 // bfmopa za3.s, p0/M, p0/M, z9.h, z5.h\n"
- ".inst 0xa1432761 // ld1h { z1.h, z9.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0x81820240 // bfmopa za0.s, p0/M, p0/M, z18.h, z2.h\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x81830241 // bfmopa za1.s, p0/M, p0/M, z18.h, z3.h\n"
+ ".inst 0x81820262 // bfmopa za2.s, p0/M, p0/M, z19.h, z2.h\n"
+ ".inst 0x81830263 // bfmopa za3.s, p0/M, p0/M, z19.h, z3.h\n"
+ ".inst 0xa0402772 // ld1h { z18.h-z19.h }, pn9.b/Z, [x27]\n"
+ ".inst 0x819a0080 // bfmopa za0.s, p0/M, p0/M, z4.h, z26.h\n"
+ ".inst 0xa04026e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23]\n"
+ ".inst 0x819b0081 // bfmopa za1.s, p0/M, p0/M, z4.h, z27.h\n"
+ ".inst 0x819a00a2 // bfmopa za2.s, p0/M, p0/M, z5.h, z26.h\n"
+ ".inst 0x819b00a3 // bfmopa za3.s, p0/M, p0/M, z5.h, z27.h\n"
+ ".inst 0xa0412764 // ld1h { z4.h-z5.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0x81940140 // bfmopa za0.s, p0/M, p0/M, z10.h, z20.h\n"
+ ".inst 0xa04126fb // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0x81950141 // bfmopa za1.s, p0/M, p0/M, z10.h, z21.h\n"
+ ".inst 0x81940162 // bfmopa za2.s, p0/M, p0/M, z11.h, z20.h\n"
+ ".inst 0x81950163 // bfmopa za3.s, p0/M, p0/M, z11.h, z21.h\n"
+ ".inst 0xa042276a // ld1h { z10.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04226f5 // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x818800c0 // bfmopa za0.s, p0/M, p0/M, z6.h, z8.h\n"
+ ".inst 0x818900c1 // bfmopa za1.s, p0/M, p0/M, z6.h, z9.h\n"
+ ".inst 0x818800e2 // bfmopa za2.s, p0/M, p0/M, z7.h, z8.h\n"
+ ".inst 0x818900e3 // bfmopa za3.s, p0/M, p0/M, z7.h, z9.h\n"
+ ".inst 0xa0432766 // ld1h { z6.h-z7.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa04326e4 // ld1h { z4.h-z5.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa04326e9 // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x818702c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z7.h\n"
- ".inst 0x818f02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z15.h\n"
- ".inst 0x818702e2 // bfmopa za2.s, p0/M, p0/M, z23.h, z7.h\n"
- ".inst 0x818f02e3 // bfmopa za3.s, p0/M, p0/M, z23.h, z15.h\n"
- ".inst 0x819400c0 // bfmopa za0.s, p0/M, p0/M, z6.h, z20.h\n"
- ".inst 0x819500c1 // bfmopa za1.s, p0/M, p0/M, z6.h, z21.h\n"
- ".inst 0x819401c2 // bfmopa za2.s, p0/M, p0/M, z14.h, z20.h\n"
- ".inst 0x819501c3 // bfmopa za3.s, p0/M, p0/M, z14.h, z21.h\n"
- ".inst 0x81830040 // bfmopa za0.s, p0/M, p0/M, z2.h, z3.h\n"
- ".inst 0x818b0041 // bfmopa za1.s, p0/M, p0/M, z2.h, z11.h\n"
- ".inst 0x81830142 // bfmopa za2.s, p0/M, p0/M, z10.h, z3.h\n"
- ".inst 0x818b0143 // bfmopa za3.s, p0/M, p0/M, z10.h, z11.h\n"
- ".inst 0x81840020 // bfmopa za0.s, p0/M, p0/M, z1.h, z4.h\n"
- ".inst 0x81850021 // bfmopa za1.s, p0/M, p0/M, z1.h, z5.h\n"
- ".inst 0x81840122 // bfmopa za2.s, p0/M, p0/M, z9.h, z4.h\n"
- ".inst 0x81850123 // bfmopa za3.s, p0/M, p0/M, z9.h, z5.h\n"
+ ".inst 0x81820240 // bfmopa za0.s, p0/M, p0/M, z18.h, z2.h\n"
+ ".inst 0x81830241 // bfmopa za1.s, p0/M, p0/M, z18.h, z3.h\n"
+ ".inst 0x81820262 // bfmopa za2.s, p0/M, p0/M, z19.h, z2.h\n"
+ ".inst 0x81830263 // bfmopa za3.s, p0/M, p0/M, z19.h, z3.h\n"
+ ".inst 0x819a0080 // bfmopa za0.s, p0/M, p0/M, z4.h, z26.h\n"
+ ".inst 0x819b0081 // bfmopa za1.s, p0/M, p0/M, z4.h, z27.h\n"
+ ".inst 0x819a00a2 // bfmopa za2.s, p0/M, p0/M, z5.h, z26.h\n"
+ ".inst 0x819b00a3 // bfmopa za3.s, p0/M, p0/M, z5.h, z27.h\n"
+ ".inst 0x81940140 // bfmopa za0.s, p0/M, p0/M, z10.h, z20.h\n"
+ ".inst 0x81950141 // bfmopa za1.s, p0/M, p0/M, z10.h, z21.h\n"
+ ".inst 0x81940162 // bfmopa za2.s, p0/M, p0/M, z11.h, z20.h\n"
+ ".inst 0x81950163 // bfmopa za3.s, p0/M, p0/M, z11.h, z21.h\n"
+ ".inst 0x818800c0 // bfmopa za0.s, p0/M, p0/M, z6.h, z8.h\n"
+ ".inst 0x818900c1 // bfmopa za1.s, p0/M, p0/M, z6.h, z9.h\n"
+ ".inst 0x818800e2 // bfmopa za2.s, p0/M, p0/M, z7.h, z8.h\n"
+ ".inst 0x818900e3 // bfmopa za3.s, p0/M, p0/M, z7.h, z9.h\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa040276a // ld1h { z10.h-z11.h }, pn9.b/Z, [x27]\n"
- "subs x20, x20, #0x1\n"
+ ".inst 0xa040277e // ld1h { z30.h-z31.h }, pn9.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
"addvl x27, x27, #2\n"
- ".inst 0xa04026ee // ld1h { z14.h-z15.h }, pn9.b/Z, [x23]\n"
+ ".inst 0xa14026e5 // ld1h { z5.h, z13.h }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #2\n"
- ".inst 0x818e0140 // bfmopa za0.s, p0/M, p0/M, z10.h, z14.h\n"
- ".inst 0x818f0141 // bfmopa za1.s, p0/M, p0/M, z10.h, z15.h\n"
- ".inst 0x818e0162 // bfmopa za2.s, p0/M, p0/M, z11.h, z14.h\n"
- ".inst 0x818f0163 // bfmopa za3.s, p0/M, p0/M, z11.h, z15.h\n"
+ ".inst 0x818503c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z5.h\n"
+ ".inst 0x818d03c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z13.h\n"
+ ".inst 0x818503e2 // bfmopa za2.s, p0/M, p0/M, z31.h, z5.h\n"
+ ".inst 0x818d03e3 // bfmopa za3.s, p0/M, p0/M, z31.h, z13.h\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -241,24 +241,24 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
- ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
- ".inst 0xa041c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa060c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14]\n"
".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa061c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
@@ -267,31 +267,31 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
- ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 30f\n"
"14:" // Store to output array
"ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
"sub x25, x13, x11\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
- "add x26, x26, x10, LSL #2\n" // C += n
"madd x26, x11, x24, x26\n" // C += m * ldc
"tbz x16, #2, 21f\n"
"cntw x23\n"
- "mov x12, #0x0\n"
"cmp x25, x23\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Skip activation: Accumulator row 0 loop
@@ -299,36 +299,36 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- "add x12, x12, #0x4\n"
".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
+ "add x12, x12, #0x4\n"
".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
"add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"blt 15b\n"
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n"
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
- ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 21f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 19f\n"
"18:" // Store to output array: Skip activation: Accumulator row 1 loop
@@ -336,28 +336,28 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- "add x12, x12, #0x4\n"
".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
+ "add x12, x12, #0x4\n"
".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
"add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"blt 18b\n"
"19:" // Store to output array: Skip activation: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
- ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
@@ -365,37 +365,37 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"b 28f\n"
"21:" // Store to output array: Skip activation: End
"cntw x23\n"
- "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
- "mov x12, #0x0\n"
"cmp x25, x23\n"
- "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 23f\n"
"22:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
- "add x12, x12, #0x4\n"
- "cmp x12, x21, LSL #2\n"
- ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n"
+ ".inst 0xa1604357 // st1w { z23.s, z31.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"blt 22b\n"
"23:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 24f\n"
".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
@@ -409,34 +409,34 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"subs x25, x25, x22\n"
"beq 28f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x20, x25, x23, LT\n"
"lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 26f\n"
"25:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
- ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
- ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
- "add x12, x12, #0x4\n"
- "cmp x12, x21, LSL #2\n"
- ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604343 // st1w { z3.s, z11.s }, p8, [x26]\n"
+ ".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"blt 25b\n"
"26:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 27f\n"
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 27f\n"
@@ -451,25 +451,25 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"29:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 29b\n"
"30:" // End block
"incw x10, ALL, MUL #2\n"
"cmp x10, x9\n"
"blt 3b\n"
"incw x11, ALL, MUL #2\n"
- "mov x10, #0x0\n"
"cmp x11, x13\n"
+ "mov x10, #0x0\n"
"mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
index 3c1dff268f..f8812a1a71 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
class cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL
{
public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
+ typedef bfloat16 operand_type;
typedef float result_type;
typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL;
- StdTransformsSME<lhs_operand_type, result_type, 4, 1, 2> transforms = {};
+ StdTransformsSME<operand_type, result_type, 4, 1, 2> transforms = {};
cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
index b4b94f305e..a63cadc63a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
@@ -48,6 +48,7 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
C(C), ldcb(ldc * sizeof(float)),
M(M), N(N), K(K),
+ n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
min(-std::numeric_limits<float>::infinity()),
max(std::numeric_limits<float>::infinity()),
bias(bias),
@@ -87,13 +88,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
const long kstride_bytes;
float *const C;
const long ldcb;
- const long M, N, K;
+ const long M, N, K, n_loops, n_tail_iters;
float min = -std::numeric_limits<float>::infinity();
float max = std::numeric_limits<float>::infinity();
const float *const bias;
-
float *const accumulator_buffer;
uint64_t flags;
};
@@ -112,17 +112,17 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
- ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w13, [%x[args], %[offsetof_M]]\n"
@@ -137,103 +137,103 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z6.s, #1.0\n"
- "ld1w { z26.s }, p0/Z, [x20, x10, LSL #2]\n"
- ".inst 0x809a24c0 // fmopa za0.s, p1/M, p1/M, z6.s, z26.s\n"
- ".inst 0x809a24c1 // fmopa za1.s, p1/M, p1/M, z6.s, z26.s\n"
- ".inst 0x809a24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z26.s\n"
- ".inst 0x809a24c3 // fmopa za3.s, p1/M, p1/M, z6.s, z26.s\n"
+ "fmov z11.s, #1.0\n"
+ "ldnt1w { z13.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x808d2560 // fmopa za0.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2561 // fmopa za1.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2562 // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2563 // fmopa za3.s, p1/M, p1/M, z11.s, z13.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
"incw x20\n"
"incw x21, ALL, MUL #4\n"
"cmp x20, x9\n"
- "mov x20, x16\n"
"csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
"bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
"cmp x21, x13\n"
"csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x1\n"
"lsr x20, x20, #0x1\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa140a363 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27]\n"
- "ld1h { z13.h }, p1/Z, [x23]\n"
- ".inst 0xa141a372 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "ld1h { z21.h }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa142a373 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa143a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa140a360 // ld1h { z0.h, z4.h, z8.h, z12.h }, pn8.b/Z, [x27]\n"
+ "ldnt1h { z19.h }, p1/Z, [x23]\n"
+ ".inst 0xa141a371 // ld1h { z17.h, z21.h, z25.h, z29.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1h { z22.h }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa142a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1h { z23.h }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa143a363 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x818d2460 // bfmopa za0.s, p1/M, p1/M, z3.h, z13.h\n"
- "subs x21, x21, #0x1\n"
- ".inst 0x818d24e1 // bfmopa za1.s, p1/M, p1/M, z7.h, z13.h\n"
- ".inst 0x818d2562 // bfmopa za2.s, p1/M, p1/M, z11.h, z13.h\n"
- ".inst 0x818d25e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z13.h\n"
- ".inst 0xa140a363 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27]\n"
- ".inst 0x81952640 // bfmopa za0.s, p1/M, p1/M, z18.h, z21.h\n"
- "ld1h { z13.h }, p1/Z, [x23]\n"
- ".inst 0x819526c1 // bfmopa za1.s, p1/M, p1/M, z22.h, z21.h\n"
- ".inst 0x81952742 // bfmopa za2.s, p1/M, p1/M, z26.h, z21.h\n"
- ".inst 0x819527c3 // bfmopa za3.s, p1/M, p1/M, z30.h, z21.h\n"
- ".inst 0xa141a372 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0x81912660 // bfmopa za0.s, p1/M, p1/M, z19.h, z17.h\n"
- "ld1h { z21.h }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0x819126e1 // bfmopa za1.s, p1/M, p1/M, z23.h, z17.h\n"
- ".inst 0x81912762 // bfmopa za2.s, p1/M, p1/M, z27.h, z17.h\n"
- ".inst 0x819127e3 // bfmopa za3.s, p1/M, p1/M, z31.h, z17.h\n"
- ".inst 0xa142a373 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0x81822600 // bfmopa za0.s, p1/M, p1/M, z16.h, z2.h\n"
- ".inst 0x81822681 // bfmopa za1.s, p1/M, p1/M, z20.h, z2.h\n"
- ".inst 0x81822702 // bfmopa za2.s, p1/M, p1/M, z24.h, z2.h\n"
- ".inst 0x81822783 // bfmopa za3.s, p1/M, p1/M, z28.h, z2.h\n"
- ".inst 0xa143a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0x81932400 // bfmopa za0.s, p1/M, p1/M, z0.h, z19.h\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x81932481 // bfmopa za1.s, p1/M, p1/M, z4.h, z19.h\n"
+ ".inst 0x81932502 // bfmopa za2.s, p1/M, p1/M, z8.h, z19.h\n"
+ ".inst 0x81932583 // bfmopa za3.s, p1/M, p1/M, z12.h, z19.h\n"
+ ".inst 0xa140a360 // ld1h { z0.h, z4.h, z8.h, z12.h }, pn8.b/Z, [x27]\n"
+ ".inst 0x81962620 // bfmopa za0.s, p1/M, p1/M, z17.h, z22.h\n"
+ "ldnt1h { z19.h }, p1/Z, [x23]\n"
+ ".inst 0x819626a1 // bfmopa za1.s, p1/M, p1/M, z21.h, z22.h\n"
+ ".inst 0x81962722 // bfmopa za2.s, p1/M, p1/M, z25.h, z22.h\n"
+ ".inst 0x819627a3 // bfmopa za3.s, p1/M, p1/M, z29.h, z22.h\n"
+ ".inst 0xa141a371 // ld1h { z17.h, z21.h, z25.h, z29.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0x81972600 // bfmopa za0.s, p1/M, p1/M, z16.h, z23.h\n"
+ "ldnt1h { z22.h }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0x81972681 // bfmopa za1.s, p1/M, p1/M, z20.h, z23.h\n"
+ ".inst 0x81972702 // bfmopa za2.s, p1/M, p1/M, z24.h, z23.h\n"
+ ".inst 0x81972783 // bfmopa za3.s, p1/M, p1/M, z28.h, z23.h\n"
+ ".inst 0xa142a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1h { z23.h }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x81822460 // bfmopa za0.s, p1/M, p1/M, z3.h, z2.h\n"
+ ".inst 0x818224e1 // bfmopa za1.s, p1/M, p1/M, z7.h, z2.h\n"
+ ".inst 0x81822562 // bfmopa za2.s, p1/M, p1/M, z11.h, z2.h\n"
+ ".inst 0x818225e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z2.h\n"
+ ".inst 0xa143a363 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x818d2460 // bfmopa za0.s, p1/M, p1/M, z3.h, z13.h\n"
- ".inst 0x818d24e1 // bfmopa za1.s, p1/M, p1/M, z7.h, z13.h\n"
- ".inst 0x818d2562 // bfmopa za2.s, p1/M, p1/M, z11.h, z13.h\n"
- ".inst 0x818d25e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z13.h\n"
- ".inst 0x81952640 // bfmopa za0.s, p1/M, p1/M, z18.h, z21.h\n"
- ".inst 0x819526c1 // bfmopa za1.s, p1/M, p1/M, z22.h, z21.h\n"
- ".inst 0x81952742 // bfmopa za2.s, p1/M, p1/M, z26.h, z21.h\n"
- ".inst 0x819527c3 // bfmopa za3.s, p1/M, p1/M, z30.h, z21.h\n"
- ".inst 0x81912660 // bfmopa za0.s, p1/M, p1/M, z19.h, z17.h\n"
- ".inst 0x819126e1 // bfmopa za1.s, p1/M, p1/M, z23.h, z17.h\n"
- ".inst 0x81912762 // bfmopa za2.s, p1/M, p1/M, z27.h, z17.h\n"
- ".inst 0x819127e3 // bfmopa za3.s, p1/M, p1/M, z31.h, z17.h\n"
- ".inst 0x81822600 // bfmopa za0.s, p1/M, p1/M, z16.h, z2.h\n"
- ".inst 0x81822681 // bfmopa za1.s, p1/M, p1/M, z20.h, z2.h\n"
- ".inst 0x81822702 // bfmopa za2.s, p1/M, p1/M, z24.h, z2.h\n"
- ".inst 0x81822783 // bfmopa za3.s, p1/M, p1/M, z28.h, z2.h\n"
+ ".inst 0x81932400 // bfmopa za0.s, p1/M, p1/M, z0.h, z19.h\n"
+ ".inst 0x81932481 // bfmopa za1.s, p1/M, p1/M, z4.h, z19.h\n"
+ ".inst 0x81932502 // bfmopa za2.s, p1/M, p1/M, z8.h, z19.h\n"
+ ".inst 0x81932583 // bfmopa za3.s, p1/M, p1/M, z12.h, z19.h\n"
+ ".inst 0x81962620 // bfmopa za0.s, p1/M, p1/M, z17.h, z22.h\n"
+ ".inst 0x819626a1 // bfmopa za1.s, p1/M, p1/M, z21.h, z22.h\n"
+ ".inst 0x81962722 // bfmopa za2.s, p1/M, p1/M, z25.h, z22.h\n"
+ ".inst 0x819627a3 // bfmopa za3.s, p1/M, p1/M, z29.h, z22.h\n"
+ ".inst 0x81972600 // bfmopa za0.s, p1/M, p1/M, z16.h, z23.h\n"
+ ".inst 0x81972681 // bfmopa za1.s, p1/M, p1/M, z20.h, z23.h\n"
+ ".inst 0x81972702 // bfmopa za2.s, p1/M, p1/M, z24.h, z23.h\n"
+ ".inst 0x81972783 // bfmopa za3.s, p1/M, p1/M, z28.h, z23.h\n"
+ ".inst 0x81822460 // bfmopa za0.s, p1/M, p1/M, z3.h, z2.h\n"
+ ".inst 0x818224e1 // bfmopa za1.s, p1/M, p1/M, z7.h, z2.h\n"
+ ".inst 0x81822562 // bfmopa za2.s, p1/M, p1/M, z11.h, z2.h\n"
+ ".inst 0x818225e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z2.h\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa140a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27]\n"
- "subs x20, x20, #0x1\n"
+ ".inst 0xa140a373 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn8.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
"addvl x27, x27, #4\n"
- "ld1h { z2.h }, p1/Z, [x23]\n"
+ "ld1h { z11.h }, p1/Z, [x23]\n"
"addvl x23, x23, #1\n"
- ".inst 0x81822600 // bfmopa za0.s, p1/M, p1/M, z16.h, z2.h\n"
- ".inst 0x81822681 // bfmopa za1.s, p1/M, p1/M, z20.h, z2.h\n"
- ".inst 0x81822702 // bfmopa za2.s, p1/M, p1/M, z24.h, z2.h\n"
- ".inst 0x81822783 // bfmopa za3.s, p1/M, p1/M, z28.h, z2.h\n"
+ ".inst 0x818b2660 // bfmopa za0.s, p1/M, p1/M, z19.h, z11.h\n"
+ ".inst 0x818b26e1 // bfmopa za1.s, p1/M, p1/M, z23.h, z11.h\n"
+ ".inst 0x818b2762 // bfmopa za2.s, p1/M, p1/M, z27.h, z11.h\n"
+ ".inst 0x818b27e3 // bfmopa za3.s, p1/M, p1/M, z31.h, z11.h\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -241,25 +241,25 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
- ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa060c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14]\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa060c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 42f\n"
@@ -267,148 +267,148 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
- ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
- ".inst 0xa060c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14]\n"
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1cc // st1w { z12.s-z15.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 42f\n"
"14:" // Store to output array
"ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
"sub x25, x13, x11\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
- "add x26, x26, x10, LSL #2\n" // C += n
"madd x26, x11, x24, x26\n" // C += m * ldc
"tbz x16, #2, 27f\n"
"cntw x23\n"
- "mov x12, #0x0\n"
"cmp x25, x23\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Skip activation: Accumulator row 0 loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- "add x12, x12, #0x4\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z15.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 15b\n"
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- "st1w { z0.s }, p0, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- "st1w { z1.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
- "st1w { z2.s }, p0, [x26]\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 19f\n"
"18:" // Store to output array: Skip activation: Accumulator row 1 loop
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- "add x12, x12, #0x4\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ "st1w { z8.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z9.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z10.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z15.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z11.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 18b\n"
"19:" // Store to output array: Skip activation: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ "st1w { z24.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z25.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 22f\n"
"21:" // Store to output array: Skip activation: Accumulator row 2 loop
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- "add x12, x12, #0x4\n"
- "st1w { z8.s }, p0, [x26]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z9.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z10.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z11.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 21b\n"
"22:" // Store to output array: Skip activation: Accumulator row 2 oddments
"cbz x20, 23f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ "st1w { z12.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 23f\n"
"subs x20, x20, #0x1\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z13.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 23f\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z14.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 25f\n"
"24:" // Store to output array: Skip activation: Accumulator row 3 loop
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- "add x12, x12, #0x4\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
"st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
"st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
"st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 24b\n"
@@ -431,63 +431,63 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"b 40f\n"
"27:" // Store to output array: Skip activation: End
"cntw x23\n"
- "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
- "mov x12, #0x0\n"
"cmp x25, x23\n"
- "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 29f\n"
"28:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- "add x12, x12, #0x4\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z31.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 28b\n"
"29:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 30f\n"
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 30f\n"
"subs x20, x20, #0x1\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 30f\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"30:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 40f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 32f\n"
"31:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- "add x12, x12, #0x4\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 31b\n"
"32:" // Store to output array: Accumulator row 1 oddments
@@ -508,100 +508,100 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"subs x25, x25, x22\n"
"beq 40f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 35f\n"
"34:" // Store to output array: Accumulator row 2 loop
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
- "add x12, x12, #0x4\n"
".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "cmp x12, x21, LSL #2\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
"st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
"st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 34b\n"
"35:" // Store to output array: Accumulator row 2 oddments
"cbz x20, 36f\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
- "st1w { z28.s }, p0, [x26]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 36f\n"
"subs x20, x20, #0x1\n"
- "st1w { z29.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 36f\n"
- "st1w { z30.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"36:" // Store to output array: Accumulator row 2 oddments: End
"subs x25, x25, x22\n"
"beq 40f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x20, x25, x23, LT\n"
"lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 38f\n"
"37:" // Store to output array: Accumulator row 3 loop
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- "add x12, x12, #0x4\n"
".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "cmp x12, x21, LSL #2\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
"st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
"st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 37b\n"
"38:" // Store to output array: Accumulator row 3 oddments
"cbz x20, 39f\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 39f\n"
"subs x20, x20, #0x1\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 39f\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"39:" // Store to output array: Accumulator row 3 oddments: End
"40:" // Store to output array: End
"tbz x16, #0, 42f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"41:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
- ".inst 0xa041c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 41b\n"
"42:" // End block
"incw x10\n"
"cmp x10, x9\n"
"blt 3b\n"
"incw x11, ALL, MUL #4\n"
- "mov x10, #0x0\n"
"cmp x11, x13\n"
+ "mov x10, #0x0\n"
"mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp
index 82aaa4da49..c7fbede54e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SME2
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sme.hpp"
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, c
class cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL
{
public:
- typedef __fp16 lhs_operand_type;
- typedef __fp16 rhs_operand_type;
+ typedef __fp16 operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL;
- StdTransformsSME<lhs_operand_type, result_type, 1, 4, 2> transforms = {};
+ StdTransformsSME<operand_type, result_type, 1, 4, 2> transforms = {};
cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const CPUInfo *)
{
@@ -91,4 +90,4 @@ public:
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp
index 832fd0998a..871b154ad1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef ARM_COMPUTE_ENABLE_SME2
+#ifdef __ARM_FEATURE_SVE
#include "arm_gemm.hpp"
@@ -89,7 +89,6 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, c
const __fp16 *const bias;
-
float *const accumulator_buffer;
uint64_t flags;
};
@@ -109,14 +108,14 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, c
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
".inst 0xa040c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11]\n"
- ".inst 0xa041c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
- ".inst 0xa042c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
- ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa042c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
"addvl x11, x11, #16\n"
- ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"blt 1b\n"
@@ -133,17 +132,17 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, c
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
".inst 0x257a4770 // whilelt pn8.h, x27, x26, VLx2\n"
- "fmov z29.h, #0.0\n"
- "fmov z2.h, #1.0\n"
- ".inst 0xa01b229f // ldnt1h { z30.h-z31.h }, p8/Z, [x20, x27, LSL #1]\n"
- "zip1 z22.h, z30.h, z29.h\n"
- "zip2 z30.h, z30.h, z29.h\n"
- "zip1 z20.h, z31.h, z29.h\n"
- "zip2 z19.h, z31.h, z29.h\n"
- ".inst 0x81b60040 // fmopa za0.s, p0/M, p0/M, z2.h, z22.h\n"
- ".inst 0x81be0041 // fmopa za1.s, p0/M, p0/M, z2.h, z30.h\n"
- ".inst 0x81b40042 // fmopa za2.s, p0/M, p0/M, z2.h, z20.h\n"
- ".inst 0x81b30043 // fmopa za3.s, p0/M, p0/M, z2.h, z19.h\n"
+ "fmov z6.h, #0.0\n"
+ "fmov z19.h, #1.0\n"
+ ".inst 0xa01b2295 // ldnt1h { z20.h-z21.h }, p8/Z, [x20, x27, LSL #1]\n"
+ "zip1 z23.h, z20.h, z6.h\n"
+ "zip2 z12.h, z20.h, z6.h\n"
+ "zip1 z16.h, z21.h, z6.h\n"
+ "zip2 z8.h, z21.h, z6.h\n"
+ ".inst 0x81b70260 // fmopa za0.s, p0/M, p0/M, z19.h, z23.h\n"
+ ".inst 0x81ac0261 // fmopa za1.s, p0/M, p0/M, z19.h, z12.h\n"
+ ".inst 0x81b00262 // fmopa za2.s, p0/M, p0/M, z19.h, z16.h\n"
+ ".inst 0x81a80263 // fmopa za3.s, p0/M, p0/M, z19.h, z8.h\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x27\n"
"mov x21, x28\n"
@@ -162,79 +161,79 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, c
"add x20, x20, #0x1\n"
"lsr x20, x20, #0x1\n"
"lsr x21, x20, #0x2\n"
- "madd x23, x27, x22, x23\n" // bptr = B + n * kstride_bytes
"and x20, x20, #0x3\n"
+ "madd x23, x27, x22, x23\n" // bptr = B + n * kstride_bytes
"cbz x21, 8f\n"
"subs x21, x21, #0x1\n"
- "ld1h { z20.h }, p0/Z, [x24]\n"
- ".inst 0xa040a6f0 // ld1h { z16.h-z19.h }, pn9.b/Z, [x23]\n"
- "ld1h { z31.h }, p0/Z, [x24, #1, MUL VL]\n"
- ".inst 0xa141a6e2 // ld1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- "ld1h { z28.h }, p0/Z, [x24, #2, MUL VL]\n"
- ".inst 0xa042a6f8 // ld1h { z24.h-z27.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- "ld1h { z22.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z21.h }, p0/Z, [x24]\n"
+ ".inst 0xa140a6f8 // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23]\n"
+ "ld1h { z29.h }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa041a6ed // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1h { z4.h }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa042a6e1 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1h { z25.h }, p0/Z, [x24, #3, MUL VL]\n"
"addvl x24, x24, #4\n"
- ".inst 0xa143a6e1 // ld1h { z1.h, z5.h, z9.h, z13.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa143a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x81b00280 // fmopa za0.s, p0/M, p0/M, z20.h, z16.h\n"
+ ".inst 0x81b002a0 // fmopa za0.s, p0/M, p0/M, z21.h, z16.h\n"
"subs x21, x21, #0x1\n"
- ".inst 0x81b10281 // fmopa za1.s, p0/M, p0/M, z20.h, z17.h\n"
- ".inst 0x81b20282 // fmopa za2.s, p0/M, p0/M, z20.h, z18.h\n"
- ".inst 0x81b30283 // fmopa za3.s, p0/M, p0/M, z20.h, z19.h\n"
- "ld1h { z20.h }, p0/Z, [x24]\n"
- ".inst 0x81a203e0 // fmopa za0.s, p0/M, p0/M, z31.h, z2.h\n"
- ".inst 0xa040a6f0 // ld1h { z16.h-z19.h }, pn9.b/Z, [x23]\n"
- ".inst 0x81a603e1 // fmopa za1.s, p0/M, p0/M, z31.h, z6.h\n"
- ".inst 0x81aa03e2 // fmopa za2.s, p0/M, p0/M, z31.h, z10.h\n"
- ".inst 0x81ae03e3 // fmopa za3.s, p0/M, p0/M, z31.h, z14.h\n"
- "ld1h { z31.h }, p0/Z, [x24, #1, MUL VL]\n"
- ".inst 0x81b80380 // fmopa za0.s, p0/M, p0/M, z28.h, z24.h\n"
- ".inst 0xa141a6e2 // ld1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0x81b90381 // fmopa za1.s, p0/M, p0/M, z28.h, z25.h\n"
- ".inst 0x81ba0382 // fmopa za2.s, p0/M, p0/M, z28.h, z26.h\n"
- ".inst 0x81bb0383 // fmopa za3.s, p0/M, p0/M, z28.h, z27.h\n"
- "ld1h { z28.h }, p0/Z, [x24, #2, MUL VL]\n"
- ".inst 0xa042a6f8 // ld1h { z24.h-z27.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- ".inst 0x81a102c0 // fmopa za0.s, p0/M, p0/M, z22.h, z1.h\n"
- ".inst 0x81a502c1 // fmopa za1.s, p0/M, p0/M, z22.h, z5.h\n"
- ".inst 0x81a902c2 // fmopa za2.s, p0/M, p0/M, z22.h, z9.h\n"
- ".inst 0x81ad02c3 // fmopa za3.s, p0/M, p0/M, z22.h, z13.h\n"
- "ld1h { z22.h }, p0/Z, [x24, #3, MUL VL]\n"
+ ".inst 0x81b402a1 // fmopa za1.s, p0/M, p0/M, z21.h, z20.h\n"
+ ".inst 0x81b802a2 // fmopa za2.s, p0/M, p0/M, z21.h, z24.h\n"
+ ".inst 0x81bc02a3 // fmopa za3.s, p0/M, p0/M, z21.h, z28.h\n"
+ "ld1h { z21.h }, p0/Z, [x24]\n"
+ ".inst 0x81ac03a0 // fmopa za0.s, p0/M, p0/M, z29.h, z12.h\n"
+ ".inst 0xa140a6f0 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23]\n"
+ ".inst 0x81ad03a1 // fmopa za1.s, p0/M, p0/M, z29.h, z13.h\n"
+ ".inst 0x81ae03a2 // fmopa za2.s, p0/M, p0/M, z29.h, z14.h\n"
+ ".inst 0x81af03a3 // fmopa za3.s, p0/M, p0/M, z29.h, z15.h\n"
+ "ld1h { z29.h }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x81a00080 // fmopa za0.s, p0/M, p0/M, z4.h, z0.h\n"
+ ".inst 0xa041a6ec // ld1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x81a10081 // fmopa za1.s, p0/M, p0/M, z4.h, z1.h\n"
+ ".inst 0x81a20082 // fmopa za2.s, p0/M, p0/M, z4.h, z2.h\n"
+ ".inst 0x81a30083 // fmopa za3.s, p0/M, p0/M, z4.h, z3.h\n"
+ "ld1h { z4.h }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa042a6e0 // ld1h { z0.h-z3.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0x81b30320 // fmopa za0.s, p0/M, p0/M, z25.h, z19.h\n"
+ ".inst 0x81b70321 // fmopa za1.s, p0/M, p0/M, z25.h, z23.h\n"
+ ".inst 0x81bb0322 // fmopa za2.s, p0/M, p0/M, z25.h, z27.h\n"
+ ".inst 0x81bf0323 // fmopa za3.s, p0/M, p0/M, z25.h, z31.h\n"
+ "ld1h { z25.h }, p0/Z, [x24, #3, MUL VL]\n"
"addvl x24, x24, #4\n"
- ".inst 0xa143a6e1 // ld1h { z1.h, z5.h, z9.h, z13.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa143a6f3 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x81b00280 // fmopa za0.s, p0/M, p0/M, z20.h, z16.h\n"
- ".inst 0x81b10281 // fmopa za1.s, p0/M, p0/M, z20.h, z17.h\n"
- ".inst 0x81b20282 // fmopa za2.s, p0/M, p0/M, z20.h, z18.h\n"
- ".inst 0x81b30283 // fmopa za3.s, p0/M, p0/M, z20.h, z19.h\n"
- ".inst 0x81a203e0 // fmopa za0.s, p0/M, p0/M, z31.h, z2.h\n"
- ".inst 0x81a603e1 // fmopa za1.s, p0/M, p0/M, z31.h, z6.h\n"
- ".inst 0x81aa03e2 // fmopa za2.s, p0/M, p0/M, z31.h, z10.h\n"
- ".inst 0x81ae03e3 // fmopa za3.s, p0/M, p0/M, z31.h, z14.h\n"
- ".inst 0x81b80380 // fmopa za0.s, p0/M, p0/M, z28.h, z24.h\n"
- ".inst 0x81b90381 // fmopa za1.s, p0/M, p0/M, z28.h, z25.h\n"
- ".inst 0x81ba0382 // fmopa za2.s, p0/M, p0/M, z28.h, z26.h\n"
- ".inst 0x81bb0383 // fmopa za3.s, p0/M, p0/M, z28.h, z27.h\n"
- ".inst 0x81a102c0 // fmopa za0.s, p0/M, p0/M, z22.h, z1.h\n"
- ".inst 0x81a502c1 // fmopa za1.s, p0/M, p0/M, z22.h, z5.h\n"
- ".inst 0x81a902c2 // fmopa za2.s, p0/M, p0/M, z22.h, z9.h\n"
- ".inst 0x81ad02c3 // fmopa za3.s, p0/M, p0/M, z22.h, z13.h\n"
+ ".inst 0x81b002a0 // fmopa za0.s, p0/M, p0/M, z21.h, z16.h\n"
+ ".inst 0x81b402a1 // fmopa za1.s, p0/M, p0/M, z21.h, z20.h\n"
+ ".inst 0x81b802a2 // fmopa za2.s, p0/M, p0/M, z21.h, z24.h\n"
+ ".inst 0x81bc02a3 // fmopa za3.s, p0/M, p0/M, z21.h, z28.h\n"
+ ".inst 0x81ac03a0 // fmopa za0.s, p0/M, p0/M, z29.h, z12.h\n"
+ ".inst 0x81ad03a1 // fmopa za1.s, p0/M, p0/M, z29.h, z13.h\n"
+ ".inst 0x81ae03a2 // fmopa za2.s, p0/M, p0/M, z29.h, z14.h\n"
+ ".inst 0x81af03a3 // fmopa za3.s, p0/M, p0/M, z29.h, z15.h\n"
+ ".inst 0x81a00080 // fmopa za0.s, p0/M, p0/M, z4.h, z0.h\n"
+ ".inst 0x81a10081 // fmopa za1.s, p0/M, p0/M, z4.h, z1.h\n"
+ ".inst 0x81a20082 // fmopa za2.s, p0/M, p0/M, z4.h, z2.h\n"
+ ".inst 0x81a30083 // fmopa za3.s, p0/M, p0/M, z4.h, z3.h\n"
+ ".inst 0x81b30320 // fmopa za0.s, p0/M, p0/M, z25.h, z19.h\n"
+ ".inst 0x81b70321 // fmopa za1.s, p0/M, p0/M, z25.h, z23.h\n"
+ ".inst 0x81bb0322 // fmopa za2.s, p0/M, p0/M, z25.h, z27.h\n"
+ ".inst 0x81bf0323 // fmopa za3.s, p0/M, p0/M, z25.h, z31.h\n"
"8:" // K oddments
"cbz x20, 10f\n"
"9:" // K oddments: Loop
- "ld1h { z10.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p0/Z, [x24]\n"
"subs x20, x20, #0x1\n"
"addvl x24, x24, #1\n"
- ".inst 0xa140a6f3 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n"
+ ".inst 0xa140a6f0 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #4\n"
- ".inst 0x81b30140 // fmopa za0.s, p0/M, p0/M, z10.h, z19.h\n"
- ".inst 0x81b70141 // fmopa za1.s, p0/M, p0/M, z10.h, z23.h\n"
- ".inst 0x81bb0142 // fmopa za2.s, p0/M, p0/M, z10.h, z27.h\n"
- ".inst 0x81bf0143 // fmopa za3.s, p0/M, p0/M, z10.h, z31.h\n"
+ ".inst 0x81b002a0 // fmopa za0.s, p0/M, p0/M, z21.h, z16.h\n"
+ ".inst 0x81b402a1 // fmopa za1.s, p0/M, p0/M, z21.h, z20.h\n"
+ ".inst 0x81b802a2 // fmopa za2.s, p0/M, p0/M, z21.h, z24.h\n"
+ ".inst 0x81bc02a3 // fmopa za3.s, p0/M, p0/M, z21.h, z28.h\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x13, #1, 14f\n"
@@ -242,21 +241,21 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, c
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11]\n"
- ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xa040c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
- ".inst 0xa041c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
- ".inst 0xa042c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
- ".inst 0xa043c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa043c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
"addvl x11, x11, #16\n"
- ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa060c554 // st1w { z20.s-z23.s }, pn9.b, [x10]\n"
- ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa060c540 // st1w { z0.s-z3.s }, pn9.b, [x10]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
".inst 0xa061c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0x4, MUL VL]\n"
- ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
".inst 0xa062c544 // st1w { z4.s-z7.s }, pn9.b, [x10, #0x8, MUL VL]\n"
"cmp x12, x20\n"
@@ -268,16 +267,16 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, c
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c548 // st1w { z8.s-z11.s }, pn9.b, [x10]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa060c540 // st1w { z0.s-z3.s }, pn9.b, [x10]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c544 // st1w { z4.s-z7.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xa061c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x4, MUL VL]\n"
"cmp x12, x20\n"
".inst 0xa062c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0x8, MUL VL]\n"
- ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ ".inst 0xa063c544 // st1w { z4.s-z7.s }, pn9.b, [x10, #0xc, MUL VL]\n"
"addvl x10, x10, #16\n"
"blt 13b\n"
"b 18f\n"
@@ -285,22 +284,22 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, c
"ldr x23, [%x[args], %[offsetof_C]]\n"
"sub x22, x9, x28\n"
"cntw x21\n"
- "ld1rh { z21.h }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ld1rh { z17.h }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"ldr x20, [%x[args], %[offsetof_ldcb]]\n"
".inst 0x257a4770 // whilelt pn8.h, x27, x26, VLx2\n"
"cmp x22, x21\n"
- "ld1rh { z20.h }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rh { z16.h }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
"mov x12, #0x0\n"
"csel x22, x22, x21, LT\n"
"add x23, x23, x27, LSL #1\n" // C += n
"madd x23, x28, x20, x23\n" // C += m * ldc
"15:" // Store to output array: Accumulator loop
- ".inst 0xc0060410 // mova { z16.b-z19.b }, za0h.b[x12, 0:3]\n"
+ ".inst 0xc0060414 // mova { z20.b-z23.b }, za0h.b[x12, 0:3]\n"
"add x12, x12, #0x4\n"
- ".inst 0xc120e20e // fcvt z14.h, { z16.s-z17.s }\n"
- ".inst 0xc120e24f // fcvt z15.h, { z18.s-z19.s }\n"
+ ".inst 0xc120e28e // fcvt z14.h, { z20.s-z21.s }\n"
+ ".inst 0xc120e2cf // fcvt z15.h, { z22.s-z23.s }\n"
"cmp x12, x22, LSL #2\n"
- ".inst 0xc174c2ae // fclamp { z14.h-z15.h }, z21.h, z20.h\n"
+ ".inst 0xc170c22e // fclamp { z14.h-z15.h }, z17.h, z16.h\n"
".inst 0xa06022ee // st1h { z14.h-z15.h }, p8, [x23]\n"
"add x23, x23, x20\n"
"blt 15b\n"
@@ -309,15 +308,15 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, c
"mov x12, #0x0\n"
"cntw x20\n"
"17:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11]\n"
- ".inst 0xa041c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
- ".inst 0xa042c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
- ".inst 0xa043c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
- ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa040c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xa041c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa043c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
"addvl x11, x11, #16\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840681 // mova za1h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"blt 17b\n"
@@ -339,4 +338,4 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, c
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp
index 66d32acda7..929b0c97ef 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SME2
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sme.hpp"
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL(const __fp16 *const A, c
class cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL
{
public:
- typedef __fp16 lhs_operand_type;
- typedef __fp16 rhs_operand_type;
+ typedef __fp16 operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL;
- StdTransformsSME<lhs_operand_type, result_type, 2, 2, 2> transforms = {};
+ StdTransformsSME<operand_type, result_type, 2, 2, 2> transforms = {};
cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL(const CPUInfo *)
{
@@ -91,4 +90,4 @@ public:
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp
index 23e053c0f5..969fb41a92 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef ARM_COMPUTE_ENABLE_SME2
+#ifdef __ARM_FEATURE_SVE
#include "arm_gemm.hpp"
@@ -89,7 +89,6 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL(const __fp16 *const A, c
const __fp16 *const bias;
-
float *const accumulator_buffer;
uint64_t flags;
};
@@ -98,61 +97,61 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL(const __fp16 *const A, c
KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
".inst 0xd503477f // SMSTART ZA\n"
"ptrue p1.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
"ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
- "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
- "tbz x15, #0, 2f\n"
+ "tbz x16, #0, 2f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14]\n"
- ".inst 0xa041c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xa042c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840681 // mova za1h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
- "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
"mov x10, #0x0\n"
- "mov x9, #0x0\n"
- "ldr w28, [%x[args], %[offsetof_N]]\n"
- "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
"3:" // M and N loop
- "mov x26, x27\n"
- "tbnz x15, #0, 4f\n"
+ "mov x27, x28\n"
+ "tbnz x16, #0, 4f\n"
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "whilelt p0.h, x9, x28\n"
- "fmov z7.h, #0.0\n"
- "fmov z19.h, #1.0\n"
- "ld1h { z20.h }, p0/Z, [x20, x9, LSL #1]\n"
- "zip1 z21.h, z20.h, z7.h\n"
- "zip2 z30.h, z20.h, z7.h\n"
- ".inst 0x81b52660 // fmopa za0.s, p1/M, p1/M, z19.h, z21.h\n"
- ".inst 0x81be2661 // fmopa za1.s, p1/M, p1/M, z19.h, z30.h\n"
- ".inst 0x81b52662 // fmopa za2.s, p1/M, p1/M, z19.h, z21.h\n"
- ".inst 0x81be2663 // fmopa za3.s, p1/M, p1/M, z19.h, z30.h\n"
+ "whilelt p0.h, x10, x9\n"
+ "fmov z10.h, #0.0\n"
+ "fmov z11.h, #1.0\n"
+ "ld1h { z18.h }, p0/Z, [x20, x10, LSL #1]\n"
+ "zip1 z2.h, z18.h, z10.h\n"
+ "zip2 z19.h, z18.h, z10.h\n"
+ ".inst 0x81a22560 // fmopa za0.s, p1/M, p1/M, z11.h, z2.h\n"
+ ".inst 0x81b32561 // fmopa za1.s, p1/M, p1/M, z11.h, z19.h\n"
+ ".inst 0x81a22562 // fmopa za2.s, p1/M, p1/M, z11.h, z2.h\n"
+ ".inst 0x81b32563 // fmopa za3.s, p1/M, p1/M, z11.h, z19.h\n"
"4:" // Prepare accumulators: Test for last block
- "mov x20, x9\n"
- "mov x21, x10\n"
+ "mov x20, x10\n"
+ "mov x21, x11\n"
"incw x20, ALL, MUL #2\n"
"incw x21, ALL, MUL #2\n"
- "cmp x20, x28\n"
- "mov x20, x15\n"
- "csel x21, x10, x21, LT\n"
- "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
- "cmp x21, x11\n"
- "csel x15, x20, x15, LT\n"
+ "cmp x20, x9\n"
+ "mov x20, x16\n"
+ "csel x21, x11, x21, LT\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
"ldr x23, [%x[args], %[offsetof_B]]\n"
@@ -160,185 +159,294 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL(const __fp16 *const A, c
"add x20, x20, #0x1\n"
"lsr x20, x20, #0x1\n"
"lsr x21, x20, #0x2\n"
- "madd x23, x9, x22, x23\n" // bptr = B + n * kstride_bytes
"and x20, x20, #0x3\n"
+ "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
"cbz x21, 8f\n"
"subs x21, x21, #0x1\n"
- ".inst 0xa0402344 // ld1h { z4.h-z5.h }, pn8.b/Z, [x26]\n"
- ".inst 0xa14022f1 // ld1h { z17.h, z25.h }, pn8.b/Z, [x23]\n"
- ".inst 0xa0412352 // ld1h { z18.h-z19.h }, pn8.b/Z, [x26, #0x2, MUL VL]\n"
- ".inst 0xa14122e3 // ld1h { z3.h, z11.h }, pn8.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa042234c // ld1h { z12.h-z13.h }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xa04222fc // ld1h { z28.h-z29.h }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa1432347 // ld1h { z7.h, z15.h }, pn8.b/Z, [x26, #0x6, MUL VL]\n"
- "addvl x26, x26, #8\n"
- ".inst 0xa14322f7 // ld1h { z23.h, z31.h }, pn8.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa0402374 // ld1h { z20.h-z21.h }, pn8.b/Z, [x27]\n"
+ ".inst 0xa14022ed // ldnt1h { z5.h, z13.h }, pn8.b/Z, [x23]\n"
+ ".inst 0xa041236a // ld1h { z10.h-z11.h }, pn8.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa14122ec // ldnt1h { z4.h, z12.h }, pn8.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa0422372 // ld1h { z18.h-z19.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04222fb // ldnt1h { z26.h-z27.h }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1432366 // ld1h { z6.h, z14.h }, pn8.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa04322f9 // ldnt1h { z24.h-z25.h }, pn8.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x81b12480 // fmopa za0.s, p1/M, p1/M, z4.h, z17.h\n"
+ ".inst 0x81a52680 // fmopa za0.s, p1/M, p1/M, z20.h, z5.h\n"
"subs x21, x21, #0x1\n"
- ".inst 0x81b92481 // fmopa za1.s, p1/M, p1/M, z4.h, z25.h\n"
- ".inst 0x81b124a2 // fmopa za2.s, p1/M, p1/M, z5.h, z17.h\n"
- ".inst 0x81b924a3 // fmopa za3.s, p1/M, p1/M, z5.h, z25.h\n"
- ".inst 0xa0402344 // ld1h { z4.h-z5.h }, pn8.b/Z, [x26]\n"
- ".inst 0x81a32640 // fmopa za0.s, p1/M, p1/M, z18.h, z3.h\n"
- ".inst 0xa14022f1 // ld1h { z17.h, z25.h }, pn8.b/Z, [x23]\n"
- ".inst 0x81ab2641 // fmopa za1.s, p1/M, p1/M, z18.h, z11.h\n"
- ".inst 0x81a32662 // fmopa za2.s, p1/M, p1/M, z19.h, z3.h\n"
- ".inst 0x81ab2663 // fmopa za3.s, p1/M, p1/M, z19.h, z11.h\n"
- ".inst 0xa0412352 // ld1h { z18.h-z19.h }, pn8.b/Z, [x26, #0x2, MUL VL]\n"
- ".inst 0x81bc2580 // fmopa za0.s, p1/M, p1/M, z12.h, z28.h\n"
- ".inst 0xa14122e3 // ld1h { z3.h, z11.h }, pn8.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0x81bd2581 // fmopa za1.s, p1/M, p1/M, z12.h, z29.h\n"
- ".inst 0x81bc25a2 // fmopa za2.s, p1/M, p1/M, z13.h, z28.h\n"
- ".inst 0x81bd25a3 // fmopa za3.s, p1/M, p1/M, z13.h, z29.h\n"
- ".inst 0xa042234c // ld1h { z12.h-z13.h }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xa04222fc // ld1h { z28.h-z29.h }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0x81b724e0 // fmopa za0.s, p1/M, p1/M, z7.h, z23.h\n"
- ".inst 0x81bf24e1 // fmopa za1.s, p1/M, p1/M, z7.h, z31.h\n"
- ".inst 0x81b725e2 // fmopa za2.s, p1/M, p1/M, z15.h, z23.h\n"
- ".inst 0x81bf25e3 // fmopa za3.s, p1/M, p1/M, z15.h, z31.h\n"
- ".inst 0xa1432347 // ld1h { z7.h, z15.h }, pn8.b/Z, [x26, #0x6, MUL VL]\n"
- "addvl x26, x26, #8\n"
- ".inst 0xa14322f7 // ld1h { z23.h, z31.h }, pn8.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0x81ad2681 // fmopa za1.s, p1/M, p1/M, z20.h, z13.h\n"
+ ".inst 0x81a526a2 // fmopa za2.s, p1/M, p1/M, z21.h, z5.h\n"
+ ".inst 0x81ad26a3 // fmopa za3.s, p1/M, p1/M, z21.h, z13.h\n"
+ ".inst 0xa0402374 // ld1h { z20.h-z21.h }, pn8.b/Z, [x27]\n"
+ ".inst 0x81a42540 // fmopa za0.s, p1/M, p1/M, z10.h, z4.h\n"
+ ".inst 0xa14022e5 // ld1h { z5.h, z13.h }, pn8.b/Z, [x23]\n"
+ ".inst 0x81ac2541 // fmopa za1.s, p1/M, p1/M, z10.h, z12.h\n"
+ ".inst 0x81a42562 // fmopa za2.s, p1/M, p1/M, z11.h, z4.h\n"
+ ".inst 0x81ac2563 // fmopa za3.s, p1/M, p1/M, z11.h, z12.h\n"
+ ".inst 0xa041236a // ld1h { z10.h-z11.h }, pn8.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0x81ba2640 // fmopa za0.s, p1/M, p1/M, z18.h, z26.h\n"
+ ".inst 0xa14122e4 // ld1h { z4.h, z12.h }, pn8.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0x81bb2641 // fmopa za1.s, p1/M, p1/M, z18.h, z27.h\n"
+ ".inst 0x81ba2662 // fmopa za2.s, p1/M, p1/M, z19.h, z26.h\n"
+ ".inst 0x81bb2663 // fmopa za3.s, p1/M, p1/M, z19.h, z27.h\n"
+ ".inst 0xa0422372 // ld1h { z18.h-z19.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04222fa // ld1h { z26.h-z27.h }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x81b824c0 // fmopa za0.s, p1/M, p1/M, z6.h, z24.h\n"
+ ".inst 0x81b924c1 // fmopa za1.s, p1/M, p1/M, z6.h, z25.h\n"
+ ".inst 0x81b825c2 // fmopa za2.s, p1/M, p1/M, z14.h, z24.h\n"
+ ".inst 0x81b925c3 // fmopa za3.s, p1/M, p1/M, z14.h, z25.h\n"
+ ".inst 0xa1432366 // ld1h { z6.h, z14.h }, pn8.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa04322f8 // ld1h { z24.h-z25.h }, pn8.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x81b12480 // fmopa za0.s, p1/M, p1/M, z4.h, z17.h\n"
- ".inst 0x81b92481 // fmopa za1.s, p1/M, p1/M, z4.h, z25.h\n"
- ".inst 0x81b124a2 // fmopa za2.s, p1/M, p1/M, z5.h, z17.h\n"
- ".inst 0x81b924a3 // fmopa za3.s, p1/M, p1/M, z5.h, z25.h\n"
- ".inst 0x81a32640 // fmopa za0.s, p1/M, p1/M, z18.h, z3.h\n"
- ".inst 0x81ab2641 // fmopa za1.s, p1/M, p1/M, z18.h, z11.h\n"
- ".inst 0x81a32662 // fmopa za2.s, p1/M, p1/M, z19.h, z3.h\n"
- ".inst 0x81ab2663 // fmopa za3.s, p1/M, p1/M, z19.h, z11.h\n"
- ".inst 0x81bc2580 // fmopa za0.s, p1/M, p1/M, z12.h, z28.h\n"
- ".inst 0x81bd2581 // fmopa za1.s, p1/M, p1/M, z12.h, z29.h\n"
- ".inst 0x81bc25a2 // fmopa za2.s, p1/M, p1/M, z13.h, z28.h\n"
- ".inst 0x81bd25a3 // fmopa za3.s, p1/M, p1/M, z13.h, z29.h\n"
- ".inst 0x81b724e0 // fmopa za0.s, p1/M, p1/M, z7.h, z23.h\n"
- ".inst 0x81bf24e1 // fmopa za1.s, p1/M, p1/M, z7.h, z31.h\n"
- ".inst 0x81b725e2 // fmopa za2.s, p1/M, p1/M, z15.h, z23.h\n"
- ".inst 0x81bf25e3 // fmopa za3.s, p1/M, p1/M, z15.h, z31.h\n"
+ ".inst 0x81a52680 // fmopa za0.s, p1/M, p1/M, z20.h, z5.h\n"
+ ".inst 0x81ad2681 // fmopa za1.s, p1/M, p1/M, z20.h, z13.h\n"
+ ".inst 0x81a526a2 // fmopa za2.s, p1/M, p1/M, z21.h, z5.h\n"
+ ".inst 0x81ad26a3 // fmopa za3.s, p1/M, p1/M, z21.h, z13.h\n"
+ ".inst 0x81a42540 // fmopa za0.s, p1/M, p1/M, z10.h, z4.h\n"
+ ".inst 0x81ac2541 // fmopa za1.s, p1/M, p1/M, z10.h, z12.h\n"
+ ".inst 0x81a42562 // fmopa za2.s, p1/M, p1/M, z11.h, z4.h\n"
+ ".inst 0x81ac2563 // fmopa za3.s, p1/M, p1/M, z11.h, z12.h\n"
+ ".inst 0x81ba2640 // fmopa za0.s, p1/M, p1/M, z18.h, z26.h\n"
+ ".inst 0x81bb2641 // fmopa za1.s, p1/M, p1/M, z18.h, z27.h\n"
+ ".inst 0x81ba2662 // fmopa za2.s, p1/M, p1/M, z19.h, z26.h\n"
+ ".inst 0x81bb2663 // fmopa za3.s, p1/M, p1/M, z19.h, z27.h\n"
+ ".inst 0x81b824c0 // fmopa za0.s, p1/M, p1/M, z6.h, z24.h\n"
+ ".inst 0x81b924c1 // fmopa za1.s, p1/M, p1/M, z6.h, z25.h\n"
+ ".inst 0x81b825c2 // fmopa za2.s, p1/M, p1/M, z14.h, z24.h\n"
+ ".inst 0x81b925c3 // fmopa za3.s, p1/M, p1/M, z14.h, z25.h\n"
"8:" // K oddments
"cbz x20, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa0402346 // ld1h { z6.h-z7.h }, pn8.b/Z, [x26]\n"
+ ".inst 0xa0402374 // ld1h { z20.h-z21.h }, pn8.b/Z, [x27]\n"
"subs x20, x20, #0x1\n"
- "addvl x26, x26, #2\n"
- ".inst 0xa04022e0 // ld1h { z0.h-z1.h }, pn8.b/Z, [x23]\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0xa14022e5 // ld1h { z5.h, z13.h }, pn8.b/Z, [x23]\n"
"addvl x23, x23, #2\n"
- ".inst 0x81a024c0 // fmopa za0.s, p1/M, p1/M, z6.h, z0.h\n"
- ".inst 0x81a124c1 // fmopa za1.s, p1/M, p1/M, z6.h, z1.h\n"
- ".inst 0x81a024e2 // fmopa za2.s, p1/M, p1/M, z7.h, z0.h\n"
- ".inst 0x81a124e3 // fmopa za3.s, p1/M, p1/M, z7.h, z1.h\n"
+ ".inst 0x81a52680 // fmopa za0.s, p1/M, p1/M, z20.h, z5.h\n"
+ ".inst 0x81ad2681 // fmopa za1.s, p1/M, p1/M, z20.h, z13.h\n"
+ ".inst 0x81a526a2 // fmopa za2.s, p1/M, p1/M, z21.h, z5.h\n"
+ ".inst 0x81ad26a3 // fmopa za3.s, p1/M, p1/M, z21.h, z13.h\n"
"bgt 9b\n"
"10:" // K oddments: End
- "tbz x15, #1, 14f\n"
- "tbz x15, #0, 12f\n"
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14]\n"
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xa041c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
- ".inst 0xa042c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa060c1ac // st1w { z12.s-z15.s }, pn8.b, [x13]\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa061c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa060c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0xc, MUL VL]\n"
- "addvl x13, x13, #16\n"
+ ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
"blt 11b\n"
- "b 18f\n"
+ "b 23f\n"
"12:" // Store to partial result buffer: Store only
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa060c1dc // st1w { z28.s-z31.s }, pn8.b, [x14]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c1bc // st1w { z28.s-z31.s }, pn8.b, [x13, #0x8, MUL VL]\n"
- ".inst 0xa063c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13, #0xc, MUL VL]\n"
- "addvl x13, x13, #16\n"
+ ".inst 0xa062c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
"blt 13b\n"
- "b 18f\n"
+ "b 23f\n"
"14:" // Store to output array
- "ldr x25, [%x[args], %[offsetof_C]]\n"
- "sub x24, x11, x10\n"
- "cntw x23, ALL, MUL #2\n"
- "ld1rh { z18.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
- "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
- "whilelt p0.h, x9, x28\n"
- "cmp x24, x23\n"
- "ld1rh { z17.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "sub x25, x13, x11\n"
+ "cntw x24\n"
+ "ld1rh { z20.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "whilelt p0.h, x10, x9\n"
+ "cmp x25, x24\n"
+ "ld1rh { z19.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "csel x22, x25, x24, LT\n"
"mov x12, #0x0\n"
- "mov x21, #0x0\n"
- "add x25, x25, x9, LSL #1\n" // C += n
- "mov x20, #0x2\n"
- "madd x25, x10, x22, x25\n" // C += m * ldc
- "csel x24, x24, x23, LT\n"
- "15:" // Store to output array: Accumulator loop
- ".inst 0xc006000e // mova { z14.b-z15.b }, za0h.b[x12, 0:1]\n"
+ "add x26, x26, x10, LSL #1\n" // C += n
+ "lsr x21, x22, #0x2\n"
+ "madd x26, x11, x23, x26\n" // C += m * ldc
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ "fcvt z8.h, p1/m, z8.s\n"
+ "fcvt z9.h, p1/m, z9.s\n"
+ "fcvt z10.h, p1/m, z10.s\n"
+ "fcvt z11.h, p1/m, z11.s\n"
"add x12, x12, #0x4\n"
- "cmp x12, x23, LSL #1\n"
- "add x21, x21, #0x1\n"
- ".inst 0xc120e1d0 // fcvt z16.h, { z14.s-z15.s }\n"
- "csel x12, x12, x20, LT\n"
- "cmp x21, x24\n"
- ".inst 0x64712650 // fclamp z16.h, z18.h, z17.h\n"
- "st1h { z16.h }, p0, [x25]\n"
- "add x25, x25, x22\n"
+ "fcvt z28.h, p1/m, z28.s\n"
+ "fcvt z29.h, p1/m, z29.s\n"
+ "cmp x12, x21, LSL #2\n"
+ "fcvt z30.h, p1/m, z30.s\n"
+ "fcvt z31.h, p1/m, z31.s\n"
+ ".inst 0xc173ca88 // fclamp { z8.h-z11.h }, z20.h, z19.h\n"
+ ".inst 0xc173ca9c // fclamp { z28.h-z31.h }, z20.h, z19.h\n"
+ "uzp1 z16.h, z8.h, z28.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "uzp1 z18.h, z9.h, z29.h\n"
+ "uzp1 z17.h, z10.h, z30.h\n"
+ "uzp1 z16.h, z11.h, z31.h\n"
+ "st1h { z18.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z17.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
"blt 15b\n"
- "16:" // Store to output array: End
- "tbz x15, #0, 18f\n"
+ "16:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ "fcvt z8.h, p1/m, z8.s\n"
+ "fcvt z9.h, p1/m, z9.s\n"
+ "fcvt z10.h, p1/m, z10.s\n"
+ "fcvt z11.h, p1/m, z11.s\n"
+ "subs x20, x20, #0x1\n"
+ "fcvt z12.h, p1/m, z12.s\n"
+ "fcvt z13.h, p1/m, z13.s\n"
+ "fcvt z14.h, p1/m, z14.s\n"
+ "fcvt z15.h, p1/m, z15.s\n"
+ ".inst 0xc173ca88 // fclamp { z8.h-z11.h }, z20.h, z19.h\n"
+ ".inst 0xc173ca8c // fclamp { z12.h-z15.h }, z20.h, z19.h\n"
+ "uzp1 z16.h, z8.h, z12.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ "uzp1 z16.h, z9.h, z13.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 17f\n"
+ "uzp1 z16.h, z10.h, z14.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "17:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 21f\n"
+ "whilelt p0.h, x10, x9\n"
+ "cmp x25, x24\n"
+ "csel x20, x25, x24, LT\n"
+ "mov x12, #0x0\n"
+ "lsr x21, x20, #0x2\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 19f\n"
+ "18:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ "fcvt z0.h, p1/m, z0.s\n"
+ "fcvt z1.h, p1/m, z1.s\n"
+ "fcvt z2.h, p1/m, z2.s\n"
+ "fcvt z3.h, p1/m, z3.s\n"
+ "add x12, x12, #0x4\n"
+ "fcvt z28.h, p1/m, z28.s\n"
+ "fcvt z29.h, p1/m, z29.s\n"
+ "cmp x12, x21, LSL #2\n"
+ "fcvt z30.h, p1/m, z30.s\n"
+ "fcvt z31.h, p1/m, z31.s\n"
+ ".inst 0xc173ca80 // fclamp { z0.h-z3.h }, z20.h, z19.h\n"
+ ".inst 0xc173ca9c // fclamp { z28.h-z31.h }, z20.h, z19.h\n"
+ "uzp1 z16.h, z0.h, z28.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "uzp1 z18.h, z1.h, z29.h\n"
+ "uzp1 z17.h, z2.h, z30.h\n"
+ "uzp1 z16.h, z3.h, z31.h\n"
+ "st1h { z18.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z17.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 20f\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ "fcvt z28.h, p1/m, z28.s\n"
+ "fcvt z29.h, p1/m, z29.s\n"
+ "fcvt z30.h, p1/m, z30.s\n"
+ "fcvt z31.h, p1/m, z31.s\n"
+ "subs x20, x20, #0x1\n"
+ "fcvt z12.h, p1/m, z12.s\n"
+ "fcvt z13.h, p1/m, z13.s\n"
+ "fcvt z14.h, p1/m, z14.s\n"
+ "fcvt z15.h, p1/m, z15.s\n"
+ ".inst 0xc173ca9c // fclamp { z28.h-z31.h }, z20.h, z19.h\n"
+ ".inst 0xc173ca8c // fclamp { z12.h-z15.h }, z20.h, z19.h\n"
+ "uzp1 z16.h, z28.h, z12.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 20f\n"
+ "subs x20, x20, #0x1\n"
+ "uzp1 z16.h, z29.h, z13.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 20f\n"
+ "uzp1 z16.h, z30.h, z14.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "20:" // Store to output array: Accumulator row 1 oddments: End
+ "21:" // Store to output array: End
+ "tbz x16, #0, 23f\n"
"mov x12, #0x0\n"
"cntw x20\n"
- "17:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14]\n"
- ".inst 0xa041c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xa042c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ "22:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xa041c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- "blt 17b\n"
- "18:" // End block
- "incw x9, ALL, MUL #2\n"
- "cmp x9, x28\n"
- "blt 3b\n"
+ "blt 22b\n"
+ "23:" // End block
"incw x10, ALL, MUL #2\n"
- "mov x9, #0x0\n"
- "cmp x10, x11\n"
- "mov x27, x26\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #2\n"
+ "mov x10, #0x0\n"
+ "cmp x11, x13\n"
+ "mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp
index f63eb30efd..dba440632d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SME2
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sme.hpp"
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
class cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL
{
public:
- typedef __fp16 lhs_operand_type;
- typedef __fp16 rhs_operand_type;
+ typedef __fp16 operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL;
- StdTransformsSME<lhs_operand_type, result_type, 4, 1, 2> transforms = {};
+ StdTransformsSME<operand_type, result_type, 4, 1, 2> transforms = {};
cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const CPUInfo *)
{
@@ -91,4 +90,4 @@ public:
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp
index 3d98d3fe16..ee66b4b95c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef ARM_COMPUTE_ENABLE_SME2
+#ifdef __ARM_FEATURE_SVE
#include "arm_gemm.hpp"
@@ -89,7 +89,6 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
const __fp16 *const bias;
-
float *const accumulator_buffer;
uint64_t flags;
};
@@ -108,15 +107,15 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa040c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xa041c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
"addvl x15, x15, #16\n"
- ".inst 0xc0840681 // mova za1h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"blt 1b\n"
@@ -134,14 +133,14 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
"whilelt p0.h, x10, x9\n"
- "fmov z13.h, #0.0\n"
- "fmov z27.h, #1.0\n"
- "ld1h { z14.h }, p0/Z, [x20, x10, LSL #1]\n"
- "zip1 z30.h, z14.h, z13.h\n"
- ".inst 0x81be2760 // fmopa za0.s, p1/M, p1/M, z27.h, z30.h\n"
- ".inst 0x81be2761 // fmopa za1.s, p1/M, p1/M, z27.h, z30.h\n"
- ".inst 0x81be2762 // fmopa za2.s, p1/M, p1/M, z27.h, z30.h\n"
- ".inst 0x81be2763 // fmopa za3.s, p1/M, p1/M, z27.h, z30.h\n"
+ "fmov z5.h, #0.0\n"
+ "fmov z18.h, #1.0\n"
+ "ld1h { z31.h }, p0/Z, [x20, x10, LSL #1]\n"
+ "zip1 z15.h, z31.h, z5.h\n"
+ ".inst 0x81af2640 // fmopa za0.s, p1/M, p1/M, z18.h, z15.h\n"
+ ".inst 0x81af2641 // fmopa za1.s, p1/M, p1/M, z18.h, z15.h\n"
+ ".inst 0x81af2642 // fmopa za2.s, p1/M, p1/M, z18.h, z15.h\n"
+ ".inst 0x81af2643 // fmopa za3.s, p1/M, p1/M, z18.h, z15.h\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -160,79 +159,79 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
"add x20, x20, #0x1\n"
"lsr x20, x20, #0x1\n"
"lsr x21, x20, #0x2\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
"and x20, x20, #0x3\n"
+ "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
"cbz x21, 8f\n"
"subs x21, x21, #0x1\n"
- ".inst 0xa040a778 // ld1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- "ld1h { z1.h }, p1/Z, [x23]\n"
- ".inst 0xa041a764 // ld1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa042a77c // ld1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa043a76c // ld1h { z12.h-z15.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa140a773 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x27]\n"
+ "ldnt1h { z17.h }, p1/Z, [x23]\n"
+ ".inst 0xa041a76c // ld1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1h { z26.h }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa042a760 // ld1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1h { z30.h }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa143a770 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1h { z0.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1h { z18.h }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x81a12700 // fmopa za0.s, p1/M, p1/M, z24.h, z1.h\n"
+ ".inst 0x81b12660 // fmopa za0.s, p1/M, p1/M, z19.h, z17.h\n"
"subs x21, x21, #0x1\n"
- ".inst 0x81a12721 // fmopa za1.s, p1/M, p1/M, z25.h, z1.h\n"
- ".inst 0x81a12742 // fmopa za2.s, p1/M, p1/M, z26.h, z1.h\n"
- ".inst 0x81a12763 // fmopa za3.s, p1/M, p1/M, z27.h, z1.h\n"
- ".inst 0xa040a778 // ld1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
- ".inst 0x81b72480 // fmopa za0.s, p1/M, p1/M, z4.h, z23.h\n"
- "ld1h { z1.h }, p1/Z, [x23]\n"
- ".inst 0x81b724a1 // fmopa za1.s, p1/M, p1/M, z5.h, z23.h\n"
- ".inst 0x81b724c2 // fmopa za2.s, p1/M, p1/M, z6.h, z23.h\n"
- ".inst 0x81b724e3 // fmopa za3.s, p1/M, p1/M, z7.h, z23.h\n"
- ".inst 0xa041a764 // ld1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0x81b32780 // fmopa za0.s, p1/M, p1/M, z28.h, z19.h\n"
- "ld1h { z23.h }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0x81b327a1 // fmopa za1.s, p1/M, p1/M, z29.h, z19.h\n"
- ".inst 0x81b327c2 // fmopa za2.s, p1/M, p1/M, z30.h, z19.h\n"
- ".inst 0x81b327e3 // fmopa za3.s, p1/M, p1/M, z31.h, z19.h\n"
- ".inst 0xa042a77c // ld1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0x81a02580 // fmopa za0.s, p1/M, p1/M, z12.h, z0.h\n"
- ".inst 0x81a025a1 // fmopa za1.s, p1/M, p1/M, z13.h, z0.h\n"
- ".inst 0x81a025c2 // fmopa za2.s, p1/M, p1/M, z14.h, z0.h\n"
- ".inst 0x81a025e3 // fmopa za3.s, p1/M, p1/M, z15.h, z0.h\n"
- ".inst 0xa043a76c // ld1h { z12.h-z15.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0x81b126e1 // fmopa za1.s, p1/M, p1/M, z23.h, z17.h\n"
+ ".inst 0x81b12762 // fmopa za2.s, p1/M, p1/M, z27.h, z17.h\n"
+ ".inst 0x81b127e3 // fmopa za3.s, p1/M, p1/M, z31.h, z17.h\n"
+ ".inst 0xa140a773 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x27]\n"
+ ".inst 0x81ba2580 // fmopa za0.s, p1/M, p1/M, z12.h, z26.h\n"
+ "ld1h { z17.h }, p1/Z, [x23]\n"
+ ".inst 0x81ba25a1 // fmopa za1.s, p1/M, p1/M, z13.h, z26.h\n"
+ ".inst 0x81ba25c2 // fmopa za2.s, p1/M, p1/M, z14.h, z26.h\n"
+ ".inst 0x81ba25e3 // fmopa za3.s, p1/M, p1/M, z15.h, z26.h\n"
+ ".inst 0xa041a76c // ld1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0x81be2400 // fmopa za0.s, p1/M, p1/M, z0.h, z30.h\n"
+ "ld1h { z26.h }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0x81be2421 // fmopa za1.s, p1/M, p1/M, z1.h, z30.h\n"
+ ".inst 0x81be2442 // fmopa za2.s, p1/M, p1/M, z2.h, z30.h\n"
+ ".inst 0x81be2463 // fmopa za3.s, p1/M, p1/M, z3.h, z30.h\n"
+ ".inst 0xa042a760 // ld1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "ld1h { z30.h }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x81b22600 // fmopa za0.s, p1/M, p1/M, z16.h, z18.h\n"
+ ".inst 0x81b22681 // fmopa za1.s, p1/M, p1/M, z20.h, z18.h\n"
+ ".inst 0x81b22702 // fmopa za2.s, p1/M, p1/M, z24.h, z18.h\n"
+ ".inst 0x81b22783 // fmopa za3.s, p1/M, p1/M, z28.h, z18.h\n"
+ ".inst 0xa143a770 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1h { z0.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x81a12700 // fmopa za0.s, p1/M, p1/M, z24.h, z1.h\n"
- ".inst 0x81a12721 // fmopa za1.s, p1/M, p1/M, z25.h, z1.h\n"
- ".inst 0x81a12742 // fmopa za2.s, p1/M, p1/M, z26.h, z1.h\n"
- ".inst 0x81a12763 // fmopa za3.s, p1/M, p1/M, z27.h, z1.h\n"
- ".inst 0x81b72480 // fmopa za0.s, p1/M, p1/M, z4.h, z23.h\n"
- ".inst 0x81b724a1 // fmopa za1.s, p1/M, p1/M, z5.h, z23.h\n"
- ".inst 0x81b724c2 // fmopa za2.s, p1/M, p1/M, z6.h, z23.h\n"
- ".inst 0x81b724e3 // fmopa za3.s, p1/M, p1/M, z7.h, z23.h\n"
- ".inst 0x81b32780 // fmopa za0.s, p1/M, p1/M, z28.h, z19.h\n"
- ".inst 0x81b327a1 // fmopa za1.s, p1/M, p1/M, z29.h, z19.h\n"
- ".inst 0x81b327c2 // fmopa za2.s, p1/M, p1/M, z30.h, z19.h\n"
- ".inst 0x81b327e3 // fmopa za3.s, p1/M, p1/M, z31.h, z19.h\n"
- ".inst 0x81a02580 // fmopa za0.s, p1/M, p1/M, z12.h, z0.h\n"
- ".inst 0x81a025a1 // fmopa za1.s, p1/M, p1/M, z13.h, z0.h\n"
- ".inst 0x81a025c2 // fmopa za2.s, p1/M, p1/M, z14.h, z0.h\n"
- ".inst 0x81a025e3 // fmopa za3.s, p1/M, p1/M, z15.h, z0.h\n"
+ ".inst 0x81b12660 // fmopa za0.s, p1/M, p1/M, z19.h, z17.h\n"
+ ".inst 0x81b126e1 // fmopa za1.s, p1/M, p1/M, z23.h, z17.h\n"
+ ".inst 0x81b12762 // fmopa za2.s, p1/M, p1/M, z27.h, z17.h\n"
+ ".inst 0x81b127e3 // fmopa za3.s, p1/M, p1/M, z31.h, z17.h\n"
+ ".inst 0x81ba2580 // fmopa za0.s, p1/M, p1/M, z12.h, z26.h\n"
+ ".inst 0x81ba25a1 // fmopa za1.s, p1/M, p1/M, z13.h, z26.h\n"
+ ".inst 0x81ba25c2 // fmopa za2.s, p1/M, p1/M, z14.h, z26.h\n"
+ ".inst 0x81ba25e3 // fmopa za3.s, p1/M, p1/M, z15.h, z26.h\n"
+ ".inst 0x81be2400 // fmopa za0.s, p1/M, p1/M, z0.h, z30.h\n"
+ ".inst 0x81be2421 // fmopa za1.s, p1/M, p1/M, z1.h, z30.h\n"
+ ".inst 0x81be2442 // fmopa za2.s, p1/M, p1/M, z2.h, z30.h\n"
+ ".inst 0x81be2463 // fmopa za3.s, p1/M, p1/M, z3.h, z30.h\n"
+ ".inst 0x81b22600 // fmopa za0.s, p1/M, p1/M, z16.h, z18.h\n"
+ ".inst 0x81b22681 // fmopa za1.s, p1/M, p1/M, z20.h, z18.h\n"
+ ".inst 0x81b22702 // fmopa za2.s, p1/M, p1/M, z24.h, z18.h\n"
+ ".inst 0x81b22783 // fmopa za3.s, p1/M, p1/M, z28.h, z18.h\n"
"8:" // K oddments
"cbz x20, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa040a768 // ld1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa140a773 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x27]\n"
"subs x20, x20, #0x1\n"
"addvl x27, x27, #4\n"
- "ld1h { z12.h }, p1/Z, [x23]\n"
+ "ld1h { z17.h }, p1/Z, [x23]\n"
"addvl x23, x23, #1\n"
- ".inst 0x81ac2500 // fmopa za0.s, p1/M, p1/M, z8.h, z12.h\n"
- ".inst 0x81ac2521 // fmopa za1.s, p1/M, p1/M, z9.h, z12.h\n"
- ".inst 0x81ac2542 // fmopa za2.s, p1/M, p1/M, z10.h, z12.h\n"
- ".inst 0x81ac2563 // fmopa za3.s, p1/M, p1/M, z11.h, z12.h\n"
+ ".inst 0x81b12660 // fmopa za0.s, p1/M, p1/M, z19.h, z17.h\n"
+ ".inst 0x81b126e1 // fmopa za1.s, p1/M, p1/M, z23.h, z17.h\n"
+ ".inst 0x81b12762 // fmopa za2.s, p1/M, p1/M, z27.h, z17.h\n"
+ ".inst 0x81b127e3 // fmopa za3.s, p1/M, p1/M, z31.h, z17.h\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -240,25 +239,25 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
- ".inst 0xa041c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
- ".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
"addvl x15, x15, #16\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa060c5dc // st1w { z28.s-z31.s }, pn9.b, [x14]\n"
- ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa060c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
".inst 0xa061c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x4, MUL VL]\n"
".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa063c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 29f\n"
@@ -266,16 +265,16 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa060c5dc // st1w { z28.s-z31.s }, pn9.b, [x14]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa061c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 29f\n"
@@ -283,11 +282,11 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
"ldr x26, [%x[args], %[offsetof_C]]\n"
"sub x25, x13, x11\n"
"cntw x24\n"
- "ld1rh { z21.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ld1rh { z29.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"ldr x23, [%x[args], %[offsetof_ldcb]]\n"
"whilelt p0.s, x10, x9\n"
"cmp x25, x24\n"
- "ld1rh { z20.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rh { z28.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
"csel x22, x25, x24, LT\n"
"mov x12, #0x0\n"
"add x26, x26, x10, LSL #1\n" // C += n
@@ -296,21 +295,21 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
"add x12, x12, #0x4\n"
- "fcvt z28.h, p1/m, z28.s\n"
- "fcvt z29.h, p1/m, z29.s\n"
+ "fcvt z0.h, p1/m, z0.s\n"
+ "fcvt z1.h, p1/m, z1.s\n"
+ "fcvt z2.h, p1/m, z2.s\n"
+ "fcvt z3.h, p1/m, z3.s\n"
"cmp x12, x21, LSL #2\n"
- "fcvt z30.h, p1/m, z30.s\n"
- "fcvt z31.h, p1/m, z31.s\n"
- ".inst 0xc174cabc // fclamp { z28.h-z31.h }, z21.h, z20.h\n"
- "st1h { z28.s }, p0, [x26]\n"
+ ".inst 0xc17ccba0 // fclamp { z0.h-z3.h }, z29.h, z28.h\n"
+ "st1h { z0.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z29.s }, p0, [x26]\n"
+ "st1h { z1.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z30.s }, p0, [x26]\n"
+ "st1h { z2.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z31.s }, p0, [x26]\n"
+ "st1h { z3.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 15b\n"
"16:" // Store to output array: Accumulator row 0 oddments
@@ -321,7 +320,7 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
"fcvt z17.h, p1/m, z17.s\n"
"fcvt z18.h, p1/m, z18.s\n"
"fcvt z19.h, p1/m, z19.s\n"
- ".inst 0xc174cab0 // fclamp { z16.h-z19.h }, z21.h, z20.h\n"
+ ".inst 0xc17ccbb0 // fclamp { z16.h-z19.h }, z29.h, z28.h\n"
"st1h { z16.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 17f\n"
@@ -334,152 +333,155 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
"17:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
+ "whilelt p0.s, x10, x9\n"
"cmp x25, x24\n"
- "mov x12, #0x0\n"
"csel x22, x25, x24, LT\n"
+ "mov x12, #0x0\n"
"lsr x21, x22, #0x2\n"
"and x20, x22, #0x3\n"
"cbz x21, 19f\n"
"18:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
"add x12, x12, #0x4\n"
- "fcvt z0.h, p1/m, z0.s\n"
- "fcvt z1.h, p1/m, z1.s\n"
+ "fcvt z24.h, p1/m, z24.s\n"
+ "fcvt z25.h, p1/m, z25.s\n"
+ "fcvt z26.h, p1/m, z26.s\n"
+ "fcvt z27.h, p1/m, z27.s\n"
"cmp x12, x21, LSL #2\n"
- "fcvt z2.h, p1/m, z2.s\n"
- "fcvt z3.h, p1/m, z3.s\n"
- ".inst 0xc174caa0 // fclamp { z0.h-z3.h }, z21.h, z20.h\n"
- "st1h { z0.s }, p0, [x26]\n"
+ ".inst 0xc17ccbb8 // fclamp { z24.h-z27.h }, z29.h, z28.h\n"
+ "st1h { z24.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z1.s }, p0, [x26]\n"
+ "st1h { z25.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z2.s }, p0, [x26]\n"
+ "st1h { z26.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z3.s }, p0, [x26]\n"
+ "st1h { z27.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 18b\n"
"19:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 20f\n"
- ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
"subs x20, x20, #0x1\n"
- "fcvt z24.h, p1/m, z24.s\n"
- "fcvt z25.h, p1/m, z25.s\n"
- "fcvt z26.h, p1/m, z26.s\n"
- "fcvt z27.h, p1/m, z27.s\n"
- ".inst 0xc174cab8 // fclamp { z24.h-z27.h }, z21.h, z20.h\n"
- "st1h { z24.s }, p0, [x26]\n"
+ "fcvt z0.h, p1/m, z0.s\n"
+ "fcvt z1.h, p1/m, z1.s\n"
+ "fcvt z2.h, p1/m, z2.s\n"
+ "fcvt z3.h, p1/m, z3.s\n"
+ ".inst 0xc17ccba0 // fclamp { z0.h-z3.h }, z29.h, z28.h\n"
+ "st1h { z0.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- "st1h { z25.s }, p0, [x26]\n"
+ "st1h { z1.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 20f\n"
- "st1h { z26.s }, p0, [x26]\n"
+ "st1h { z2.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"20:" // Store to output array: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
+ "whilelt p0.s, x10, x9\n"
"cmp x25, x24\n"
- "mov x12, #0x0\n"
"csel x22, x25, x24, LT\n"
+ "mov x12, #0x0\n"
"lsr x21, x22, #0x2\n"
"and x20, x22, #0x3\n"
"cbz x21, 22f\n"
"21:" // Store to output array: Accumulator row 2 loop
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
"add x12, x12, #0x4\n"
- "fcvt z16.h, p1/m, z16.s\n"
- "fcvt z17.h, p1/m, z17.s\n"
+ "fcvt z20.h, p1/m, z20.s\n"
+ "fcvt z21.h, p1/m, z21.s\n"
+ "fcvt z22.h, p1/m, z22.s\n"
+ "fcvt z23.h, p1/m, z23.s\n"
"cmp x12, x21, LSL #2\n"
- "fcvt z18.h, p1/m, z18.s\n"
- "fcvt z19.h, p1/m, z19.s\n"
- ".inst 0xc174cab0 // fclamp { z16.h-z19.h }, z21.h, z20.h\n"
- "st1h { z16.s }, p0, [x26]\n"
+ ".inst 0xc17ccbb4 // fclamp { z20.h-z23.h }, z29.h, z28.h\n"
+ "st1h { z20.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z17.s }, p0, [x26]\n"
+ "st1h { z21.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z18.s }, p0, [x26]\n"
+ "st1h { z22.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z19.s }, p0, [x26]\n"
+ "st1h { z23.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 21b\n"
"22:" // Store to output array: Accumulator row 2 oddments
"cbz x20, 23f\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
"subs x20, x20, #0x1\n"
- "fcvt z28.h, p1/m, z28.s\n"
- "fcvt z29.h, p1/m, z29.s\n"
- "fcvt z30.h, p1/m, z30.s\n"
- "fcvt z31.h, p1/m, z31.s\n"
- ".inst 0xc174cabc // fclamp { z28.h-z31.h }, z21.h, z20.h\n"
- "st1h { z28.s }, p0, [x26]\n"
+ "fcvt z12.h, p1/m, z12.s\n"
+ "fcvt z13.h, p1/m, z13.s\n"
+ "fcvt z14.h, p1/m, z14.s\n"
+ "fcvt z15.h, p1/m, z15.s\n"
+ ".inst 0xc17ccbac // fclamp { z12.h-z15.h }, z29.h, z28.h\n"
+ "st1h { z12.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 23f\n"
"subs x20, x20, #0x1\n"
- "st1h { z29.s }, p0, [x26]\n"
+ "st1h { z13.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 23f\n"
- "st1h { z30.s }, p0, [x26]\n"
+ "st1h { z14.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"23:" // Store to output array: Accumulator row 2 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
+ "whilelt p0.s, x10, x9\n"
"cmp x25, x24\n"
- "mov x12, #0x0\n"
"csel x20, x25, x24, LT\n"
+ "mov x12, #0x0\n"
"lsr x21, x20, #0x2\n"
"and x20, x20, #0x3\n"
"cbz x21, 25f\n"
"24:" // Store to output array: Accumulator row 3 loop
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
"add x12, x12, #0x4\n"
- "fcvt z28.h, p1/m, z28.s\n"
- "fcvt z29.h, p1/m, z29.s\n"
+ "fcvt z4.h, p1/m, z4.s\n"
+ "fcvt z5.h, p1/m, z5.s\n"
+ "fcvt z6.h, p1/m, z6.s\n"
+ "fcvt z7.h, p1/m, z7.s\n"
"cmp x12, x21, LSL #2\n"
- "fcvt z30.h, p1/m, z30.s\n"
- "fcvt z31.h, p1/m, z31.s\n"
- ".inst 0xc174cabc // fclamp { z28.h-z31.h }, z21.h, z20.h\n"
- "st1h { z28.s }, p0, [x26]\n"
+ ".inst 0xc17ccba4 // fclamp { z4.h-z7.h }, z29.h, z28.h\n"
+ "st1h { z4.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z29.s }, p0, [x26]\n"
+ "st1h { z5.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z30.s }, p0, [x26]\n"
+ "st1h { z6.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1h { z31.s }, p0, [x26]\n"
+ "st1h { z7.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 24b\n"
"25:" // Store to output array: Accumulator row 3 oddments
"cbz x20, 26f\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
"subs x20, x20, #0x1\n"
- "fcvt z28.h, p1/m, z28.s\n"
- "fcvt z29.h, p1/m, z29.s\n"
- "fcvt z30.h, p1/m, z30.s\n"
- "fcvt z31.h, p1/m, z31.s\n"
- ".inst 0xc174cabc // fclamp { z28.h-z31.h }, z21.h, z20.h\n"
- "st1h { z28.s }, p0, [x26]\n"
+ "fcvt z4.h, p1/m, z4.s\n"
+ "fcvt z5.h, p1/m, z5.s\n"
+ "fcvt z6.h, p1/m, z6.s\n"
+ "fcvt z7.h, p1/m, z7.s\n"
+ ".inst 0xc17ccba4 // fclamp { z4.h-z7.h }, z29.h, z28.h\n"
+ "st1h { z4.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 26f\n"
"subs x20, x20, #0x1\n"
- "st1h { z29.s }, p0, [x26]\n"
+ "st1h { z5.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 26f\n"
- "st1h { z30.s }, p0, [x26]\n"
+ "st1h { z6.s }, p0, [x26]\n"
"26:" // Store to output array: Accumulator row 3 oddments: End
"27:" // Store to output array: End
"tbz x16, #0, 29f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"28:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
"addvl x15, x15, #16\n"
".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"blt 28b\n"
@@ -501,4 +503,4 @@ void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, c
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
index e3e2a0639f..9486319cfb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
class cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL
{
public:
- typedef float lhs_operand_type;
- typedef float rhs_operand_type;
+ typedef float operand_type;
typedef float result_type;
typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_1VLx4VL;
- StdTransformsSME<lhs_operand_type, result_type, 1, 4, 1> transforms = {};
+ StdTransformsSME<operand_type, result_type, 1, 4, 1> transforms = {};
cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
index 1f9f08e401..19e5d52b53 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
@@ -48,6 +48,7 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
B(B), kstride_bytes(K * sizeof(float)),
C(C), ldcb(ldc * sizeof(float)),
M(M), N(N), K(K),
+ n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2),
min(-std::numeric_limits<float>::infinity()),
max(std::numeric_limits<float>::infinity()),
bias(bias),
@@ -87,13 +88,12 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
const long kstride_bytes;
float *const C;
const long ldcb;
- const long M, N, K;
+ const long M, N, K, n_loops, n_tail_iters;
float min = -std::numeric_limits<float>::infinity();
float max = std::numeric_limits<float>::infinity();
const float *const bias;
-
float *const accumulator_buffer;
uint64_t flags;
};
@@ -112,17 +112,17 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xa040c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
- "addvl x14, x14, #16\n"
".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x14, x14, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w11, [%x[args], %[offsetof_M]]\n"
@@ -137,101 +137,101 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z15.s, #1.0\n"
- ".inst 0xa109c280 // ld1w { z0.s, z4.s, z8.s, z12.s }, p8/Z, [x20, x9, LSL #2]\n"
- ".inst 0x808001e0 // fmopa za0.s, p0/M, p0/M, z15.s, z0.s\n"
- ".inst 0x808401e1 // fmopa za1.s, p0/M, p0/M, z15.s, z4.s\n"
- ".inst 0x808801e2 // fmopa za2.s, p0/M, p0/M, z15.s, z8.s\n"
- ".inst 0x808c01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z12.s\n"
+ "fmov z6.s, #1.0\n"
+ ".inst 0xa009c29d // ldnt1w { z28.s-z31.s }, p8/Z, [x20, x9, LSL #2]\n"
+ ".inst 0x809c00c0 // fmopa za0.s, p0/M, p0/M, z6.s, z28.s\n"
+ ".inst 0x809d00c1 // fmopa za1.s, p0/M, p0/M, z6.s, z29.s\n"
+ ".inst 0x809e00c2 // fmopa za2.s, p0/M, p0/M, z6.s, z30.s\n"
+ ".inst 0x809f00c3 // fmopa za3.s, p0/M, p0/M, z6.s, z31.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x9\n"
"mov x21, x10\n"
"incw x20, ALL, MUL #4\n"
"incw x21\n"
"cmp x20, x28\n"
- "mov x20, x15\n"
"csel x21, x10, x21, LT\n"
+ "mov x20, x15\n"
"bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
"cmp x21, x11\n"
"csel x15, x20, x15, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
- "lsr x21, x20, #0x2\n"
- "and x20, x20, #0x3\n"
- "madd x23, x9, x22, x23\n" // bptr = B + n * kstride_bytes
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z20.s }, p0/Z, [x26]\n"
- ".inst 0xa140c6f3 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x23]\n"
- "ld1w { z4.s }, p0/Z, [x26, #1, MUL VL]\n"
- ".inst 0xa041c6ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- "ld1w { z29.s }, p0/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa142c6f2 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- "ld1w { z2.s }, p0/Z, [x26, #3, MUL VL]\n"
+ "lsr x23, x20, #0x2\n"
+ "and x22, x20, #0x3\n"
+ "ldr x21, [%x[args], %[offsetof_B]]\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x21, x9, x20, x21\n" // bptr = B + n * kstride_bytes
+ "cbz x23, 8f\n"
+ "subs x23, x23, #0x1\n"
+ "ld1w { z28.s }, p0/Z, [x26]\n"
+ ".inst 0xa040c6a9 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x21]\n"
+ "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0xa041c6ad // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ "ld1w { z30.s }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042c6a5 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
"addvl x26, x26, #4\n"
- ".inst 0xa043c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
- "addvl x23, x23, #16\n"
+ ".inst 0xa143c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
+ "addvl x21, x21, #16\n"
"ble 7f\n"
"6:" // K loop
+ ".inst 0x80880380 // fmopa za0.s, p0/M, p0/M, z28.s, z8.s\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x80890381 // fmopa za1.s, p0/M, p0/M, z28.s, z9.s\n"
+ ".inst 0x808a0382 // fmopa za2.s, p0/M, p0/M, z28.s, z10.s\n"
+ ".inst 0x808b0383 // fmopa za3.s, p0/M, p0/M, z28.s, z11.s\n"
+ "ld1w { z28.s }, p0/Z, [x26]\n"
+ ".inst 0x808c02c0 // fmopa za0.s, p0/M, p0/M, z22.s, z12.s\n"
+ ".inst 0xa040c6a9 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x21]\n"
+ ".inst 0x808d02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z13.s\n"
+ ".inst 0x808e02c2 // fmopa za2.s, p0/M, p0/M, z22.s, z14.s\n"
+ ".inst 0x808f02c3 // fmopa za3.s, p0/M, p0/M, z22.s, z15.s\n"
+ "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x808403c0 // fmopa za0.s, p0/M, p0/M, z30.s, z4.s\n"
+ ".inst 0xa041c6ad // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0x808503c1 // fmopa za1.s, p0/M, p0/M, z30.s, z5.s\n"
+ ".inst 0x808603c2 // fmopa za2.s, p0/M, p0/M, z30.s, z6.s\n"
+ ".inst 0x808703c3 // fmopa za3.s, p0/M, p0/M, z30.s, z7.s\n"
+ "ld1w { z30.s }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042c6a5 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
".inst 0x80930280 // fmopa za0.s, p0/M, p0/M, z20.s, z19.s\n"
- "subs x21, x21, #0x1\n"
".inst 0x80970281 // fmopa za1.s, p0/M, p0/M, z20.s, z23.s\n"
".inst 0x809b0282 // fmopa za2.s, p0/M, p0/M, z20.s, z27.s\n"
".inst 0x809f0283 // fmopa za3.s, p0/M, p0/M, z20.s, z31.s\n"
- "ld1w { z20.s }, p0/Z, [x26]\n"
- ".inst 0x808c0080 // fmopa za0.s, p0/M, p0/M, z4.s, z12.s\n"
- ".inst 0xa140c6f3 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x23]\n"
- ".inst 0x808d0081 // fmopa za1.s, p0/M, p0/M, z4.s, z13.s\n"
- ".inst 0x808e0082 // fmopa za2.s, p0/M, p0/M, z4.s, z14.s\n"
- ".inst 0x808f0083 // fmopa za3.s, p0/M, p0/M, z4.s, z15.s\n"
- "ld1w { z4.s }, p0/Z, [x26, #1, MUL VL]\n"
- ".inst 0x809203a0 // fmopa za0.s, p0/M, p0/M, z29.s, z18.s\n"
- ".inst 0xa041c6ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0x809603a1 // fmopa za1.s, p0/M, p0/M, z29.s, z22.s\n"
- ".inst 0x809a03a2 // fmopa za2.s, p0/M, p0/M, z29.s, z26.s\n"
- ".inst 0x809e03a3 // fmopa za3.s, p0/M, p0/M, z29.s, z30.s\n"
- "ld1w { z29.s }, p0/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa142c6f2 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- ".inst 0x80880040 // fmopa za0.s, p0/M, p0/M, z2.s, z8.s\n"
- ".inst 0x80890041 // fmopa za1.s, p0/M, p0/M, z2.s, z9.s\n"
- ".inst 0x808a0042 // fmopa za2.s, p0/M, p0/M, z2.s, z10.s\n"
- ".inst 0x808b0043 // fmopa za3.s, p0/M, p0/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p0/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
"addvl x26, x26, #4\n"
- ".inst 0xa043c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
- "addvl x23, x23, #16\n"
+ ".inst 0xa143c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
+ "addvl x21, x21, #16\n"
"bgt 6b\n"
"7:" // K loop tail
+ ".inst 0x80880380 // fmopa za0.s, p0/M, p0/M, z28.s, z8.s\n"
+ ".inst 0x80890381 // fmopa za1.s, p0/M, p0/M, z28.s, z9.s\n"
+ ".inst 0x808a0382 // fmopa za2.s, p0/M, p0/M, z28.s, z10.s\n"
+ ".inst 0x808b0383 // fmopa za3.s, p0/M, p0/M, z28.s, z11.s\n"
+ ".inst 0x808c02c0 // fmopa za0.s, p0/M, p0/M, z22.s, z12.s\n"
+ ".inst 0x808d02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z13.s\n"
+ ".inst 0x808e02c2 // fmopa za2.s, p0/M, p0/M, z22.s, z14.s\n"
+ ".inst 0x808f02c3 // fmopa za3.s, p0/M, p0/M, z22.s, z15.s\n"
+ ".inst 0x808403c0 // fmopa za0.s, p0/M, p0/M, z30.s, z4.s\n"
+ ".inst 0x808503c1 // fmopa za1.s, p0/M, p0/M, z30.s, z5.s\n"
+ ".inst 0x808603c2 // fmopa za2.s, p0/M, p0/M, z30.s, z6.s\n"
+ ".inst 0x808703c3 // fmopa za3.s, p0/M, p0/M, z30.s, z7.s\n"
".inst 0x80930280 // fmopa za0.s, p0/M, p0/M, z20.s, z19.s\n"
".inst 0x80970281 // fmopa za1.s, p0/M, p0/M, z20.s, z23.s\n"
".inst 0x809b0282 // fmopa za2.s, p0/M, p0/M, z20.s, z27.s\n"
".inst 0x809f0283 // fmopa za3.s, p0/M, p0/M, z20.s, z31.s\n"
- ".inst 0x808c0080 // fmopa za0.s, p0/M, p0/M, z4.s, z12.s\n"
- ".inst 0x808d0081 // fmopa za1.s, p0/M, p0/M, z4.s, z13.s\n"
- ".inst 0x808e0082 // fmopa za2.s, p0/M, p0/M, z4.s, z14.s\n"
- ".inst 0x808f0083 // fmopa za3.s, p0/M, p0/M, z4.s, z15.s\n"
- ".inst 0x809203a0 // fmopa za0.s, p0/M, p0/M, z29.s, z18.s\n"
- ".inst 0x809603a1 // fmopa za1.s, p0/M, p0/M, z29.s, z22.s\n"
- ".inst 0x809a03a2 // fmopa za2.s, p0/M, p0/M, z29.s, z26.s\n"
- ".inst 0x809e03a3 // fmopa za3.s, p0/M, p0/M, z29.s, z30.s\n"
- ".inst 0x80880040 // fmopa za0.s, p0/M, p0/M, z2.s, z8.s\n"
- ".inst 0x80890041 // fmopa za1.s, p0/M, p0/M, z2.s, z9.s\n"
- ".inst 0x808a0042 // fmopa za2.s, p0/M, p0/M, z2.s, z10.s\n"
- ".inst 0x808b0043 // fmopa za3.s, p0/M, p0/M, z2.s, z11.s\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x22, 10f\n"
"9:" // K oddments: Loop
- "ld1w { z26.s }, p0/Z, [x26]\n"
- "subs x20, x20, #0x1\n"
+ "ld1w { z8.s }, p0/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
"addvl x26, x26, #1\n"
- ".inst 0xa140c6e3 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x23]\n"
- "addvl x23, x23, #4\n"
- ".inst 0x80830340 // fmopa za0.s, p0/M, p0/M, z26.s, z3.s\n"
- ".inst 0x80870341 // fmopa za1.s, p0/M, p0/M, z26.s, z7.s\n"
- ".inst 0x808b0342 // fmopa za2.s, p0/M, p0/M, z26.s, z11.s\n"
- ".inst 0x808f0343 // fmopa za3.s, p0/M, p0/M, z26.s, z15.s\n"
+ ".inst 0xa140c6a3 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21]\n"
+ "addvl x21, x21, #4\n"
+ ".inst 0x80830100 // fmopa za0.s, p0/M, p0/M, z8.s, z3.s\n"
+ ".inst 0x80870101 // fmopa za1.s, p0/M, p0/M, z8.s, z7.s\n"
+ ".inst 0x808b0102 // fmopa za2.s, p0/M, p0/M, z8.s, z11.s\n"
+ ".inst 0x808f0103 // fmopa za3.s, p0/M, p0/M, z8.s, z15.s\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x15, #1, 14f\n"
@@ -239,25 +239,25 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xa040c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14]\n"
".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xa041c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
- ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
- ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xa061c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
"addvl x13, x13, #16\n"
"blt 11b\n"
"b 24f\n"
@@ -265,31 +265,31 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c5ac // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
"cmp x12, x20\n"
".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
- ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ ".inst 0xa063c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n"
"addvl x13, x13, #16\n"
"blt 13b\n"
"b 24f\n"
"14:" // Store to output array
"ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
"sub x24, x11, x10\n"
"ldr x23, [%x[args], %[offsetof_ldcb]]\n"
- "add x25, x25, x9, LSL #2\n" // C += n
"madd x25, x10, x23, x25\n" // C += m * ldc
"tbz x15, #2, 18f\n"
"cntw x20\n"
- "mov x12, #0x0\n"
"cmp x24, x20\n"
"csel x22, x24, x20, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Skip activation: Accumulator row 0 loop
@@ -299,30 +299,30 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
"add x25, x25, x23\n"
- "add x12, x12, #0x4\n"
".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
"add x25, x25, x23\n"
- "cmp x12, x21, LSL #2\n"
+ "add x12, x12, #0x4\n"
".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
"add x25, x25, x23\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xa160c323 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"blt 15b\n"
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 17f\n"
- ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
"subs x24, x24, x22\n"
@@ -330,29 +330,29 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"b 22f\n"
"18:" // Store to output array: Skip activation: End
"cntw x20\n"
- "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
- "mov x12, #0x0\n"
"cmp x24, x20\n"
- "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x20, x24, x20, LT\n"
"lsr x21, x20, #0x2\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 0 loop
".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
- "add x12, x12, #0x4\n"
- "cmp x12, x21, LSL #2\n"
".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
"add x25, x25, x23\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
"add x25, x25, x23\n"
".inst 0xa160c333 // st1w { z19.s, z23.s, z27.s, z31.s }, p8, [x25]\n"
@@ -362,13 +362,13 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"cbz x20, 21f\n"
".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
- "subs x20, x20, #0x1\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ "subs x20, x20, #0x1\n"
".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 21f\n"
@@ -383,25 +383,25 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"23:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14]\n"
- ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xa042c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x14, x14, #16\n"
"blt 23b\n"
"24:" // End block
"incw x9, ALL, MUL #4\n"
"cmp x9, x28\n"
"blt 3b\n"
"incw x10\n"
- "mov x9, #0x0\n"
"cmp x10, x11\n"
+ "mov x9, #0x0\n"
"mov x27, x26\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
index 275399748a..ed54e70e28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
class cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL
{
public:
- typedef float lhs_operand_type;
- typedef float rhs_operand_type;
+ typedef float operand_type;
typedef float result_type;
typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_2VLx2VL;
- StdTransformsSME<lhs_operand_type, result_type, 2, 2, 1> transforms = {};
+ StdTransformsSME<operand_type, result_type, 2, 2, 1> transforms = {};
cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
index 45fcc7a860..1e46aee27a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
@@ -48,6 +48,7 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
B(B), kstride_bytes(K * sizeof(float)),
C(C), ldcb(ldc * sizeof(float)),
M(M), N(N), K(K),
+ n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2),
min(-std::numeric_limits<float>::infinity()),
max(std::numeric_limits<float>::infinity()),
bias(bias),
@@ -87,13 +88,12 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
const long kstride_bytes;
float *const C;
const long ldcb;
- const long M, N, K;
+ const long M, N, K, n_loops, n_tail_iters;
float min = -std::numeric_limits<float>::infinity();
float max = std::numeric_limits<float>::infinity();
const float *const bias;
-
float *const accumulator_buffer;
uint64_t flags;
};
@@ -112,17 +112,17 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa040c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w13, [%x[args], %[offsetof_M]]\n"
@@ -137,101 +137,101 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z17.s, #1.0\n"
- ".inst 0xa00a428a // ld1w { z10.s-z11.s }, p8/Z, [x20, x10, LSL #2]\n"
- ".inst 0x808a0220 // fmopa za0.s, p0/M, p0/M, z17.s, z10.s\n"
- ".inst 0x808b0221 // fmopa za1.s, p0/M, p0/M, z17.s, z11.s\n"
- ".inst 0x808a0222 // fmopa za2.s, p0/M, p0/M, z17.s, z10.s\n"
- ".inst 0x808b0223 // fmopa za3.s, p0/M, p0/M, z17.s, z11.s\n"
+ "fmov z12.s, #1.0\n"
+ ".inst 0xa10a4289 // ldnt1w { z1.s, z9.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x80810180 // fmopa za0.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890181 // fmopa za1.s, p0/M, p0/M, z12.s, z9.s\n"
+ ".inst 0x80810182 // fmopa za2.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890183 // fmopa za3.s, p0/M, p0/M, z12.s, z9.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
"incw x20, ALL, MUL #2\n"
"incw x21, ALL, MUL #2\n"
"cmp x20, x9\n"
- "mov x20, x16\n"
"csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
"bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
"cmp x21, x13\n"
"csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
- "lsr x21, x20, #0x2\n"
- "and x20, x20, #0x3\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa0404776 // ld1w { z22.s-z23.s }, pn9.b/Z, [x27]\n"
- ".inst 0xa14046e7 // ld1w { z7.s, z15.s }, pn9.b/Z, [x23]\n"
- ".inst 0xa1414766 // ld1w { z6.s, z14.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa04146f4 // ld1w { z20.s-z21.s }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa1424762 // ld1w { z2.s, z10.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14246e3 // ld1w { z3.s, z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa1434761 // ld1w { z1.s, z9.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "lsr x23, x20, #0x2\n"
+ "and x22, x20, #0x3\n"
+ "ldr x21, [%x[args], %[offsetof_B]]\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x21, x10, x20, x21\n" // bptr = B + n * kstride_bytes
+ "cbz x23, 8f\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xa0404772 // ld1w { z18.s-z19.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa04046a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21]\n"
+ ".inst 0xa0414764 // ld1w { z4.s-z5.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04146bb // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa042476a // ld1w { z10.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04246b5 // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0434766 // ld1w { z6.s-z7.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa04346e4 // ld1w { z4.s-z5.s }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
- "addvl x23, x23, #8\n"
+ ".inst 0xa04346a9 // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
+ "addvl x21, x21, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x808702c0 // fmopa za0.s, p0/M, p0/M, z22.s, z7.s\n"
- "subs x21, x21, #0x1\n"
- ".inst 0x808f02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z15.s\n"
- ".inst 0x808702e2 // fmopa za2.s, p0/M, p0/M, z23.s, z7.s\n"
- ".inst 0x808f02e3 // fmopa za3.s, p0/M, p0/M, z23.s, z15.s\n"
- ".inst 0xa0404776 // ld1w { z22.s-z23.s }, pn9.b/Z, [x27]\n"
- ".inst 0x809400c0 // fmopa za0.s, p0/M, p0/M, z6.s, z20.s\n"
- ".inst 0xa14046e7 // ld1w { z7.s, z15.s }, pn9.b/Z, [x23]\n"
- ".inst 0x809500c1 // fmopa za1.s, p0/M, p0/M, z6.s, z21.s\n"
- ".inst 0x809401c2 // fmopa za2.s, p0/M, p0/M, z14.s, z20.s\n"
- ".inst 0x809501c3 // fmopa za3.s, p0/M, p0/M, z14.s, z21.s\n"
- ".inst 0xa1414766 // ld1w { z6.s, z14.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0x80830040 // fmopa za0.s, p0/M, p0/M, z2.s, z3.s\n"
- ".inst 0xa04146f4 // ld1w { z20.s-z21.s }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0x808b0041 // fmopa za1.s, p0/M, p0/M, z2.s, z11.s\n"
- ".inst 0x80830142 // fmopa za2.s, p0/M, p0/M, z10.s, z3.s\n"
- ".inst 0x808b0143 // fmopa za3.s, p0/M, p0/M, z10.s, z11.s\n"
- ".inst 0xa1424762 // ld1w { z2.s, z10.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14246e3 // ld1w { z3.s, z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0x80840020 // fmopa za0.s, p0/M, p0/M, z1.s, z4.s\n"
- ".inst 0x80850021 // fmopa za1.s, p0/M, p0/M, z1.s, z5.s\n"
- ".inst 0x80840122 // fmopa za2.s, p0/M, p0/M, z9.s, z4.s\n"
- ".inst 0x80850123 // fmopa za3.s, p0/M, p0/M, z9.s, z5.s\n"
- ".inst 0xa1434761 // ld1w { z1.s, z9.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0x80820240 // fmopa za0.s, p0/M, p0/M, z18.s, z2.s\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x80830241 // fmopa za1.s, p0/M, p0/M, z18.s, z3.s\n"
+ ".inst 0x80820262 // fmopa za2.s, p0/M, p0/M, z19.s, z2.s\n"
+ ".inst 0x80830263 // fmopa za3.s, p0/M, p0/M, z19.s, z3.s\n"
+ ".inst 0xa0404772 // ld1w { z18.s-z19.s }, pn9.b/Z, [x27]\n"
+ ".inst 0x809a0080 // fmopa za0.s, p0/M, p0/M, z4.s, z26.s\n"
+ ".inst 0xa04046a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21]\n"
+ ".inst 0x809b0081 // fmopa za1.s, p0/M, p0/M, z4.s, z27.s\n"
+ ".inst 0x809a00a2 // fmopa za2.s, p0/M, p0/M, z5.s, z26.s\n"
+ ".inst 0x809b00a3 // fmopa za3.s, p0/M, p0/M, z5.s, z27.s\n"
+ ".inst 0xa0414764 // ld1w { z4.s-z5.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0x80940140 // fmopa za0.s, p0/M, p0/M, z10.s, z20.s\n"
+ ".inst 0xa04146bb // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0x80950141 // fmopa za1.s, p0/M, p0/M, z10.s, z21.s\n"
+ ".inst 0x80940162 // fmopa za2.s, p0/M, p0/M, z11.s, z20.s\n"
+ ".inst 0x80950163 // fmopa za3.s, p0/M, p0/M, z11.s, z21.s\n"
+ ".inst 0xa042476a // ld1w { z10.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04246b5 // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0x808800c0 // fmopa za0.s, p0/M, p0/M, z6.s, z8.s\n"
+ ".inst 0x808900c1 // fmopa za1.s, p0/M, p0/M, z6.s, z9.s\n"
+ ".inst 0x808800e2 // fmopa za2.s, p0/M, p0/M, z7.s, z8.s\n"
+ ".inst 0x808900e3 // fmopa za3.s, p0/M, p0/M, z7.s, z9.s\n"
+ ".inst 0xa0434766 // ld1w { z6.s-z7.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa04346e4 // ld1w { z4.s-z5.s }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
- "addvl x23, x23, #8\n"
+ ".inst 0xa04346a9 // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
+ "addvl x21, x21, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x808702c0 // fmopa za0.s, p0/M, p0/M, z22.s, z7.s\n"
- ".inst 0x808f02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z15.s\n"
- ".inst 0x808702e2 // fmopa za2.s, p0/M, p0/M, z23.s, z7.s\n"
- ".inst 0x808f02e3 // fmopa za3.s, p0/M, p0/M, z23.s, z15.s\n"
- ".inst 0x809400c0 // fmopa za0.s, p0/M, p0/M, z6.s, z20.s\n"
- ".inst 0x809500c1 // fmopa za1.s, p0/M, p0/M, z6.s, z21.s\n"
- ".inst 0x809401c2 // fmopa za2.s, p0/M, p0/M, z14.s, z20.s\n"
- ".inst 0x809501c3 // fmopa za3.s, p0/M, p0/M, z14.s, z21.s\n"
- ".inst 0x80830040 // fmopa za0.s, p0/M, p0/M, z2.s, z3.s\n"
- ".inst 0x808b0041 // fmopa za1.s, p0/M, p0/M, z2.s, z11.s\n"
- ".inst 0x80830142 // fmopa za2.s, p0/M, p0/M, z10.s, z3.s\n"
- ".inst 0x808b0143 // fmopa za3.s, p0/M, p0/M, z10.s, z11.s\n"
- ".inst 0x80840020 // fmopa za0.s, p0/M, p0/M, z1.s, z4.s\n"
- ".inst 0x80850021 // fmopa za1.s, p0/M, p0/M, z1.s, z5.s\n"
- ".inst 0x80840122 // fmopa za2.s, p0/M, p0/M, z9.s, z4.s\n"
- ".inst 0x80850123 // fmopa za3.s, p0/M, p0/M, z9.s, z5.s\n"
+ ".inst 0x80820240 // fmopa za0.s, p0/M, p0/M, z18.s, z2.s\n"
+ ".inst 0x80830241 // fmopa za1.s, p0/M, p0/M, z18.s, z3.s\n"
+ ".inst 0x80820262 // fmopa za2.s, p0/M, p0/M, z19.s, z2.s\n"
+ ".inst 0x80830263 // fmopa za3.s, p0/M, p0/M, z19.s, z3.s\n"
+ ".inst 0x809a0080 // fmopa za0.s, p0/M, p0/M, z4.s, z26.s\n"
+ ".inst 0x809b0081 // fmopa za1.s, p0/M, p0/M, z4.s, z27.s\n"
+ ".inst 0x809a00a2 // fmopa za2.s, p0/M, p0/M, z5.s, z26.s\n"
+ ".inst 0x809b00a3 // fmopa za3.s, p0/M, p0/M, z5.s, z27.s\n"
+ ".inst 0x80940140 // fmopa za0.s, p0/M, p0/M, z10.s, z20.s\n"
+ ".inst 0x80950141 // fmopa za1.s, p0/M, p0/M, z10.s, z21.s\n"
+ ".inst 0x80940162 // fmopa za2.s, p0/M, p0/M, z11.s, z20.s\n"
+ ".inst 0x80950163 // fmopa za3.s, p0/M, p0/M, z11.s, z21.s\n"
+ ".inst 0x808800c0 // fmopa za0.s, p0/M, p0/M, z6.s, z8.s\n"
+ ".inst 0x808900c1 // fmopa za1.s, p0/M, p0/M, z6.s, z9.s\n"
+ ".inst 0x808800e2 // fmopa za2.s, p0/M, p0/M, z7.s, z8.s\n"
+ ".inst 0x808900e3 // fmopa za3.s, p0/M, p0/M, z7.s, z9.s\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x22, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa040476a // ld1w { z10.s-z11.s }, pn9.b/Z, [x27]\n"
- "subs x20, x20, #0x1\n"
+ ".inst 0xa040477e // ld1w { z30.s-z31.s }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x1\n"
"addvl x27, x27, #2\n"
- ".inst 0xa04046ee // ld1w { z14.s-z15.s }, pn9.b/Z, [x23]\n"
- "addvl x23, x23, #2\n"
- ".inst 0x808e0140 // fmopa za0.s, p0/M, p0/M, z10.s, z14.s\n"
- ".inst 0x808f0141 // fmopa za1.s, p0/M, p0/M, z10.s, z15.s\n"
- ".inst 0x808e0162 // fmopa za2.s, p0/M, p0/M, z11.s, z14.s\n"
- ".inst 0x808f0163 // fmopa za3.s, p0/M, p0/M, z11.s, z15.s\n"
+ ".inst 0xa14046a5 // ld1w { z5.s, z13.s }, pn9.b/Z, [x21]\n"
+ "addvl x21, x21, #2\n"
+ ".inst 0x808503c0 // fmopa za0.s, p0/M, p0/M, z30.s, z5.s\n"
+ ".inst 0x808d03c1 // fmopa za1.s, p0/M, p0/M, z30.s, z13.s\n"
+ ".inst 0x808503e2 // fmopa za2.s, p0/M, p0/M, z31.s, z5.s\n"
+ ".inst 0x808d03e3 // fmopa za3.s, p0/M, p0/M, z31.s, z13.s\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -239,24 +239,24 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
- ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
- ".inst 0xa041c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa060c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14]\n"
".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa061c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
@@ -265,31 +265,31 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
- ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 30f\n"
"14:" // Store to output array
"ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
"sub x25, x13, x11\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
- "add x26, x26, x10, LSL #2\n" // C += n
"madd x26, x11, x24, x26\n" // C += m * ldc
"tbz x16, #2, 21f\n"
"cntw x23\n"
- "mov x12, #0x0\n"
"cmp x25, x23\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Skip activation: Accumulator row 0 loop
@@ -297,36 +297,36 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- "add x12, x12, #0x4\n"
".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
+ "add x12, x12, #0x4\n"
".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
"add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"blt 15b\n"
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n"
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
- ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 21f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 19f\n"
"18:" // Store to output array: Skip activation: Accumulator row 1 loop
@@ -334,28 +334,28 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- "add x12, x12, #0x4\n"
".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
+ "add x12, x12, #0x4\n"
".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
"add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"blt 18b\n"
"19:" // Store to output array: Skip activation: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
- ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
@@ -363,37 +363,37 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"b 28f\n"
"21:" // Store to output array: Skip activation: End
"cntw x23\n"
- "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
- "mov x12, #0x0\n"
"cmp x25, x23\n"
- "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 23f\n"
"22:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
- "add x12, x12, #0x4\n"
- "cmp x12, x21, LSL #2\n"
- ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n"
+ ".inst 0xa1604357 // st1w { z23.s, z31.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"blt 22b\n"
"23:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 24f\n"
".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
@@ -407,34 +407,34 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"subs x25, x25, x22\n"
"beq 28f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x20, x25, x23, LT\n"
"lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 26f\n"
"25:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
- ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
- ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
- "add x12, x12, #0x4\n"
- "cmp x12, x21, LSL #2\n"
- ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604343 // st1w { z3.s, z11.s }, p8, [x26]\n"
+ ".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"blt 25b\n"
"26:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 27f\n"
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 27f\n"
@@ -449,25 +449,25 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"29:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 29b\n"
"30:" // End block
"incw x10, ALL, MUL #2\n"
"cmp x10, x9\n"
"blt 3b\n"
"incw x11, ALL, MUL #2\n"
- "mov x10, #0x0\n"
"cmp x11, x13\n"
+ "mov x10, #0x0\n"
"mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
index 453505a227..1348f00d37 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
class cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL
{
public:
- typedef float lhs_operand_type;
- typedef float rhs_operand_type;
+ typedef float operand_type;
typedef float result_type;
typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_4VLx1VL;
- StdTransformsSME<lhs_operand_type, result_type, 4, 1, 1> transforms = {};
+ StdTransformsSME<operand_type, result_type, 4, 1, 1> transforms = {};
cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
index 63f93b9b5b..a69e1f84e0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
@@ -48,6 +48,7 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
B(B), kstride_bytes(K * sizeof(float)),
C(C), ldcb(ldc * sizeof(float)),
M(M), N(N), K(K),
+ n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2),
min(-std::numeric_limits<float>::infinity()),
max(std::numeric_limits<float>::infinity()),
bias(bias),
@@ -87,13 +88,12 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
const long kstride_bytes;
float *const C;
const long ldcb;
- const long M, N, K;
+ const long M, N, K, n_loops, n_tail_iters;
float min = -std::numeric_limits<float>::infinity();
float max = std::numeric_limits<float>::infinity();
const float *const bias;
-
float *const accumulator_buffer;
uint64_t flags;
};
@@ -112,17 +112,17 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
- ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w13, [%x[args], %[offsetof_M]]\n"
@@ -137,101 +137,101 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z6.s, #1.0\n"
- "ld1w { z26.s }, p0/Z, [x20, x10, LSL #2]\n"
- ".inst 0x809a24c0 // fmopa za0.s, p1/M, p1/M, z6.s, z26.s\n"
- ".inst 0x809a24c1 // fmopa za1.s, p1/M, p1/M, z6.s, z26.s\n"
- ".inst 0x809a24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z26.s\n"
- ".inst 0x809a24c3 // fmopa za3.s, p1/M, p1/M, z6.s, z26.s\n"
+ "fmov z11.s, #1.0\n"
+ "ldnt1w { z13.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x808d2560 // fmopa za0.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2561 // fmopa za1.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2562 // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2563 // fmopa za3.s, p1/M, p1/M, z11.s, z13.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
"incw x20\n"
"incw x21, ALL, MUL #4\n"
"cmp x20, x9\n"
- "mov x20, x16\n"
"csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
"bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
"cmp x21, x13\n"
"csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
- "lsr x21, x20, #0x2\n"
- "and x20, x20, #0x3\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa140c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27]\n"
- "ld1w { z13.s }, p1/Z, [x23]\n"
- ".inst 0xa141c372 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "ld1w { z21.s }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa142c373 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1w { z17.s }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa143c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "lsr x23, x20, #0x2\n"
+ "and x22, x20, #0x3\n"
+ "ldr x21, [%x[args], %[offsetof_B]]\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x21, x10, x20, x21\n" // bptr = B + n * kstride_bytes
+ "cbz x23, 8f\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xa140c360 // ld1w { z0.s, z4.s, z8.s, z12.s }, pn8.b/Z, [x27]\n"
+ "ldnt1w { z19.s }, p1/Z, [x21]\n"
+ ".inst 0xa141c371 // ld1w { z17.s, z21.s, z25.s, z29.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1w { z22.s }, p1/Z, [x21, #1, MUL VL]\n"
+ ".inst 0xa142c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1w { z23.s }, p1/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xa143c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1w { z2.s }, p1/Z, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
+ "ldnt1w { z2.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x808d2460 // fmopa za0.s, p1/M, p1/M, z3.s, z13.s\n"
- "subs x21, x21, #0x1\n"
- ".inst 0x808d24e1 // fmopa za1.s, p1/M, p1/M, z7.s, z13.s\n"
- ".inst 0x808d2562 // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n"
- ".inst 0x808d25e3 // fmopa za3.s, p1/M, p1/M, z15.s, z13.s\n"
- ".inst 0xa140c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27]\n"
- ".inst 0x80952640 // fmopa za0.s, p1/M, p1/M, z18.s, z21.s\n"
- "ld1w { z13.s }, p1/Z, [x23]\n"
- ".inst 0x809526c1 // fmopa za1.s, p1/M, p1/M, z22.s, z21.s\n"
- ".inst 0x80952742 // fmopa za2.s, p1/M, p1/M, z26.s, z21.s\n"
- ".inst 0x809527c3 // fmopa za3.s, p1/M, p1/M, z30.s, z21.s\n"
- ".inst 0xa141c372 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0x80912660 // fmopa za0.s, p1/M, p1/M, z19.s, z17.s\n"
- "ld1w { z21.s }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0x809126e1 // fmopa za1.s, p1/M, p1/M, z23.s, z17.s\n"
- ".inst 0x80912762 // fmopa za2.s, p1/M, p1/M, z27.s, z17.s\n"
- ".inst 0x809127e3 // fmopa za3.s, p1/M, p1/M, z31.s, z17.s\n"
- ".inst 0xa142c373 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1w { z17.s }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0x80822600 // fmopa za0.s, p1/M, p1/M, z16.s, z2.s\n"
- ".inst 0x80822681 // fmopa za1.s, p1/M, p1/M, z20.s, z2.s\n"
- ".inst 0x80822702 // fmopa za2.s, p1/M, p1/M, z24.s, z2.s\n"
- ".inst 0x80822783 // fmopa za3.s, p1/M, p1/M, z28.s, z2.s\n"
- ".inst 0xa143c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0x80932400 // fmopa za0.s, p1/M, p1/M, z0.s, z19.s\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x80932481 // fmopa za1.s, p1/M, p1/M, z4.s, z19.s\n"
+ ".inst 0x80932502 // fmopa za2.s, p1/M, p1/M, z8.s, z19.s\n"
+ ".inst 0x80932583 // fmopa za3.s, p1/M, p1/M, z12.s, z19.s\n"
+ ".inst 0xa140c360 // ld1w { z0.s, z4.s, z8.s, z12.s }, pn8.b/Z, [x27]\n"
+ ".inst 0x80962620 // fmopa za0.s, p1/M, p1/M, z17.s, z22.s\n"
+ "ldnt1w { z19.s }, p1/Z, [x21]\n"
+ ".inst 0x809626a1 // fmopa za1.s, p1/M, p1/M, z21.s, z22.s\n"
+ ".inst 0x80962722 // fmopa za2.s, p1/M, p1/M, z25.s, z22.s\n"
+ ".inst 0x809627a3 // fmopa za3.s, p1/M, p1/M, z29.s, z22.s\n"
+ ".inst 0xa141c371 // ld1w { z17.s, z21.s, z25.s, z29.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0x80972600 // fmopa za0.s, p1/M, p1/M, z16.s, z23.s\n"
+ "ldnt1w { z22.s }, p1/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x80972681 // fmopa za1.s, p1/M, p1/M, z20.s, z23.s\n"
+ ".inst 0x80972702 // fmopa za2.s, p1/M, p1/M, z24.s, z23.s\n"
+ ".inst 0x80972783 // fmopa za3.s, p1/M, p1/M, z28.s, z23.s\n"
+ ".inst 0xa142c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1w { z23.s }, p1/Z, [x21, #2, MUL VL]\n"
+ ".inst 0x80822460 // fmopa za0.s, p1/M, p1/M, z3.s, z2.s\n"
+ ".inst 0x808224e1 // fmopa za1.s, p1/M, p1/M, z7.s, z2.s\n"
+ ".inst 0x80822562 // fmopa za2.s, p1/M, p1/M, z11.s, z2.s\n"
+ ".inst 0x808225e3 // fmopa za3.s, p1/M, p1/M, z15.s, z2.s\n"
+ ".inst 0xa143c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1w { z2.s }, p1/Z, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
+ "ldnt1w { z2.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x808d2460 // fmopa za0.s, p1/M, p1/M, z3.s, z13.s\n"
- ".inst 0x808d24e1 // fmopa za1.s, p1/M, p1/M, z7.s, z13.s\n"
- ".inst 0x808d2562 // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n"
- ".inst 0x808d25e3 // fmopa za3.s, p1/M, p1/M, z15.s, z13.s\n"
- ".inst 0x80952640 // fmopa za0.s, p1/M, p1/M, z18.s, z21.s\n"
- ".inst 0x809526c1 // fmopa za1.s, p1/M, p1/M, z22.s, z21.s\n"
- ".inst 0x80952742 // fmopa za2.s, p1/M, p1/M, z26.s, z21.s\n"
- ".inst 0x809527c3 // fmopa za3.s, p1/M, p1/M, z30.s, z21.s\n"
- ".inst 0x80912660 // fmopa za0.s, p1/M, p1/M, z19.s, z17.s\n"
- ".inst 0x809126e1 // fmopa za1.s, p1/M, p1/M, z23.s, z17.s\n"
- ".inst 0x80912762 // fmopa za2.s, p1/M, p1/M, z27.s, z17.s\n"
- ".inst 0x809127e3 // fmopa za3.s, p1/M, p1/M, z31.s, z17.s\n"
- ".inst 0x80822600 // fmopa za0.s, p1/M, p1/M, z16.s, z2.s\n"
- ".inst 0x80822681 // fmopa za1.s, p1/M, p1/M, z20.s, z2.s\n"
- ".inst 0x80822702 // fmopa za2.s, p1/M, p1/M, z24.s, z2.s\n"
- ".inst 0x80822783 // fmopa za3.s, p1/M, p1/M, z28.s, z2.s\n"
+ ".inst 0x80932400 // fmopa za0.s, p1/M, p1/M, z0.s, z19.s\n"
+ ".inst 0x80932481 // fmopa za1.s, p1/M, p1/M, z4.s, z19.s\n"
+ ".inst 0x80932502 // fmopa za2.s, p1/M, p1/M, z8.s, z19.s\n"
+ ".inst 0x80932583 // fmopa za3.s, p1/M, p1/M, z12.s, z19.s\n"
+ ".inst 0x80962620 // fmopa za0.s, p1/M, p1/M, z17.s, z22.s\n"
+ ".inst 0x809626a1 // fmopa za1.s, p1/M, p1/M, z21.s, z22.s\n"
+ ".inst 0x80962722 // fmopa za2.s, p1/M, p1/M, z25.s, z22.s\n"
+ ".inst 0x809627a3 // fmopa za3.s, p1/M, p1/M, z29.s, z22.s\n"
+ ".inst 0x80972600 // fmopa za0.s, p1/M, p1/M, z16.s, z23.s\n"
+ ".inst 0x80972681 // fmopa za1.s, p1/M, p1/M, z20.s, z23.s\n"
+ ".inst 0x80972702 // fmopa za2.s, p1/M, p1/M, z24.s, z23.s\n"
+ ".inst 0x80972783 // fmopa za3.s, p1/M, p1/M, z28.s, z23.s\n"
+ ".inst 0x80822460 // fmopa za0.s, p1/M, p1/M, z3.s, z2.s\n"
+ ".inst 0x808224e1 // fmopa za1.s, p1/M, p1/M, z7.s, z2.s\n"
+ ".inst 0x80822562 // fmopa za2.s, p1/M, p1/M, z11.s, z2.s\n"
+ ".inst 0x808225e3 // fmopa za3.s, p1/M, p1/M, z15.s, z2.s\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x22, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa140c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27]\n"
- "subs x20, x20, #0x1\n"
+ ".inst 0xa140c373 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn8.b/Z, [x27]\n"
+ "subs x22, x22, #0x1\n"
"addvl x27, x27, #4\n"
- "ld1w { z2.s }, p1/Z, [x23]\n"
- "addvl x23, x23, #1\n"
- ".inst 0x80822600 // fmopa za0.s, p1/M, p1/M, z16.s, z2.s\n"
- ".inst 0x80822681 // fmopa za1.s, p1/M, p1/M, z20.s, z2.s\n"
- ".inst 0x80822702 // fmopa za2.s, p1/M, p1/M, z24.s, z2.s\n"
- ".inst 0x80822783 // fmopa za3.s, p1/M, p1/M, z28.s, z2.s\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
+ "addvl x21, x21, #1\n"
+ ".inst 0x808b2660 // fmopa za0.s, p1/M, p1/M, z19.s, z11.s\n"
+ ".inst 0x808b26e1 // fmopa za1.s, p1/M, p1/M, z23.s, z11.s\n"
+ ".inst 0x808b2762 // fmopa za2.s, p1/M, p1/M, z27.s, z11.s\n"
+ ".inst 0x808b27e3 // fmopa za3.s, p1/M, p1/M, z31.s, z11.s\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -239,25 +239,25 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
- ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa060c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14]\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa060c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 42f\n"
@@ -265,148 +265,148 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
- ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
- ".inst 0xa060c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14]\n"
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1cc // st1w { z12.s-z15.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 42f\n"
"14:" // Store to output array
"ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
"sub x25, x13, x11\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
- "add x26, x26, x10, LSL #2\n" // C += n
"madd x26, x11, x24, x26\n" // C += m * ldc
"tbz x16, #2, 27f\n"
"cntw x23\n"
- "mov x12, #0x0\n"
"cmp x25, x23\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Skip activation: Accumulator row 0 loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- "add x12, x12, #0x4\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z15.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 15b\n"
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- "st1w { z0.s }, p0, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- "st1w { z1.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
- "st1w { z2.s }, p0, [x26]\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 19f\n"
"18:" // Store to output array: Skip activation: Accumulator row 1 loop
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- "add x12, x12, #0x4\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ "st1w { z8.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z9.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z10.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z15.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z11.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 18b\n"
"19:" // Store to output array: Skip activation: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ "st1w { z24.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z25.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 22f\n"
"21:" // Store to output array: Skip activation: Accumulator row 2 loop
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- "add x12, x12, #0x4\n"
- "st1w { z8.s }, p0, [x26]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z9.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z10.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z11.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 21b\n"
"22:" // Store to output array: Skip activation: Accumulator row 2 oddments
"cbz x20, 23f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ "st1w { z12.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 23f\n"
"subs x20, x20, #0x1\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z13.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 23f\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z14.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 25f\n"
"24:" // Store to output array: Skip activation: Accumulator row 3 loop
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- "add x12, x12, #0x4\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "cmp x12, x21, LSL #2\n"
"st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
"st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
"st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 24b\n"
@@ -429,63 +429,63 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"b 40f\n"
"27:" // Store to output array: Skip activation: End
"cntw x23\n"
- "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
- "mov x12, #0x0\n"
"cmp x25, x23\n"
- "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 29f\n"
"28:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- "add x12, x12, #0x4\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z31.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 28b\n"
"29:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 30f\n"
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 30f\n"
"subs x20, x20, #0x1\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 30f\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"30:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 40f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 32f\n"
"31:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- "add x12, x12, #0x4\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 31b\n"
"32:" // Store to output array: Accumulator row 1 oddments
@@ -506,100 +506,100 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"subs x25, x25, x22\n"
"beq 40f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 35f\n"
"34:" // Store to output array: Accumulator row 2 loop
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
- "add x12, x12, #0x4\n"
".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "cmp x12, x21, LSL #2\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
"st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
"st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 34b\n"
"35:" // Store to output array: Accumulator row 2 oddments
"cbz x20, 36f\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
- "st1w { z28.s }, p0, [x26]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 36f\n"
"subs x20, x20, #0x1\n"
- "st1w { z29.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 36f\n"
- "st1w { z30.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"36:" // Store to output array: Accumulator row 2 oddments: End
"subs x25, x25, x22\n"
"beq 40f\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x20, x25, x23, LT\n"
"lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 38f\n"
"37:" // Store to output array: Accumulator row 3 loop
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- "add x12, x12, #0x4\n"
".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- "cmp x12, x21, LSL #2\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
"st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
"st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 37b\n"
"38:" // Store to output array: Accumulator row 3 oddments
"cbz x20, 39f\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 39f\n"
"subs x20, x20, #0x1\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 39f\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"39:" // Store to output array: Accumulator row 3 oddments: End
"40:" // Store to output array: End
"tbz x16, #0, 42f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"41:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
- ".inst 0xa041c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 41b\n"
"42:" // End block
"incw x10\n"
"cmp x10, x9\n"
"blt 3b\n"
"incw x11, ALL, MUL #4\n"
- "mov x10, #0x0\n"
"cmp x11, x13\n"
+ "mov x10, #0x0\n"
"mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
index b70bef3bbe..a4b14325f2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
class cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int8_t result_type;
typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
@@ -61,7 +60,7 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
+ return false;
}
static constexpr bool supports_bias()
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_1VLx4VL;
- StdTransformsSME<lhs_operand_type, result_type, 1, 4, 4, true> transforms = {};
+ StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
index 56d1a13a72..c8d56dc5e5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
@@ -49,7 +49,7 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
C(C), ldcb(ldc * sizeof(int8_t)),
M(M), N(N), K(K),
- min(0), max(0),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
bias(bias), n_0(n_0),
accumulator_buffer(accumulator_buffer),
@@ -74,14 +74,13 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
const long kstride_bytes;
int8_t *const C;
const long ldcb;
- const long M, N, K;
- int32_t min;
- int32_t max;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
const int32_t *const bias;
const int n_0;
-
int32_t *const accumulator_buffer;
uint64_t flags;
};
@@ -90,131 +89,131 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ "ldr x14, [%x[args], %[offsetof_flags]]\n"
".inst 0xd503477f // SMSTART ZA\n"
"ptrue p1.b\n"
".inst 0x25207811 // ptrue pn9.b\n"
- "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
"ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
- "tbz x15, #0, 2f\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x14, #0, 2f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14]\n"
- ".inst 0xa041c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xa042c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x13, x13, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
- "ldr w11, [%x[args], %[offsetof_M]]\n"
- "mov x10, #0x0\n"
+ "ldr w10, [%x[args], %[offsetof_M]]\n"
"mov x9, #0x0\n"
- "ldr w28, [%x[args], %[offsetof_N]]\n"
- "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "mov x28, #0x0\n"
+ "ldr w27, [%x[args], %[offsetof_N]]\n"
+ "ldr x26, [%x[args], %[offsetof_A]]\n"
"3:" // M and N loop
- "mov x26, x27\n"
- ".inst 0x25bc6530 // whilelt pn8.s, x9, x28, VLx4\n"
- "tbnz x15, #0, 4f\n"
+ "mov x25, x26\n"
+ ".inst 0x25bb6790 // whilelt pn8.s, x28, x27, VLx4\n"
+ "tbnz x14, #0, 4f\n"
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa009c290 // ld1w { z16.s-z19.s }, p8/Z, [x20, x9, LSL #2]\n"
- ".inst 0xc0902600 // addha za0.s, p1/M, p1/M, z16.s\n"
- ".inst 0xc0902621 // addha za1.s, p1/M, p1/M, z17.s\n"
- ".inst 0xc0902642 // addha za2.s, p1/M, p1/M, z18.s\n"
- ".inst 0xc0902663 // addha za3.s, p1/M, p1/M, z19.s\n"
+ ".inst 0xa11cc289 // ldnt1w { z1.s, z5.s, z9.s, z13.s }, p8/Z, [x20, x28, LSL #2]\n"
+ ".inst 0xc0902420 // addha za0.s, p1/M, p1/M, z1.s\n"
+ ".inst 0xc09024a1 // addha za1.s, p1/M, p1/M, z5.s\n"
+ ".inst 0xc0902522 // addha za2.s, p1/M, p1/M, z9.s\n"
+ ".inst 0xc09025a3 // addha za3.s, p1/M, p1/M, z13.s\n"
"4:" // Prepare accumulators: Test for last block
- "mov x20, x9\n"
- "mov x21, x10\n"
+ "mov x20, x28\n"
+ "mov x21, x9\n"
"incw x20, ALL, MUL #4\n"
"incw x21\n"
- "cmp x20, x28\n"
- "mov x20, x15\n"
- "csel x21, x10, x21, LT\n"
- "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
- "cmp x21, x11\n"
- "csel x15, x20, x15, LT\n"
+ "cmp x20, x27\n"
+ "csel x21, x9, x21, LT\n"
+ "mov x20, x14\n"
+ "bfm x14, XZR, #0x0, #0x0 // bfc x14, #0x0, #0x1\n"
+ "cmp x21, x10\n"
+ "csel x14, x20, x14, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x3\n"
"lsr x20, x20, #0x2\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x9, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- "ld1b { z5.b }, p1/Z, [x26]\n"
- ".inst 0xa14086e0 // ld1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23]\n"
- "ld1b { z31.b }, p1/Z, [x26, #1, MUL VL]\n"
- ".inst 0xa14186f2 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- "ld1b { z1.b }, p1/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa14286f0 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- "ld1b { z6.b }, p1/Z, [x26, #3, MUL VL]\n"
- "addvl x26, x26, #4\n"
- ".inst 0xa14386e3 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x28, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa08024a0 // smopa za0.s, p1/M, p1/M, z5.b, z0.b\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa08424a1 // smopa za1.s, p1/M, p1/M, z5.b, z4.b\n"
- ".inst 0xa08824a2 // smopa za2.s, p1/M, p1/M, z5.b, z8.b\n"
- ".inst 0xa08c24a3 // smopa za3.s, p1/M, p1/M, z5.b, z12.b\n"
- "ld1b { z5.b }, p1/Z, [x26]\n"
- ".inst 0xa09227e0 // smopa za0.s, p1/M, p1/M, z31.b, z18.b\n"
- ".inst 0xa14086e0 // ld1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa09627e1 // smopa za1.s, p1/M, p1/M, z31.b, z22.b\n"
- ".inst 0xa09a27e2 // smopa za2.s, p1/M, p1/M, z31.b, z26.b\n"
- ".inst 0xa09e27e3 // smopa za3.s, p1/M, p1/M, z31.b, z30.b\n"
- "ld1b { z31.b }, p1/Z, [x26, #1, MUL VL]\n"
- ".inst 0xa0902420 // smopa za0.s, p1/M, p1/M, z1.b, z16.b\n"
- ".inst 0xa14186f2 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa0942421 // smopa za1.s, p1/M, p1/M, z1.b, z20.b\n"
- ".inst 0xa0982422 // smopa za2.s, p1/M, p1/M, z1.b, z24.b\n"
- ".inst 0xa09c2423 // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
- "ld1b { z1.b }, p1/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa14286f0 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- ".inst 0xa08324c0 // smopa za0.s, p1/M, p1/M, z6.b, z3.b\n"
- ".inst 0xa08724c1 // smopa za1.s, p1/M, p1/M, z6.b, z7.b\n"
- ".inst 0xa08b24c2 // smopa za2.s, p1/M, p1/M, z6.b, z11.b\n"
- ".inst 0xa08f24c3 // smopa za3.s, p1/M, p1/M, z6.b, z15.b\n"
- "ld1b { z6.b }, p1/Z, [x26, #3, MUL VL]\n"
- "addvl x26, x26, #4\n"
- ".inst 0xa14386e3 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa0842680 // smopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0862682 // smopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa0872683 // smopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa0982560 // smopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0992561 // smopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa09a2562 // smopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa09b2563 // smopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa09c2440 // smopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa09d2441 // smopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa09e2442 // smopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa09f2443 // smopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xa09025c0 // smopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa09125c1 // smopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa09225c2 // smopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa08024a0 // smopa za0.s, p1/M, p1/M, z5.b, z0.b\n"
- ".inst 0xa08424a1 // smopa za1.s, p1/M, p1/M, z5.b, z4.b\n"
- ".inst 0xa08824a2 // smopa za2.s, p1/M, p1/M, z5.b, z8.b\n"
- ".inst 0xa08c24a3 // smopa za3.s, p1/M, p1/M, z5.b, z12.b\n"
- ".inst 0xa09227e0 // smopa za0.s, p1/M, p1/M, z31.b, z18.b\n"
- ".inst 0xa09627e1 // smopa za1.s, p1/M, p1/M, z31.b, z22.b\n"
- ".inst 0xa09a27e2 // smopa za2.s, p1/M, p1/M, z31.b, z26.b\n"
- ".inst 0xa09e27e3 // smopa za3.s, p1/M, p1/M, z31.b, z30.b\n"
- ".inst 0xa0902420 // smopa za0.s, p1/M, p1/M, z1.b, z16.b\n"
- ".inst 0xa0942421 // smopa za1.s, p1/M, p1/M, z1.b, z20.b\n"
- ".inst 0xa0982422 // smopa za2.s, p1/M, p1/M, z1.b, z24.b\n"
- ".inst 0xa09c2423 // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
- ".inst 0xa08324c0 // smopa za0.s, p1/M, p1/M, z6.b, z3.b\n"
- ".inst 0xa08724c1 // smopa za1.s, p1/M, p1/M, z6.b, z7.b\n"
- ".inst 0xa08b24c2 // smopa za2.s, p1/M, p1/M, z6.b, z11.b\n"
- ".inst 0xa08f24c3 // smopa za3.s, p1/M, p1/M, z6.b, z15.b\n"
+ ".inst 0xa0842680 // smopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0862682 // smopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa0872683 // smopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ ".inst 0xa0982560 // smopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa0992561 // smopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa09a2562 // smopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa09b2563 // smopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ ".inst 0xa09c2440 // smopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa09d2441 // smopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa09e2442 // smopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa09f2443 // smopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ ".inst 0xa09025c0 // smopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa09125c1 // smopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa09225c2 // smopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
- "ld1b { z16.b }, p1/Z, [x26]\n"
- "subs x20, x20, #0x1\n"
- "addvl x26, x26, #1\n"
+ "ld1b { z16.b }, p1/Z, [x25]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x25, x25, #1\n"
".inst 0xa04086e4 // ld1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #4\n"
".inst 0xa0842600 // smopa za0.s, p1/M, p1/M, z16.b, z4.b\n"
@@ -223,182 +222,182 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
".inst 0xa0872603 // smopa za3.s, p1/M, p1/M, z16.b, z7.b\n"
"bgt 9b\n"
"10:" // K oddments: End
- "ld1w { z15.s }, p1/Z, [x26]\n"
- "addvl x26, x26, #1\n"
+ "ld1w { z15.s }, p1/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
".inst 0xc09125e0 // addva za0.s, p1/M, p1/M, z15.s\n"
".inst 0xc09125e1 // addva za1.s, p1/M, p1/M, z15.s\n"
".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
- "tbz x15, #1, 14f\n"
- "tbz x15, #0, 12f\n"
+ "tbz x14, #1, 14f\n"
+ "tbz x14, #0, 12f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n"
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa040c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
- ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa060c5bc // st1w { z28.s-z31.s }, pn9.b, [x13]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa061c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
"addvl x13, x13, #16\n"
+ ".inst 0xa061c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa062c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
"blt 11b\n"
"b 21f\n"
"12:" // Store to partial result buffer: Store only
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa060c564 // st1w { z4.s-z7.s }, pn9.b, [x11]\n"
".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
- ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
- "addvl x13, x13, #16\n"
+ ".inst 0xa062c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
"blt 13b\n"
"b 21f\n"
"14:" // Store to output array
- "ldr x25, [%x[args], %[offsetof_C]]\n"
- "sub x24, x11, x10\n"
+ "ldr x24, [%x[args], %[offsetof_C]]\n"
+ "add x24, x24, x28\n" // C += n
+ "sub x23, x10, x9\n"
"ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x24, x9, x22, x24\n" // C += m * ldc
"ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "add x25, x25, x9\n" // C += n
"ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "madd x25, x10, x23, x25\n" // C += m * ldc
- "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
"ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
- "tbz x15, #2, 15f\n"
- "ldr w22, [%x[args], %[offsetof_n_0]]\n"
- "ldr x21, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "tbz x14, #2, 15f\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x28\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "add x22, x22, x9\n"
- "add x21, x21, x22, LSL #2\n"
- "add x20, x20, x22, LSL #2\n"
- ".inst 0xa040c2a4 // ld1w { z4.s-z7.s }, p8/Z, [x21]\n"
- ".inst 0xa040c280 // ld1w { z0.s-z3.s }, p8/Z, [x20]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x20\n"
- "whilelt p0.b, x9, x28\n"
- "cmp x24, x20\n"
- "mov x12, #0x0\n"
- "csel x20, x24, x20, LT\n"
+ "whilelt p0.b, x28, x27\n"
+ "cmp x23, x20\n"
+ "csel x20, x23, x20, LT\n"
"lsr x21, x20, #0x1\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x1\n"
"cbz x21, 17f\n"
"16:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860010 // mova { z16.s-z17.s }, za0h.s[x12, 0:1]\n"
- ".inst 0xc086005e // mova { z30.s-z31.s }, za1h.s[x12, 0:1]\n"
- ".inst 0xc086009a // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n"
- ".inst 0xc08600cc // mova { z12.s-z13.s }, za3h.s[x12, 0:1]\n"
- ".inst 0xc1a4a410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z4.s\n"
- ".inst 0xc1a5a41e // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z5.s\n"
+ ".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1a4a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
+ ".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1a5a41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+ ".inst 0xc1a6a416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
"add x12, x12, #0x2\n"
- ".inst 0xc1a6a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n"
"cmp x12, x21, LSL #1\n"
- ".inst 0xc1a7a40c // sqdmulh { z12.s-z13.s }, { z12.s-z13.s }, z7.s\n"
- ".inst 0xc1a0a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n"
- ".inst 0xc1a1a23e // srshl { z30.s-z31.s }, { z30.s-z31.s }, z1.s\n"
- ".inst 0xc1a2a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z2.s\n"
- ".inst 0xc1a3a22c // srshl { z12.s-z13.s }, { z12.s-z13.s }, z3.s\n"
- ".inst 0xc1a8a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z8.s\n"
- ".inst 0xc1a8a31e // add { z30.s-z31.s }, { z30.s-z31.s }, z8.s\n"
- ".inst 0xc1a8a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z8.s\n"
- ".inst 0xc1a8a30c // add { z12.s-z13.s }, { z12.s-z13.s }, z8.s\n"
- ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
- ".inst 0xc1b4c6be // sclamp { z30.s-z31.s }, z21.s, z20.s\n"
+ ".inst 0xc1a7a410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+ ".inst 0xc1aca23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc1ada23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+ ".inst 0xc1aea236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ ".inst 0xc1afa230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z0.s\n"
+ ".inst 0xc1a0a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z0.s\n"
+ ".inst 0xc1a0a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n"
".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
- ".inst 0xc1b4c6ac // sclamp { z12.s-z13.s }, z21.s, z20.s\n"
- "uzp1 z19.b, z16.b, z30.b\n"
- "uzp1 z18.b, z17.b, z31.b\n"
- "uzp1 z17.b, z26.b, z12.b\n"
- "uzp1 z16.b, z27.b, z13.b\n"
- "uzp1 z17.b, z19.b, z17.b\n"
- "uzp1 z16.b, z18.b, z16.b\n"
- "st1b { z17.b }, p0, [x25]\n"
- "add x25, x25, x23\n"
- "st1b { z16.b }, p0, [x25]\n"
- "add x25, x25, x23\n"
+ ".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
+ "uzp1 z19.b, z26.b, z28.b\n"
+ ".inst 0xc1b4c6b6 // sclamp { z22.s-z23.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z22.b, z16.b\n"
+ "uzp1 z18.b, z27.b, z29.b\n"
+ "uzp1 z17.b, z23.b, z17.b\n"
+ "uzp1 z16.b, z19.b, z16.b\n"
+ "st1b { z16.b }, p0, [x24]\n"
+ "add x24, x24, x22\n"
+ "uzp1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p0, [x24]\n"
+ "add x24, x24, x22\n"
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
".inst 0xc086000a // mova { z10.s-z11.s }, za0h.s[x12, 0:1]\n"
- ".inst 0xc086005a // mova { z26.s-z27.s }, za1h.s[x12, 0:1]\n"
- ".inst 0xc086008e // mova { z14.s-z15.s }, za2h.s[x12, 0:1]\n"
- ".inst 0xc08600d6 // mova { z22.s-z23.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
".inst 0xc1a4a40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z4.s\n"
- ".inst 0xc1a5a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z5.s\n"
- ".inst 0xc1a6a40e // sqdmulh { z14.s-z15.s }, { z14.s-z15.s }, z6.s\n"
- ".inst 0xc1a7a416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z7.s\n"
- ".inst 0xc1a0a22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n"
- ".inst 0xc1a1a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n"
- ".inst 0xc1a2a22e // srshl { z14.s-z15.s }, { z14.s-z15.s }, z2.s\n"
- ".inst 0xc1a3a236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z3.s\n"
- ".inst 0xc1a8a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z8.s\n"
- ".inst 0xc1a8a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z8.s\n"
- ".inst 0xc1a8a30e // add { z14.s-z15.s }, { z14.s-z15.s }, z8.s\n"
- ".inst 0xc1a8a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z8.s\n"
+ ".inst 0xc086009a // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600de // mova { z30.s-z31.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1a5a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+ ".inst 0xc1a6a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n"
+ ".inst 0xc1a7a41e // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z7.s\n"
+ ".inst 0xc1aca22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z12.s\n"
+ ".inst 0xc1ada238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+ ".inst 0xc1aea23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z14.s\n"
+ ".inst 0xc1afa23e // srshl { z30.s-z31.s }, { z30.s-z31.s }, z15.s\n"
+ ".inst 0xc1a0a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n"
+ ".inst 0xc1a0a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31e // add { z30.s-z31.s }, { z30.s-z31.s }, z0.s\n"
".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+ "uzp1 z17.b, z10.b, z24.b\n"
".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
- ".inst 0xc1b4c6ae // sclamp { z14.s-z15.s }, z21.s, z20.s\n"
- ".inst 0xc1b4c6b6 // sclamp { z22.s-z23.s }, z21.s, z20.s\n"
- "uzp1 z17.b, z10.b, z26.b\n"
- "uzp1 z16.b, z14.b, z22.b\n"
+ ".inst 0xc1b4c6be // sclamp { z30.s-z31.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z26.b, z30.b\n"
"uzp1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [x25]\n"
+ "st1b { z16.b }, p0, [x24]\n"
"18:" // Store to output array: Accumulator row 0 oddments: End
"19:" // Store to output array: End
- "tbz x15, #0, 21f\n"
+ "tbz x14, #0, 21f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"20:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14]\n"
- ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xa042c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x13, x13, #16\n"
"blt 20b\n"
"21:" // End block
- "incw x9, ALL, MUL #4\n"
- "cmp x9, x28\n"
+ "incw x28, ALL, MUL #4\n"
+ "cmp x28, x27\n"
"blt 3b\n"
- "incw x10\n"
- "mov x9, #0x0\n"
- "cmp x10, x11\n"
- "mov x27, x26\n"
+ "incw x9\n"
+ "cmp x9, x10\n"
+ "mov x28, #0x0\n"
+ "mov x26, x25\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
index 68b43328a2..b897efe0dc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
class cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int8_t result_type;
typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
@@ -61,7 +60,7 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
+ return false;
}
static constexpr bool supports_bias()
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_2VLx2VL;
- StdTransformsSME<lhs_operand_type, result_type, 2, 2, 4, true> transforms = {};
+ StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
index 8831a224ad..b60573898a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
@@ -49,7 +49,7 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
C(C), ldcb(ldc * sizeof(int8_t)),
M(M), N(N), K(K),
- min(0), max(0),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
bias(bias), n_0(n_0),
accumulator_buffer(accumulator_buffer),
@@ -74,14 +74,13 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
const long kstride_bytes;
int8_t *const C;
const long ldcb;
- const long M, N, K;
- int32_t min;
- int32_t max;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
const int32_t *const bias;
const int n_0;
-
int32_t *const accumulator_buffer;
uint64_t flags;
};
@@ -100,17 +99,17 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w13, [%x[args], %[offsetof_M]]\n"
@@ -125,108 +124,108 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa10a4294 // ld1w { z20.s, z28.s }, p8/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc0902680 // addha za0.s, p1/M, p1/M, z20.s\n"
- ".inst 0xc0902781 // addha za1.s, p1/M, p1/M, z28.s\n"
- ".inst 0xc0902682 // addha za2.s, p1/M, p1/M, z20.s\n"
- ".inst 0xc0902783 // addha za3.s, p1/M, p1/M, z28.s\n"
+ ".inst 0xa00a4299 // ldnt1w { z24.s-z25.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
+ ".inst 0xc0902702 // addha za2.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902723 // addha za3.s, p1/M, p1/M, z25.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
"incw x20, ALL, MUL #2\n"
"incw x21, ALL, MUL #2\n"
"cmp x20, x9\n"
- "mov x20, x16\n"
"csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
"bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
"cmp x21, x13\n"
"csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x3\n"
"lsr x20, x20, #0x2\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa0400778 // ld1b { z24.b-z25.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa14006f7 // ld1b { z23.b, z31.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa1410776 // ld1b { z22.b, z30.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa04106fa // ld1b { z26.b-z27.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa0420766 // ld1b { z6.b-z7.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14206e0 // ld1b { z0.b, z8.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa043077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa04306ec // ld1b { z12.b-z13.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa0972700 // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa09f2701 // smopa za1.s, p1/M, p1/M, z24.b, z31.b\n"
- ".inst 0xa0972722 // smopa za2.s, p1/M, p1/M, z25.b, z23.b\n"
- ".inst 0xa09f2723 // smopa za3.s, p1/M, p1/M, z25.b, z31.b\n"
- ".inst 0xa0400778 // ld1b { z24.b-z25.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa09a26c0 // smopa za0.s, p1/M, p1/M, z22.b, z26.b\n"
- ".inst 0xa14006f7 // ld1b { z23.b, z31.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa09b26c1 // smopa za1.s, p1/M, p1/M, z22.b, z27.b\n"
- ".inst 0xa09a27c2 // smopa za2.s, p1/M, p1/M, z30.b, z26.b\n"
- ".inst 0xa09b27c3 // smopa za3.s, p1/M, p1/M, z30.b, z27.b\n"
- ".inst 0xa1410776 // ld1b { z22.b, z30.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa08024c0 // smopa za0.s, p1/M, p1/M, z6.b, z0.b\n"
- ".inst 0xa04106fa // ld1b { z26.b-z27.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa08824c1 // smopa za1.s, p1/M, p1/M, z6.b, z8.b\n"
- ".inst 0xa08024e2 // smopa za2.s, p1/M, p1/M, z7.b, z0.b\n"
- ".inst 0xa08824e3 // smopa za3.s, p1/M, p1/M, z7.b, z8.b\n"
- ".inst 0xa0420766 // ld1b { z6.b-z7.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14206e0 // ld1b { z0.b, z8.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa08c2780 // smopa za0.s, p1/M, p1/M, z28.b, z12.b\n"
- ".inst 0xa08d2781 // smopa za1.s, p1/M, p1/M, z28.b, z13.b\n"
- ".inst 0xa08c27a2 // smopa za2.s, p1/M, p1/M, z29.b, z12.b\n"
- ".inst 0xa08d27a3 // smopa za3.s, p1/M, p1/M, z29.b, z13.b\n"
- ".inst 0xa043077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0xa0912460 // smopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0992461 // smopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa0912562 // smopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa0992563 // smopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa0962680 // smopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0972681 // smopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa0962782 // smopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa0972783 // smopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa09026a0 // smopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa09826a1 // smopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa09027a2 // smopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa09827a3 // smopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa08724a0 // smopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa08f24a1 // smopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa08725a2 // smopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa08f25a3 // smopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa04306ec // ld1b { z12.b-z13.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa0972700 // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
- ".inst 0xa09f2701 // smopa za1.s, p1/M, p1/M, z24.b, z31.b\n"
- ".inst 0xa0972722 // smopa za2.s, p1/M, p1/M, z25.b, z23.b\n"
- ".inst 0xa09f2723 // smopa za3.s, p1/M, p1/M, z25.b, z31.b\n"
- ".inst 0xa09a26c0 // smopa za0.s, p1/M, p1/M, z22.b, z26.b\n"
- ".inst 0xa09b26c1 // smopa za1.s, p1/M, p1/M, z22.b, z27.b\n"
- ".inst 0xa09a27c2 // smopa za2.s, p1/M, p1/M, z30.b, z26.b\n"
- ".inst 0xa09b27c3 // smopa za3.s, p1/M, p1/M, z30.b, z27.b\n"
- ".inst 0xa08024c0 // smopa za0.s, p1/M, p1/M, z6.b, z0.b\n"
- ".inst 0xa08824c1 // smopa za1.s, p1/M, p1/M, z6.b, z8.b\n"
- ".inst 0xa08024e2 // smopa za2.s, p1/M, p1/M, z7.b, z0.b\n"
- ".inst 0xa08824e3 // smopa za3.s, p1/M, p1/M, z7.b, z8.b\n"
- ".inst 0xa08c2780 // smopa za0.s, p1/M, p1/M, z28.b, z12.b\n"
- ".inst 0xa08d2781 // smopa za1.s, p1/M, p1/M, z28.b, z13.b\n"
- ".inst 0xa08c27a2 // smopa za2.s, p1/M, p1/M, z29.b, z12.b\n"
- ".inst 0xa08d27a3 // smopa za3.s, p1/M, p1/M, z29.b, z13.b\n"
+ ".inst 0xa0912460 // smopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+ ".inst 0xa0992461 // smopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa0912562 // smopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa0992563 // smopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa0962680 // smopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa0972681 // smopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa0962782 // smopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa0972783 // smopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa09026a0 // smopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa09826a1 // smopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa09027a2 // smopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa09827a3 // smopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa08724a0 // smopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa08f24a1 // smopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa08725a2 // smopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa08f25a3 // smopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa0400762 // ld1b { z2.b-z3.b }, pn9.b/Z, [x27]\n"
- "subs x20, x20, #0x1\n"
+ ".inst 0xa1400773 // ld1b { z19.b, z27.b }, pn9.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
"addvl x27, x27, #2\n"
".inst 0xa04006f0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #2\n"
- ".inst 0xa0902440 // smopa za0.s, p1/M, p1/M, z2.b, z16.b\n"
- ".inst 0xa0912441 // smopa za1.s, p1/M, p1/M, z2.b, z17.b\n"
- ".inst 0xa0902462 // smopa za2.s, p1/M, p1/M, z3.b, z16.b\n"
- ".inst 0xa0912463 // smopa za3.s, p1/M, p1/M, z3.b, z17.b\n"
+ ".inst 0xa0902660 // smopa za0.s, p1/M, p1/M, z19.b, z16.b\n"
+ ".inst 0xa0912661 // smopa za1.s, p1/M, p1/M, z19.b, z17.b\n"
+ ".inst 0xa0902762 // smopa za2.s, p1/M, p1/M, z27.b, z16.b\n"
+ ".inst 0xa0912763 // smopa za3.s, p1/M, p1/M, z27.b, z17.b\n"
"bgt 9b\n"
"10:" // K oddments: End
- ".inst 0xa1404767 // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040476e // ld1w { z14.s-z15.s }, pn9.b/Z, [x27]\n"
"addvl x27, x27, #2\n"
- ".inst 0xc09124e0 // addva za0.s, p1/M, p1/M, z7.s\n"
- ".inst 0xc09124e1 // addva za1.s, p1/M, p1/M, z7.s\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
"tbz x16, #1, 14f\n"
@@ -234,25 +233,25 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
- ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa060c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 24f\n"
@@ -260,71 +259,71 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c5dc // st1w { z28.s-z31.s }, pn9.b, [x14]\n"
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 24f\n"
"14:" // Store to output array
"ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10\n" // C += n
"sub x25, x13, x11\n"
- "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
- "ld1rw { z9.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z10.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "add x26, x26, x10\n" // C += n
- "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"madd x26, x11, x24, x26\n" // C += m * ldc
- "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
"tbz x16, #2, 15f\n"
- "ldr w22, [%x[args], %[offsetof_n_0]]\n"
- "ldr x21, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x10\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "add x22, x22, x10\n"
- "add x21, x21, x22, LSL #2\n"
- "add x20, x20, x22, LSL #2\n"
- ".inst 0xa04042a8 // ld1w { z8.s-z9.s }, p8/Z, [x21]\n"
- ".inst 0xa040428a // ld1w { z10.s-z11.s }, p8/Z, [x20]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x23\n"
"whilelt p0.h, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 17f\n"
"16:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a9ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1aaaa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
- ".inst 0xc1abaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z11.s\n"
- ".inst 0xc1afab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z15.s\n"
- ".inst 0xc1afab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z15.s\n"
- ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cea0 // sclamp { z0.s-z3.s }, z21.s, z20.s\n"
- "uzp1 z19.h, z28.h, z0.h\n"
- "uzp1 z18.h, z29.h, z1.h\n"
- "uzp1 z17.h, z30.h, z2.h\n"
- "uzp1 z16.h, z31.h, z3.h\n"
- "st1b { z19.h }, p0, [x26]\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z8.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z18.h }, p0, [x26]\n"
+ "uzp1 z16.h, z5.h, z9.h\n"
+ "uzp1 z17.h, z6.h, z10.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "uzp1 z16.h, z7.h, z11.h\n"
"st1b { z17.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1b { z16.h }, p0, [x26]\n"
@@ -332,59 +331,60 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
- ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
- ".inst 0xc1a8ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a9ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z9.s\n"
- ".inst 0xc1aaaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc1abaa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
- ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
- ".inst 0xc1afab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z15.s\n"
- ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
- "uzp1 z16.h, z4.h, z28.h\n"
+ ".inst 0xc1a2aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc1a3aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z8.h, z4.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
"subs x20, x20, #0x1\n"
- "uzp1 z16.h, z5.h, z29.h\n"
+ "uzp1 z16.h, z9.h, z5.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
- "uzp1 z16.h, z6.h, z30.h\n"
+ "uzp1 z16.h, z10.h, z6.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"18:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 22f\n"
+ "whilelt p0.h, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x20, x25, x23, LT\n"
"lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
- ".inst 0xc1a8ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a9ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z9.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1aaaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z10.s\n"
- ".inst 0xc1abaa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
- ".inst 0xc1afab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z15.s\n"
- ".inst 0xc1afab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z15.s\n"
- ".inst 0xc1b4cea0 // sclamp { z0.s-z3.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
- "uzp1 z19.h, z0.h, z28.h\n"
- "uzp1 z18.h, z1.h, z29.h\n"
- "uzp1 z17.h, z2.h, z30.h\n"
- "uzp1 z16.h, z3.h, z31.h\n"
- "st1b { z19.h }, p0, [x26]\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z20.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z18.h }, p0, [x26]\n"
+ "uzp1 z16.h, z5.h, z21.h\n"
+ "uzp1 z17.h, z6.h, z22.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "uzp1 z16.h, z7.h, z23.h\n"
"st1b { z17.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1b { z16.h }, p0, [x26]\n"
@@ -394,15 +394,15 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"cbz x20, 21f\n"
".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xc1a8ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a9ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- ".inst 0xc1aaaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc1abaa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
- ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
- ".inst 0xc1afab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z15.s\n"
- ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
- ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
"uzp1 z16.h, z4.h, z16.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
@@ -420,25 +420,25 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"23:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 23b\n"
"24:" // End block
"incw x10, ALL, MUL #2\n"
"cmp x10, x9\n"
"blt 3b\n"
"incw x11, ALL, MUL #2\n"
- "mov x10, #0x0\n"
"cmp x11, x13\n"
+ "mov x10, #0x0\n"
"mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
index 51fc52f7b7..5e22847853 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
class cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int8_t result_type;
typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
@@ -61,7 +60,7 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
+ return false;
}
static constexpr bool supports_bias()
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_4VLx1VL;
- StdTransformsSME<lhs_operand_type, result_type, 4, 1, 4, true> transforms = {};
+ StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
index df9a866b6d..7b8d34d350 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
@@ -49,7 +49,7 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
C(C), ldcb(ldc * sizeof(int8_t)),
M(M), N(N), K(K),
- min(0), max(0),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
bias(bias), n_0(n_0),
accumulator_buffer(accumulator_buffer),
@@ -74,14 +74,13 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
const long kstride_bytes;
int8_t *const C;
const long ldcb;
- const long M, N, K;
- int32_t min;
- int32_t max;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
const int32_t *const bias;
const int n_0;
-
int32_t *const accumulator_buffer;
uint64_t flags;
};
@@ -100,17 +99,17 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15]\n"
- ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w13, [%x[args], %[offsetof_M]]\n"
@@ -125,95 +124,95 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "ld1w { z6.s }, p0/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc09024c0 // addha za0.s, p1/M, p1/M, z6.s\n"
- ".inst 0xc09024c1 // addha za1.s, p1/M, p1/M, z6.s\n"
- ".inst 0xc09024c2 // addha za2.s, p1/M, p1/M, z6.s\n"
- ".inst 0xc09024c3 // addha za3.s, p1/M, p1/M, z6.s\n"
+ "ldnt1w { z8.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902500 // addha za0.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902501 // addha za1.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902502 // addha za2.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902503 // addha za3.s, p1/M, p1/M, z8.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
"incw x20\n"
"incw x21, ALL, MUL #4\n"
"cmp x20, x9\n"
- "mov x20, x16\n"
"csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
"bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
"cmp x21, x13\n"
"csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x3\n"
"lsr x20, x20, #0x2\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa1408360 // ld1b { z0.b, z4.b, z8.b, z12.b }, pn8.b/Z, [x27]\n"
- "ld1b { z29.b }, p1/Z, [x23]\n"
- ".inst 0xa1418361 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "ld1b { z19.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa0438378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1b { z31.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa09d2400 // smopa za0.s, p1/M, p1/M, z0.b, z29.b\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa09d2481 // smopa za1.s, p1/M, p1/M, z4.b, z29.b\n"
- ".inst 0xa09d2502 // smopa za2.s, p1/M, p1/M, z8.b, z29.b\n"
- ".inst 0xa09d2583 // smopa za3.s, p1/M, p1/M, z12.b, z29.b\n"
- ".inst 0xa1408360 // ld1b { z0.b, z4.b, z8.b, z12.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa0932420 // smopa za0.s, p1/M, p1/M, z1.b, z19.b\n"
- "ld1b { z29.b }, p1/Z, [x23]\n"
- ".inst 0xa09324a1 // smopa za1.s, p1/M, p1/M, z5.b, z19.b\n"
- ".inst 0xa0932522 // smopa za2.s, p1/M, p1/M, z9.b, z19.b\n"
- ".inst 0xa09325a3 // smopa za3.s, p1/M, p1/M, z13.b, z19.b\n"
- ".inst 0xa1418361 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0942460 // smopa za0.s, p1/M, p1/M, z3.b, z20.b\n"
- "ld1b { z19.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa09424e1 // smopa za1.s, p1/M, p1/M, z7.b, z20.b\n"
- ".inst 0xa0942562 // smopa za2.s, p1/M, p1/M, z11.b, z20.b\n"
- ".inst 0xa09425e3 // smopa za3.s, p1/M, p1/M, z15.b, z20.b\n"
- ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa09f2700 // smopa za0.s, p1/M, p1/M, z24.b, z31.b\n"
- ".inst 0xa09f2721 // smopa za1.s, p1/M, p1/M, z25.b, z31.b\n"
- ".inst 0xa09f2742 // smopa za2.s, p1/M, p1/M, z26.b, z31.b\n"
- ".inst 0xa09f2763 // smopa za3.s, p1/M, p1/M, z27.b, z31.b\n"
- ".inst 0xa0438378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa08e2480 // smopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa08e24a1 // smopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa08e24c2 // smopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa08e24e3 // smopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa09f2680 // smopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa09f26a1 // smopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa09f26c2 // smopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa09f26e3 // smopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa08d2700 // smopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa08d2721 // smopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa08d2742 // smopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa08d2763 // smopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa09d2500 // smopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa09d2521 // smopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa09d2542 // smopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09d2563 // smopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1b { z31.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa09d2400 // smopa za0.s, p1/M, p1/M, z0.b, z29.b\n"
- ".inst 0xa09d2481 // smopa za1.s, p1/M, p1/M, z4.b, z29.b\n"
- ".inst 0xa09d2502 // smopa za2.s, p1/M, p1/M, z8.b, z29.b\n"
- ".inst 0xa09d2583 // smopa za3.s, p1/M, p1/M, z12.b, z29.b\n"
- ".inst 0xa0932420 // smopa za0.s, p1/M, p1/M, z1.b, z19.b\n"
- ".inst 0xa09324a1 // smopa za1.s, p1/M, p1/M, z5.b, z19.b\n"
- ".inst 0xa0932522 // smopa za2.s, p1/M, p1/M, z9.b, z19.b\n"
- ".inst 0xa09325a3 // smopa za3.s, p1/M, p1/M, z13.b, z19.b\n"
- ".inst 0xa0942460 // smopa za0.s, p1/M, p1/M, z3.b, z20.b\n"
- ".inst 0xa09424e1 // smopa za1.s, p1/M, p1/M, z7.b, z20.b\n"
- ".inst 0xa0942562 // smopa za2.s, p1/M, p1/M, z11.b, z20.b\n"
- ".inst 0xa09425e3 // smopa za3.s, p1/M, p1/M, z15.b, z20.b\n"
- ".inst 0xa09f2700 // smopa za0.s, p1/M, p1/M, z24.b, z31.b\n"
- ".inst 0xa09f2721 // smopa za1.s, p1/M, p1/M, z25.b, z31.b\n"
- ".inst 0xa09f2742 // smopa za2.s, p1/M, p1/M, z26.b, z31.b\n"
- ".inst 0xa09f2763 // smopa za3.s, p1/M, p1/M, z27.b, z31.b\n"
+ ".inst 0xa08e2480 // smopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+ ".inst 0xa08e24a1 // smopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa08e24c2 // smopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa08e24e3 // smopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa09f2680 // smopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ ".inst 0xa09f26a1 // smopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa09f26c2 // smopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa09f26e3 // smopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa08d2700 // smopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ ".inst 0xa08d2721 // smopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa08d2742 // smopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa08d2763 // smopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa09d2500 // smopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa09d2521 // smopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa09d2542 // smopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09d2563 // smopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
"addvl x27, x27, #4\n"
"ld1b { z15.b }, p1/Z, [x23]\n"
"addvl x23, x23, #1\n"
@@ -234,25 +233,25 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
- ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa060c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14]\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa061c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 30f\n"
@@ -260,56 +259,56 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 30f\n"
"14:" // Store to output array
"ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10\n" // C += n
"sub x25, x13, x11\n"
"ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x26, x11, x24, x26\n" // C += m * ldc
"ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
- "add x26, x26, x10\n" // C += n
- "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
- "madd x26, x11, x24, x26\n" // C += m * ldc
- "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
"tbz x16, #2, 15f\n"
- "ldr w22, [%x[args], %[offsetof_n_0]]\n"
- "ldr x21, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x10\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "add x22, x22, x10\n"
- "add x21, x21, x22, LSL #2\n"
- "add x20, x20, x22, LSL #2\n"
- "ld1w { z2.s }, p0/Z, [x21]\n"
+ "add x20, x20, x21, LSL #2\n"
"ld1w { z1.s }, p0/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x23\n"
"whilelt p0.s, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 17f\n"
"16:" // Store to output array: Accumulator row 0 loop
".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- "add x12, x12, #0x4\n"
".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
- "cmp x12, x21, LSL #2\n"
+ "add x12, x12, #0x4\n"
".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
- ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1b { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1b { z17.s }, p0, [x26]\n"
@@ -321,55 +320,56 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
- ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
- "st1b { z4.s }, p0, [x26]\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1b { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
"subs x20, x20, #0x1\n"
- "st1b { z5.s }, p0, [x26]\n"
+ "st1b { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
- "st1b { z6.s }, p0, [x26]\n"
+ "st1b { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"18:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
- ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "st1b { z16.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1b { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z17.s }, p0, [x26]\n"
+ "st1b { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z18.s }, p0, [x26]\n"
+ "st1b { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z19.s }, p0, [x26]\n"
+ "st1b { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 19b\n"
"20:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 21f\n"
".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- "subs x20, x20, #0x1\n"
".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ "subs x20, x20, #0x1\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
- ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
"st1b { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 21f\n"
@@ -382,113 +382,115 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"21:" // Store to output array: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
"beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 23f\n"
"22:" // Store to output array: Accumulator row 2 loop
- ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
- ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
- "st1b { z12.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1b4cea8 // sclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ "st1b { z8.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z13.s }, p0, [x26]\n"
+ "st1b { z9.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z14.s }, p0, [x26]\n"
+ "st1b { z10.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z15.s }, p0, [x26]\n"
+ "st1b { z11.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 22b\n"
"23:" // Store to output array: Accumulator row 2 oddments
"cbz x20, 24f\n"
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
- ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "st1b { z16.s }, p0, [x26]\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1b4ceac // sclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ "st1b { z12.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
"subs x20, x20, #0x1\n"
- "st1b { z17.s }, p0, [x26]\n"
+ "st1b { z13.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
- "st1b { z18.s }, p0, [x26]\n"
+ "st1b { z14.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"24:" // Store to output array: Accumulator row 2 oddments: End
"subs x25, x25, x22\n"
"beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x20, x25, x23, LT\n"
"lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 26f\n"
"25:" // Store to output array: Accumulator row 3 loop
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
- ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "st1b { z16.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z17.s }, p0, [x26]\n"
+ "st1b { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z18.s }, p0, [x26]\n"
+ "st1b { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z19.s }, p0, [x26]\n"
+ "st1b { z31.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 25b\n"
"26:" // Store to output array: Accumulator row 3 oddments
"cbz x20, 27f\n"
- ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
- ".inst 0xc1a1aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n"
- ".inst 0xc1a0ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n"
- ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
- "st1b { z20.s }, p0, [x26]\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 27f\n"
"subs x20, x20, #0x1\n"
- "st1b { z21.s }, p0, [x26]\n"
+ "st1b { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 27f\n"
- "st1b { z22.s }, p0, [x26]\n"
+ "st1b { z30.s }, p0, [x26]\n"
"27:" // Store to output array: Accumulator row 3 oddments: End
"28:" // Store to output array: End
"tbz x16, #0, 30f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"29:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15]\n"
- ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 29b\n"
"30:" // End block
"incw x10\n"
"cmp x10, x9\n"
"blt 3b\n"
"incw x11, ALL, MUL #4\n"
- "mov x10, #0x0\n"
"cmp x11, x13\n"
+ "mov x10, #0x0\n"
"mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp
index fe8f5383bd..7792192856 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL(const int8_t *const A, const
class cls_sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef float result_type;
typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL;
- StdTransformsSME<lhs_operand_type, result_type, 1, 4, 4> transforms = {};
+ StdTransformsSME<operand_type, result_type, 1, 4, 4> transforms = {};
cls_sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL/generic.cpp
index bf60c61fc0..4b26a6578c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL/generic.cpp
@@ -40,8 +40,7 @@ void sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL(const int8_t *const A, const
const int8_t *const B,
float *const C, const int ldc,
const int M, const int N, const int K,
- const int32_t *const bias,
- const DequantizeFloat &, const float *const late_bias, const Activation act,
+ const int32_t *const bias, const float *const late_bias, const Activation act,
bool accumulate,
int32_t *const accumulator_buffer
) : A(A),
@@ -95,7 +94,7 @@ void sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL(const int8_t *const A, const
};
// Construct arguments for this kernel
- KernelArgs args(A, B, C, ldc, M, N, K, bias, dq, late_bias, act, accumulate, accumulator_buffer);
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, late_bias, act, accumulate, accumulator_buffer);
__asm__ __volatile__(
"ldr x13, [%x[args], %[offsetof_flags]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL.hpp
index edb6737dc7..df2c9c0ca3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL(const int8_t *const A, const
class cls_sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef float result_type;
typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL;
- StdTransformsSME<lhs_operand_type, result_type, 2, 2, 4> transforms = {};
+ StdTransformsSME<operand_type, result_type, 2, 2, 4> transforms = {};
cls_sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL/generic.cpp
index ff649f11f6..1631fae8e9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL/generic.cpp
@@ -40,8 +40,7 @@ void sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL(const int8_t *const A, const
const int8_t *const B,
float *const C, const int ldc,
const int M, const int N, const int K,
- const int32_t *const bias,
- const DequantizeFloat &, const float *const late_bias, const Activation act,
+ const int32_t *const bias, const float *const late_bias, const Activation act,
bool accumulate,
int32_t *const accumulator_buffer
) : A(A),
@@ -95,7 +94,7 @@ void sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL(const int8_t *const A, const
};
// Construct arguments for this kernel
- KernelArgs args(A, B, C, ldc, M, N, K, bias, dq, late_bias, act, accumulate, accumulator_buffer);
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, late_bias, act, accumulate, accumulator_buffer);
__asm__ __volatile__(
"ldr x16, [%x[args], %[offsetof_flags]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL.hpp
index 112f5ef0e8..70952f4f03 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL(const int8_t *const A, const
class cls_sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef float result_type;
typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL;
- StdTransformsSME<lhs_operand_type, result_type, 4, 1, 4> transforms = {};
+ StdTransformsSME<operand_type, result_type, 4, 1, 4> transforms = {};
cls_sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL/generic.cpp
index a08ea8311a..bafb16bca8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL/generic.cpp
@@ -40,8 +40,7 @@ void sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL(const int8_t *const A, const
const int8_t *const B,
float *const C, const int ldc,
const int M, const int N, const int K,
- const int32_t *const bias,
- const DequantizeFloat &, const float *const late_bias, const Activation act,
+ const int32_t *const bias, const float *const late_bias, const Activation act,
bool accumulate,
int32_t *const accumulator_buffer
) : A(A),
@@ -95,7 +94,7 @@ void sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL(const int8_t *const A, const
};
// Construct arguments for this kernel
- KernelArgs args(A, B, C, ldc, M, N, K, bias, dq, late_bias, act, accumulate, accumulator_buffer);
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, late_bias, act, accumulate, accumulator_buffer);
__asm__ __volatile__(
"ldr x16, [%x[args], %[offsetof_flags]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
index 0c8de041cb..84386009a0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
class cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL;
- StdTransformsSME<lhs_operand_type, result_type, 1, 4, 4> transforms = {};
+ StdTransformsSME<operand_type, result_type, 1, 4, 4> transforms = {};
cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
index a643fb265b..67c759410a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
@@ -48,6 +48,7 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
C(C), ldcb(ldc * sizeof(int32_t)),
M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
bias(bias),
accumulator_buffer(accumulator_buffer),
@@ -68,11 +69,10 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
const long kstride_bytes;
int32_t *const C;
const long ldcb;
- const long M, N, K;
+ const long M, N, K, n_loops, n_tail_iters;
const int32_t *const bias;
-
int32_t *const accumulator_buffer;
uint64_t flags;
};
@@ -92,16 +92,16 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
".inst 0xa040c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11]\n"
- ".inst 0xa041c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
- ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
- ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
- "addvl x11, x11, #16\n"
- ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x11, x11, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w9, [%x[args], %[offsetof_M]]\n"
@@ -116,102 +116,102 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa01bc288 // ld1w { z8.s-z11.s }, p8/Z, [x20, x27, LSL #2]\n"
- ".inst 0xc0900100 // addha za0.s, p0/M, p0/M, z8.s\n"
- ".inst 0xc0900121 // addha za1.s, p0/M, p0/M, z9.s\n"
- ".inst 0xc0900142 // addha za2.s, p0/M, p0/M, z10.s\n"
- ".inst 0xc0900163 // addha za3.s, p0/M, p0/M, z11.s\n"
+ ".inst 0xa11bc29b // ldnt1w { z19.s, z23.s, z27.s, z31.s }, p8/Z, [x20, x27, LSL #2]\n"
+ ".inst 0xc0900260 // addha za0.s, p0/M, p0/M, z19.s\n"
+ ".inst 0xc09002e1 // addha za1.s, p0/M, p0/M, z23.s\n"
+ ".inst 0xc0900362 // addha za2.s, p0/M, p0/M, z27.s\n"
+ ".inst 0xc09003e3 // addha za3.s, p0/M, p0/M, z31.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x27\n"
"mov x21, x28\n"
"incw x20, ALL, MUL #4\n"
"incw x21\n"
"cmp x20, x26\n"
- "mov x20, x13\n"
"csel x21, x28, x21, LT\n"
+ "mov x20, x13\n"
"bfm x13, XZR, #0x0, #0x0 // bfc x13, #0x0, #0x1\n"
"cmp x21, x9\n"
"csel x13, x20, x13, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x3\n"
"lsr x20, x20, #0x2\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x27, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- "ld1b { z21.b }, p0/Z, [x24]\n"
- ".inst 0xa04086f8 // ld1b { z24.b-z27.b }, pn9.b/Z, [x23]\n"
- "ld1b { z6.b }, p0/Z, [x24, #1, MUL VL]\n"
- ".inst 0xa14186e1 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- "ld1b { z31.b }, p0/Z, [x24, #2, MUL VL]\n"
- ".inst 0xa14286e3 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- "ld1b { z23.b }, p0/Z, [x24, #3, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x27, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ "ld1b { z30.b }, p0/Z, [x24]\n"
+ ".inst 0xa04086e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23]\n"
+ "ld1b { z21.b }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1b { z28.b }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1b { z11.b }, p0/Z, [x24, #3, MUL VL]\n"
"addvl x24, x24, #4\n"
- ".inst 0xa14386e0 // ld1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"ble 7f\n"
"6:" // K loop
+ ".inst 0xa08003c0 // smopa za0.s, p0/M, p0/M, z30.b, z0.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa08103c1 // smopa za1.s, p0/M, p0/M, z30.b, z1.b\n"
+ ".inst 0xa08203c2 // smopa za2.s, p0/M, p0/M, z30.b, z2.b\n"
+ ".inst 0xa08303c3 // smopa za3.s, p0/M, p0/M, z30.b, z3.b\n"
+ "ld1b { z30.b }, p0/Z, [x24]\n"
".inst 0xa09802a0 // smopa za0.s, p0/M, p0/M, z21.b, z24.b\n"
- "subs x21, x21, #0x1\n"
+ ".inst 0xa04086e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23]\n"
".inst 0xa09902a1 // smopa za1.s, p0/M, p0/M, z21.b, z25.b\n"
".inst 0xa09a02a2 // smopa za2.s, p0/M, p0/M, z21.b, z26.b\n"
".inst 0xa09b02a3 // smopa za3.s, p0/M, p0/M, z21.b, z27.b\n"
- "ld1b { z21.b }, p0/Z, [x24]\n"
- ".inst 0xa08100c0 // smopa za0.s, p0/M, p0/M, z6.b, z1.b\n"
- ".inst 0xa04086f8 // ld1b { z24.b-z27.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa08500c1 // smopa za1.s, p0/M, p0/M, z6.b, z5.b\n"
- ".inst 0xa08900c2 // smopa za2.s, p0/M, p0/M, z6.b, z9.b\n"
- ".inst 0xa08d00c3 // smopa za3.s, p0/M, p0/M, z6.b, z13.b\n"
- "ld1b { z6.b }, p0/Z, [x24, #1, MUL VL]\n"
- ".inst 0xa08303e0 // smopa za0.s, p0/M, p0/M, z31.b, z3.b\n"
- ".inst 0xa14186e1 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa08703e1 // smopa za1.s, p0/M, p0/M, z31.b, z7.b\n"
- ".inst 0xa08b03e2 // smopa za2.s, p0/M, p0/M, z31.b, z11.b\n"
- ".inst 0xa08f03e3 // smopa za3.s, p0/M, p0/M, z31.b, z15.b\n"
- "ld1b { z31.b }, p0/Z, [x24, #2, MUL VL]\n"
- ".inst 0xa14286e3 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- ".inst 0xa08002e0 // smopa za0.s, p0/M, p0/M, z23.b, z0.b\n"
- ".inst 0xa08402e1 // smopa za1.s, p0/M, p0/M, z23.b, z4.b\n"
- ".inst 0xa08802e2 // smopa za2.s, p0/M, p0/M, z23.b, z8.b\n"
- ".inst 0xa08c02e3 // smopa za3.s, p0/M, p0/M, z23.b, z12.b\n"
- "ld1b { z23.b }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1b { z21.b }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa0840380 // smopa za0.s, p0/M, p0/M, z28.b, z4.b\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0850381 // smopa za1.s, p0/M, p0/M, z28.b, z5.b\n"
+ ".inst 0xa0860382 // smopa za2.s, p0/M, p0/M, z28.b, z6.b\n"
+ ".inst 0xa0870383 // smopa za3.s, p0/M, p0/M, z28.b, z7.b\n"
+ "ld1b { z28.b }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xa0900160 // smopa za0.s, p0/M, p0/M, z11.b, z16.b\n"
+ ".inst 0xa0910161 // smopa za1.s, p0/M, p0/M, z11.b, z17.b\n"
+ ".inst 0xa0920162 // smopa za2.s, p0/M, p0/M, z11.b, z18.b\n"
+ ".inst 0xa0930163 // smopa za3.s, p0/M, p0/M, z11.b, z19.b\n"
+ "ld1b { z11.b }, p0/Z, [x24, #3, MUL VL]\n"
"addvl x24, x24, #4\n"
- ".inst 0xa14386e0 // ld1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"bgt 6b\n"
"7:" // K loop tail
+ ".inst 0xa08003c0 // smopa za0.s, p0/M, p0/M, z30.b, z0.b\n"
+ ".inst 0xa08103c1 // smopa za1.s, p0/M, p0/M, z30.b, z1.b\n"
+ ".inst 0xa08203c2 // smopa za2.s, p0/M, p0/M, z30.b, z2.b\n"
+ ".inst 0xa08303c3 // smopa za3.s, p0/M, p0/M, z30.b, z3.b\n"
".inst 0xa09802a0 // smopa za0.s, p0/M, p0/M, z21.b, z24.b\n"
".inst 0xa09902a1 // smopa za1.s, p0/M, p0/M, z21.b, z25.b\n"
".inst 0xa09a02a2 // smopa za2.s, p0/M, p0/M, z21.b, z26.b\n"
".inst 0xa09b02a3 // smopa za3.s, p0/M, p0/M, z21.b, z27.b\n"
- ".inst 0xa08100c0 // smopa za0.s, p0/M, p0/M, z6.b, z1.b\n"
- ".inst 0xa08500c1 // smopa za1.s, p0/M, p0/M, z6.b, z5.b\n"
- ".inst 0xa08900c2 // smopa za2.s, p0/M, p0/M, z6.b, z9.b\n"
- ".inst 0xa08d00c3 // smopa za3.s, p0/M, p0/M, z6.b, z13.b\n"
- ".inst 0xa08303e0 // smopa za0.s, p0/M, p0/M, z31.b, z3.b\n"
- ".inst 0xa08703e1 // smopa za1.s, p0/M, p0/M, z31.b, z7.b\n"
- ".inst 0xa08b03e2 // smopa za2.s, p0/M, p0/M, z31.b, z11.b\n"
- ".inst 0xa08f03e3 // smopa za3.s, p0/M, p0/M, z31.b, z15.b\n"
- ".inst 0xa08002e0 // smopa za0.s, p0/M, p0/M, z23.b, z0.b\n"
- ".inst 0xa08402e1 // smopa za1.s, p0/M, p0/M, z23.b, z4.b\n"
- ".inst 0xa08802e2 // smopa za2.s, p0/M, p0/M, z23.b, z8.b\n"
- ".inst 0xa08c02e3 // smopa za3.s, p0/M, p0/M, z23.b, z12.b\n"
+ ".inst 0xa0840380 // smopa za0.s, p0/M, p0/M, z28.b, z4.b\n"
+ ".inst 0xa0850381 // smopa za1.s, p0/M, p0/M, z28.b, z5.b\n"
+ ".inst 0xa0860382 // smopa za2.s, p0/M, p0/M, z28.b, z6.b\n"
+ ".inst 0xa0870383 // smopa za3.s, p0/M, p0/M, z28.b, z7.b\n"
+ ".inst 0xa0900160 // smopa za0.s, p0/M, p0/M, z11.b, z16.b\n"
+ ".inst 0xa0910161 // smopa za1.s, p0/M, p0/M, z11.b, z17.b\n"
+ ".inst 0xa0920162 // smopa za2.s, p0/M, p0/M, z11.b, z18.b\n"
+ ".inst 0xa0930163 // smopa za3.s, p0/M, p0/M, z11.b, z19.b\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
- "ld1b { z14.b }, p0/Z, [x24]\n"
- "subs x20, x20, #0x1\n"
+ "ld1b { z22.b }, p0/Z, [x24]\n"
+ "subs x21, x21, #0x1\n"
"addvl x24, x24, #1\n"
- ".inst 0xa14086e1 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa14086f1 // ld1b { z17.b, z21.b, z25.b, z29.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #4\n"
- ".inst 0xa08101c0 // smopa za0.s, p0/M, p0/M, z14.b, z1.b\n"
- ".inst 0xa08501c1 // smopa za1.s, p0/M, p0/M, z14.b, z5.b\n"
- ".inst 0xa08901c2 // smopa za2.s, p0/M, p0/M, z14.b, z9.b\n"
- ".inst 0xa08d01c3 // smopa za3.s, p0/M, p0/M, z14.b, z13.b\n"
+ ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
+ ".inst 0xa09502c1 // smopa za1.s, p0/M, p0/M, z22.b, z21.b\n"
+ ".inst 0xa09902c2 // smopa za2.s, p0/M, p0/M, z22.b, z25.b\n"
+ ".inst 0xa09d02c3 // smopa za3.s, p0/M, p0/M, z22.b, z29.b\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x13, #1, 14f\n"
@@ -219,25 +219,25 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11]\n"
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa040c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
- ".inst 0xa042c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
- ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x11, x11, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa060c548 // st1w { z8.s-z11.s }, pn9.b, [x10]\n"
- ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xa061c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0x4, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c55c // st1w { z28.s-z31.s }, pn9.b, [x10, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ ".inst 0xa060c544 // st1w { z4.s-z7.s }, pn9.b, [x10]\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xa061c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xa062c55c // st1w { z28.s-z31.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
"addvl x10, x10, #16\n"
"blt 11b\n"
"b 20f\n"
@@ -245,16 +245,16 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c544 // st1w { z4.s-z7.s }, pn9.b, [x10]\n"
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa060c54c // st1w { z12.s-z15.s }, pn9.b, [x10]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa061c544 // st1w { z4.s-z7.s }, pn9.b, [x10, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0x8, MUL VL]\n"
- ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ ".inst 0xa062c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0xc, MUL VL]\n"
"addvl x10, x10, #16\n"
"blt 13b\n"
"b 20f\n"
@@ -264,11 +264,11 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"cntw x20\n"
"ldr x22, [%x[args], %[offsetof_ldcb]]\n"
"cmp x21, x20\n"
- "mov x12, #0x0\n"
"csel x20, x21, x20, LT\n"
"add x23, x23, x27, LSL #2\n" // C += n
"lsr x21, x20, #0x2\n"
"madd x23, x28, x22, x23\n" // C += m * ldc
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Accumulator row 0 loop
@@ -278,55 +278,55 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
".inst 0xa160c2e0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x23]\n"
"add x23, x23, x22\n"
- "add x12, x12, #0x4\n"
".inst 0xa160c2e1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x23]\n"
"add x23, x23, x22\n"
- "cmp x12, x21, LSL #2\n"
+ "add x12, x12, #0x4\n"
".inst 0xa160c2e2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x23]\n"
"add x23, x23, x22\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xa160c2e3 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x23]\n"
"add x23, x23, x22\n"
"blt 15b\n"
"16:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa160c2e0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x23]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa160c2f0 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x23]\n"
"add x23, x23, x22\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa160c2e1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x23]\n"
+ ".inst 0xa160c2f1 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x23]\n"
"add x23, x23, x22\n"
"beq 17f\n"
- ".inst 0xa160c2e2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x23]\n"
+ ".inst 0xa160c2f2 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x23]\n"
"17:" // Store to output array: Accumulator row 0 oddments: End
"18:" // Store to output array: End
"tbz x13, #0, 20f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"19:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11]\n"
- ".inst 0xa041c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
- ".inst 0xa042c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa040c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
- ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
- "addvl x11, x11, #16\n"
- ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x11, x11, #16\n"
"blt 19b\n"
"20:" // End block
"incw x27, ALL, MUL #4\n"
"cmp x27, x26\n"
"blt 3b\n"
"incw x28\n"
- "mov x27, #0x0\n"
"cmp x28, x9\n"
+ "mov x27, #0x0\n"
"mov x25, x24\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
index 074a8819f9..2899d7553c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
class cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL;
- StdTransformsSME<lhs_operand_type, result_type, 2, 2, 4> transforms = {};
+ StdTransformsSME<operand_type, result_type, 2, 2, 4> transforms = {};
cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
index ae14cd7d50..7f44e5ffe5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
@@ -48,6 +48,7 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
C(C), ldcb(ldc * sizeof(int32_t)),
M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
bias(bias),
accumulator_buffer(accumulator_buffer),
@@ -68,11 +69,10 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
const long kstride_bytes;
int32_t *const C;
const long ldcb;
- const long M, N, K;
+ const long M, N, K, n_loops, n_tail_iters;
const int32_t *const bias;
-
int32_t *const accumulator_buffer;
uint64_t flags;
};
@@ -91,17 +91,17 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w13, [%x[args], %[offsetof_M]]\n"
@@ -116,102 +116,102 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa10a4296 // ld1w { z22.s, z30.s }, p8/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc09002c0 // addha za0.s, p0/M, p0/M, z22.s\n"
- ".inst 0xc09003c1 // addha za1.s, p0/M, p0/M, z30.s\n"
- ".inst 0xc09002c2 // addha za2.s, p0/M, p0/M, z22.s\n"
- ".inst 0xc09003c3 // addha za3.s, p0/M, p0/M, z30.s\n"
+ ".inst 0xa00a4295 // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0900280 // addha za0.s, p0/M, p0/M, z20.s\n"
+ ".inst 0xc09002a1 // addha za1.s, p0/M, p0/M, z21.s\n"
+ ".inst 0xc0900282 // addha za2.s, p0/M, p0/M, z20.s\n"
+ ".inst 0xc09002a3 // addha za3.s, p0/M, p0/M, z21.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
"incw x20, ALL, MUL #2\n"
"incw x21, ALL, MUL #2\n"
"cmp x20, x9\n"
- "mov x20, x16\n"
"csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
"bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
"cmp x21, x13\n"
"csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x3\n"
"lsr x20, x20, #0x2\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa1400766 // ld1b { z6.b, z14.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa04006e2 // ld1b { z2.b-z3.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa041077a // ld1b { z26.b-z27.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa04106f6 // ld1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa1420765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14206f5 // ld1b { z21.b, z29.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa0430760 // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa14006e8 // ldnt1b { z0.b, z8.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0410762 // ld1b { z2.b-z3.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa14106ff // ldnt1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa042076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14306f1 // ld1b { z17.b, z25.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa04306f5 // ldnt1b { z20.b-z21.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa08200c0 // smopa za0.s, p0/M, p0/M, z6.b, z2.b\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa08300c1 // smopa za1.s, p0/M, p0/M, z6.b, z3.b\n"
- ".inst 0xa08201c2 // smopa za2.s, p0/M, p0/M, z14.b, z2.b\n"
- ".inst 0xa08301c3 // smopa za3.s, p0/M, p0/M, z14.b, z3.b\n"
- ".inst 0xa1400766 // ld1b { z6.b, z14.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa0960340 // smopa za0.s, p0/M, p0/M, z26.b, z22.b\n"
- ".inst 0xa04006e2 // ld1b { z2.b-z3.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa0970341 // smopa za1.s, p0/M, p0/M, z26.b, z23.b\n"
- ".inst 0xa0960362 // smopa za2.s, p0/M, p0/M, z27.b, z22.b\n"
- ".inst 0xa0970363 // smopa za3.s, p0/M, p0/M, z27.b, z23.b\n"
- ".inst 0xa041077a // ld1b { z26.b-z27.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa09500a0 // smopa za0.s, p0/M, p0/M, z5.b, z21.b\n"
- ".inst 0xa04106f6 // ld1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa09d00a1 // smopa za1.s, p0/M, p0/M, z5.b, z29.b\n"
- ".inst 0xa09501a2 // smopa za2.s, p0/M, p0/M, z13.b, z21.b\n"
- ".inst 0xa09d01a3 // smopa za3.s, p0/M, p0/M, z13.b, z29.b\n"
- ".inst 0xa1420765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14206f5 // ld1b { z21.b, z29.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa0910000 // smopa za0.s, p0/M, p0/M, z0.b, z17.b\n"
- ".inst 0xa0990001 // smopa za1.s, p0/M, p0/M, z0.b, z25.b\n"
- ".inst 0xa0910022 // smopa za2.s, p0/M, p0/M, z1.b, z17.b\n"
- ".inst 0xa0990023 // smopa za3.s, p0/M, p0/M, z1.b, z25.b\n"
- ".inst 0xa0430760 // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0xa0800380 // smopa za0.s, p0/M, p0/M, z28.b, z0.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0880381 // smopa za1.s, p0/M, p0/M, z28.b, z8.b\n"
+ ".inst 0xa08003a2 // smopa za2.s, p0/M, p0/M, z29.b, z0.b\n"
+ ".inst 0xa08803a3 // smopa za3.s, p0/M, p0/M, z29.b, z8.b\n"
+ ".inst 0xa040077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa0970040 // smopa za0.s, p0/M, p0/M, z2.b, z23.b\n"
+ ".inst 0xa14006e8 // ldnt1b { z0.b, z8.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa09f0041 // smopa za1.s, p0/M, p0/M, z2.b, z31.b\n"
+ ".inst 0xa0970062 // smopa za2.s, p0/M, p0/M, z3.b, z23.b\n"
+ ".inst 0xa09f0063 // smopa za3.s, p0/M, p0/M, z3.b, z31.b\n"
+ ".inst 0xa0410762 // ld1b { z2.b-z3.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa09001c0 // smopa za0.s, p0/M, p0/M, z14.b, z16.b\n"
+ ".inst 0xa14106ff // ldnt1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa09801c1 // smopa za1.s, p0/M, p0/M, z14.b, z24.b\n"
+ ".inst 0xa09001e2 // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n"
+ ".inst 0xa09801e3 // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n"
+ ".inst 0xa042076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0940080 // smopa za0.s, p0/M, p0/M, z4.b, z20.b\n"
+ ".inst 0xa0950081 // smopa za1.s, p0/M, p0/M, z4.b, z21.b\n"
+ ".inst 0xa09400a2 // smopa za2.s, p0/M, p0/M, z5.b, z20.b\n"
+ ".inst 0xa09500a3 // smopa za3.s, p0/M, p0/M, z5.b, z21.b\n"
+ ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14306f1 // ld1b { z17.b, z25.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa04306f5 // ldnt1b { z20.b-z21.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa08200c0 // smopa za0.s, p0/M, p0/M, z6.b, z2.b\n"
- ".inst 0xa08300c1 // smopa za1.s, p0/M, p0/M, z6.b, z3.b\n"
- ".inst 0xa08201c2 // smopa za2.s, p0/M, p0/M, z14.b, z2.b\n"
- ".inst 0xa08301c3 // smopa za3.s, p0/M, p0/M, z14.b, z3.b\n"
- ".inst 0xa0960340 // smopa za0.s, p0/M, p0/M, z26.b, z22.b\n"
- ".inst 0xa0970341 // smopa za1.s, p0/M, p0/M, z26.b, z23.b\n"
- ".inst 0xa0960362 // smopa za2.s, p0/M, p0/M, z27.b, z22.b\n"
- ".inst 0xa0970363 // smopa za3.s, p0/M, p0/M, z27.b, z23.b\n"
- ".inst 0xa09500a0 // smopa za0.s, p0/M, p0/M, z5.b, z21.b\n"
- ".inst 0xa09d00a1 // smopa za1.s, p0/M, p0/M, z5.b, z29.b\n"
- ".inst 0xa09501a2 // smopa za2.s, p0/M, p0/M, z13.b, z21.b\n"
- ".inst 0xa09d01a3 // smopa za3.s, p0/M, p0/M, z13.b, z29.b\n"
- ".inst 0xa0910000 // smopa za0.s, p0/M, p0/M, z0.b, z17.b\n"
- ".inst 0xa0990001 // smopa za1.s, p0/M, p0/M, z0.b, z25.b\n"
- ".inst 0xa0910022 // smopa za2.s, p0/M, p0/M, z1.b, z17.b\n"
- ".inst 0xa0990023 // smopa za3.s, p0/M, p0/M, z1.b, z25.b\n"
+ ".inst 0xa0800380 // smopa za0.s, p0/M, p0/M, z28.b, z0.b\n"
+ ".inst 0xa0880381 // smopa za1.s, p0/M, p0/M, z28.b, z8.b\n"
+ ".inst 0xa08003a2 // smopa za2.s, p0/M, p0/M, z29.b, z0.b\n"
+ ".inst 0xa08803a3 // smopa za3.s, p0/M, p0/M, z29.b, z8.b\n"
+ ".inst 0xa0970040 // smopa za0.s, p0/M, p0/M, z2.b, z23.b\n"
+ ".inst 0xa09f0041 // smopa za1.s, p0/M, p0/M, z2.b, z31.b\n"
+ ".inst 0xa0970062 // smopa za2.s, p0/M, p0/M, z3.b, z23.b\n"
+ ".inst 0xa09f0063 // smopa za3.s, p0/M, p0/M, z3.b, z31.b\n"
+ ".inst 0xa09001c0 // smopa za0.s, p0/M, p0/M, z14.b, z16.b\n"
+ ".inst 0xa09801c1 // smopa za1.s, p0/M, p0/M, z14.b, z24.b\n"
+ ".inst 0xa09001e2 // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n"
+ ".inst 0xa09801e3 // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n"
+ ".inst 0xa0940080 // smopa za0.s, p0/M, p0/M, z4.b, z20.b\n"
+ ".inst 0xa0950081 // smopa za1.s, p0/M, p0/M, z4.b, z21.b\n"
+ ".inst 0xa09400a2 // smopa za2.s, p0/M, p0/M, z5.b, z20.b\n"
+ ".inst 0xa09500a3 // smopa za3.s, p0/M, p0/M, z5.b, z21.b\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa1400777 // ld1b { z23.b, z31.b }, pn9.b/Z, [x27]\n"
- "subs x20, x20, #0x1\n"
+ ".inst 0xa1400774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
"addvl x27, x27, #2\n"
".inst 0xa14006e7 // ld1b { z7.b, z15.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #2\n"
- ".inst 0xa08702e0 // smopa za0.s, p0/M, p0/M, z23.b, z7.b\n"
- ".inst 0xa08f02e1 // smopa za1.s, p0/M, p0/M, z23.b, z15.b\n"
- ".inst 0xa08703e2 // smopa za2.s, p0/M, p0/M, z31.b, z7.b\n"
- ".inst 0xa08f03e3 // smopa za3.s, p0/M, p0/M, z31.b, z15.b\n"
+ ".inst 0xa0870280 // smopa za0.s, p0/M, p0/M, z20.b, z7.b\n"
+ ".inst 0xa08f0281 // smopa za1.s, p0/M, p0/M, z20.b, z15.b\n"
+ ".inst 0xa0870382 // smopa za2.s, p0/M, p0/M, z28.b, z7.b\n"
+ ".inst 0xa08f0383 // smopa za3.s, p0/M, p0/M, z28.b, z15.b\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -219,25 +219,25 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xa041c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa042c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa060c5dc // st1w { z28.s-z31.s }, pn9.b, [x14]\n"
- ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa060c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 23f\n"
@@ -245,16 +245,16 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
- ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
- ".inst 0xa060c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14]\n"
- "add x12, x12, #0x4\n"
+ ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 23f\n"
@@ -264,25 +264,25 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"cntw x24\n"
"ldr x23, [%x[args], %[offsetof_ldcb]]\n"
"cmp x25, x24\n"
- "mov x12, #0x0\n"
"csel x22, x25, x24, LT\n"
"add x26, x26, x10, LSL #2\n" // C += n
"lsr x21, x22, #0x2\n"
"madd x26, x11, x23, x26\n" // C += m * ldc
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
- ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"add x12, x12, #0x4\n"
- ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
- "add x26, x26, x23\n"
- ".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n"
+ ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"blt 15b\n"
"16:" // Store to output array: Accumulator row 0 oddments
@@ -303,9 +303,9 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"subs x25, x25, x22\n"
"beq 21f\n"
"cmp x25, x24\n"
- "mov x12, #0x0\n"
"csel x20, x25, x24, LT\n"
"lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 19f\n"
"18:" // Store to output array: Accumulator row 1 loop
@@ -313,53 +313,53 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x23\n"
- "add x12, x12, #0x4\n"
".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
"add x26, x26, x23\n"
- "cmp x12, x21, LSL #2\n"
+ "add x12, x12, #0x4\n"
".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
"add x26, x26, x23\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"blt 18b\n"
"19:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
- ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
- ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"beq 20f\n"
- ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n"
"20:" // Store to output array: Accumulator row 1 oddments: End
"21:" // Store to output array: End
"tbz x16, #0, 23f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"22:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 22b\n"
"23:" // End block
"incw x10, ALL, MUL #2\n"
"cmp x10, x9\n"
"blt 3b\n"
"incw x11, ALL, MUL #2\n"
- "mov x10, #0x0\n"
"cmp x11, x13\n"
+ "mov x10, #0x0\n"
"mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
index 6b1dca0e2a..0c29ab9991 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
class cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL;
- StdTransformsSME<lhs_operand_type, result_type, 4, 1, 4> transforms = {};
+ StdTransformsSME<operand_type, result_type, 4, 1, 4> transforms = {};
cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
index 03c19c46f5..932bd6b595 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
@@ -48,6 +48,7 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
C(C), ldcb(ldc * sizeof(int32_t)),
M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
bias(bias),
accumulator_buffer(accumulator_buffer),
@@ -68,11 +69,10 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
const long kstride_bytes;
int32_t *const C;
const long ldcb;
- const long M, N, K;
+ const long M, N, K, n_loops, n_tail_iters;
const int32_t *const bias;
-
int32_t *const accumulator_buffer;
uint64_t flags;
};
@@ -91,17 +91,17 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
- ".inst 0xa041c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa040c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa041c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w13, [%x[args], %[offsetof_M]]\n"
@@ -116,102 +116,102 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "ld1w { z1.s }, p0/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc0902420 // addha za0.s, p1/M, p1/M, z1.s\n"
- ".inst 0xc0902421 // addha za1.s, p1/M, p1/M, z1.s\n"
- ".inst 0xc0902422 // addha za2.s, p1/M, p1/M, z1.s\n"
- ".inst 0xc0902423 // addha za3.s, p1/M, p1/M, z1.s\n"
+ "ldnt1w { z17.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902620 // addha za0.s, p1/M, p1/M, z17.s\n"
+ ".inst 0xc0902621 // addha za1.s, p1/M, p1/M, z17.s\n"
+ ".inst 0xc0902622 // addha za2.s, p1/M, p1/M, z17.s\n"
+ ".inst 0xc0902623 // addha za3.s, p1/M, p1/M, z17.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
"incw x20\n"
"incw x21, ALL, MUL #4\n"
"cmp x20, x9\n"
- "mov x20, x16\n"
"csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
"bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
"cmp x21, x13\n"
"csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x3\n"
"lsr x20, x20, #0x2\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa1408370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27]\n"
- "ld1b { z0.b }, p1/Z, [x23]\n"
- ".inst 0xa041836c // ld1b { z12.b-z15.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "ld1b { z10.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa1428371 // ld1b { z17.b, z21.b, z25.b, z29.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa1438373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
+ "ldnt1b { z12.b }, p1/Z, [x23]\n"
+ ".inst 0xa1418370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1b { z5.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z4.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1b { z7.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z19.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa0802600 // smopa za0.s, p1/M, p1/M, z16.b, z0.b\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa0802681 // smopa za1.s, p1/M, p1/M, z20.b, z0.b\n"
- ".inst 0xa0802702 // smopa za2.s, p1/M, p1/M, z24.b, z0.b\n"
- ".inst 0xa0802783 // smopa za3.s, p1/M, p1/M, z28.b, z0.b\n"
- ".inst 0xa1408370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa08a2580 // smopa za0.s, p1/M, p1/M, z12.b, z10.b\n"
- "ld1b { z0.b }, p1/Z, [x23]\n"
- ".inst 0xa08a25a1 // smopa za1.s, p1/M, p1/M, z13.b, z10.b\n"
- ".inst 0xa08a25c2 // smopa za2.s, p1/M, p1/M, z14.b, z10.b\n"
- ".inst 0xa08a25e3 // smopa za3.s, p1/M, p1/M, z15.b, z10.b\n"
- ".inst 0xa041836c // ld1b { z12.b-z15.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0922620 // smopa za0.s, p1/M, p1/M, z17.b, z18.b\n"
- "ld1b { z10.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa09226a1 // smopa za1.s, p1/M, p1/M, z21.b, z18.b\n"
- ".inst 0xa0922722 // smopa za2.s, p1/M, p1/M, z25.b, z18.b\n"
- ".inst 0xa09227a3 // smopa za3.s, p1/M, p1/M, z29.b, z18.b\n"
- ".inst 0xa1428371 // ld1b { z17.b, z21.b, z25.b, z29.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa0872660 // smopa za0.s, p1/M, p1/M, z19.b, z7.b\n"
- ".inst 0xa08726e1 // smopa za1.s, p1/M, p1/M, z23.b, z7.b\n"
- ".inst 0xa0872762 // smopa za2.s, p1/M, p1/M, z27.b, z7.b\n"
- ".inst 0xa08727e3 // smopa za3.s, p1/M, p1/M, z31.b, z7.b\n"
- ".inst 0xa1438373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa08c2640 // smopa za0.s, p1/M, p1/M, z18.b, z12.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa08c26c1 // smopa za1.s, p1/M, p1/M, z22.b, z12.b\n"
+ ".inst 0xa08c2742 // smopa za2.s, p1/M, p1/M, z26.b, z12.b\n"
+ ".inst 0xa08c27c3 // smopa za3.s, p1/M, p1/M, z30.b, z12.b\n"
+ ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0852600 // smopa za0.s, p1/M, p1/M, z16.b, z5.b\n"
+ "ldnt1b { z12.b }, p1/Z, [x23]\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0852702 // smopa za2.s, p1/M, p1/M, z24.b, z5.b\n"
+ ".inst 0xa0852783 // smopa za3.s, p1/M, p1/M, z28.b, z5.b\n"
+ ".inst 0xa1418370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa0842460 // smopa za0.s, p1/M, p1/M, z3.b, z4.b\n"
+ "ldnt1b { z5.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa08424e1 // smopa za1.s, p1/M, p1/M, z7.b, z4.b\n"
+ ".inst 0xa0842562 // smopa za2.s, p1/M, p1/M, z11.b, z4.b\n"
+ ".inst 0xa08425e3 // smopa za3.s, p1/M, p1/M, z15.b, z4.b\n"
+ ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z4.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa0932440 // smopa za0.s, p1/M, p1/M, z2.b, z19.b\n"
+ ".inst 0xa09324c1 // smopa za1.s, p1/M, p1/M, z6.b, z19.b\n"
+ ".inst 0xa0932542 // smopa za2.s, p1/M, p1/M, z10.b, z19.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1b { z7.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z19.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa0802600 // smopa za0.s, p1/M, p1/M, z16.b, z0.b\n"
- ".inst 0xa0802681 // smopa za1.s, p1/M, p1/M, z20.b, z0.b\n"
- ".inst 0xa0802702 // smopa za2.s, p1/M, p1/M, z24.b, z0.b\n"
- ".inst 0xa0802783 // smopa za3.s, p1/M, p1/M, z28.b, z0.b\n"
- ".inst 0xa08a2580 // smopa za0.s, p1/M, p1/M, z12.b, z10.b\n"
- ".inst 0xa08a25a1 // smopa za1.s, p1/M, p1/M, z13.b, z10.b\n"
- ".inst 0xa08a25c2 // smopa za2.s, p1/M, p1/M, z14.b, z10.b\n"
- ".inst 0xa08a25e3 // smopa za3.s, p1/M, p1/M, z15.b, z10.b\n"
- ".inst 0xa0922620 // smopa za0.s, p1/M, p1/M, z17.b, z18.b\n"
- ".inst 0xa09226a1 // smopa za1.s, p1/M, p1/M, z21.b, z18.b\n"
- ".inst 0xa0922722 // smopa za2.s, p1/M, p1/M, z25.b, z18.b\n"
- ".inst 0xa09227a3 // smopa za3.s, p1/M, p1/M, z29.b, z18.b\n"
- ".inst 0xa0872660 // smopa za0.s, p1/M, p1/M, z19.b, z7.b\n"
- ".inst 0xa08726e1 // smopa za1.s, p1/M, p1/M, z23.b, z7.b\n"
- ".inst 0xa0872762 // smopa za2.s, p1/M, p1/M, z27.b, z7.b\n"
- ".inst 0xa08727e3 // smopa za3.s, p1/M, p1/M, z31.b, z7.b\n"
+ ".inst 0xa08c2640 // smopa za0.s, p1/M, p1/M, z18.b, z12.b\n"
+ ".inst 0xa08c26c1 // smopa za1.s, p1/M, p1/M, z22.b, z12.b\n"
+ ".inst 0xa08c2742 // smopa za2.s, p1/M, p1/M, z26.b, z12.b\n"
+ ".inst 0xa08c27c3 // smopa za3.s, p1/M, p1/M, z30.b, z12.b\n"
+ ".inst 0xa0852600 // smopa za0.s, p1/M, p1/M, z16.b, z5.b\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0852702 // smopa za2.s, p1/M, p1/M, z24.b, z5.b\n"
+ ".inst 0xa0852783 // smopa za3.s, p1/M, p1/M, z28.b, z5.b\n"
+ ".inst 0xa0842460 // smopa za0.s, p1/M, p1/M, z3.b, z4.b\n"
+ ".inst 0xa08424e1 // smopa za1.s, p1/M, p1/M, z7.b, z4.b\n"
+ ".inst 0xa0842562 // smopa za2.s, p1/M, p1/M, z11.b, z4.b\n"
+ ".inst 0xa08425e3 // smopa za3.s, p1/M, p1/M, z15.b, z4.b\n"
+ ".inst 0xa0932440 // smopa za0.s, p1/M, p1/M, z2.b, z19.b\n"
+ ".inst 0xa09324c1 // smopa za1.s, p1/M, p1/M, z6.b, z19.b\n"
+ ".inst 0xa0932542 // smopa za2.s, p1/M, p1/M, z10.b, z19.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
- "subs x20, x20, #0x1\n"
+ ".inst 0xa0408368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
"addvl x27, x27, #4\n"
"ld1b { z15.b }, p1/Z, [x23]\n"
"addvl x23, x23, #1\n"
- ".inst 0xa08f2640 // smopa za0.s, p1/M, p1/M, z18.b, z15.b\n"
- ".inst 0xa08f26c1 // smopa za1.s, p1/M, p1/M, z22.b, z15.b\n"
- ".inst 0xa08f2742 // smopa za2.s, p1/M, p1/M, z26.b, z15.b\n"
- ".inst 0xa08f27c3 // smopa za3.s, p1/M, p1/M, z30.b, z15.b\n"
+ ".inst 0xa08f2500 // smopa za0.s, p1/M, p1/M, z8.b, z15.b\n"
+ ".inst 0xa08f2521 // smopa za1.s, p1/M, p1/M, z9.b, z15.b\n"
+ ".inst 0xa08f2542 // smopa za2.s, p1/M, p1/M, z10.b, z15.b\n"
+ ".inst 0xa08f2563 // smopa za3.s, p1/M, p1/M, z11.b, z15.b\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -219,25 +219,25 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa060c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14]\n"
".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa061c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa060c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 29f\n"
@@ -247,14 +247,14 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"13:" // Store to partial result buffer: Store only: Loop
".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
- "add x12, x12, #0x4\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
"cmp x12, x20\n"
".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 29f\n"
@@ -264,94 +264,94 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"cntw x24\n"
"ldr x23, [%x[args], %[offsetof_ldcb]]\n"
"cmp x25, x24\n"
- "mov x12, #0x0\n"
"csel x22, x25, x24, LT\n"
"add x26, x26, x10, LSL #2\n" // C += n
"lsr x21, x22, #0x2\n"
"madd x26, x11, x23, x26\n" // C += m * ldc
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- "add x12, x12, #0x4\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ "st1w { z8.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z9.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z10.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1w { z15.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z11.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 15b\n"
"16:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 17f\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"17:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
"cmp x25, x24\n"
- "mov x12, #0x0\n"
"csel x22, x25, x24, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 19f\n"
"18:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- "add x12, x12, #0x4\n"
- "st1w { z0.s }, p0, [x26]\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z1.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1w { z2.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1w { z3.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 18b\n"
"19:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ "st1w { z20.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z21.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 20f\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "st1w { z22.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"20:" // Store to output array: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
"beq 27f\n"
"cmp x25, x24\n"
- "mov x12, #0x0\n"
"csel x22, x25, x24, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 22f\n"
"21:" // Store to output array: Accumulator row 2 loop
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- "add x12, x12, #0x4\n"
- "st1w { z8.s }, p0, [x26]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ "st1w { z24.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "cmp x12, x21, LSL #2\n"
- "st1w { z9.s }, p0, [x26]\n"
+ "st1w { z25.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1w { z10.s }, p0, [x26]\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z26.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1w { z11.s }, p0, [x26]\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z27.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 21b\n"
"22:" // Store to output array: Accumulator row 2 oddments
@@ -371,21 +371,21 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"subs x25, x25, x22\n"
"beq 27f\n"
"cmp x25, x24\n"
- "mov x12, #0x0\n"
"csel x20, x25, x24, LT\n"
"lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 25f\n"
"24:" // Store to output array: Accumulator row 3 loop
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- "add x12, x12, #0x4\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "cmp x12, x21, LSL #2\n"
"st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x23\n"
+ "add x12, x12, #0x4\n"
"st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x23\n"
+ "cmp x12, x21, LSL #2\n"
"st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 24b\n"
@@ -407,25 +407,25 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"28:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x15, x15, #16\n"
".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 28b\n"
"29:" // End block
"incw x10\n"
"cmp x10, x9\n"
"blt 3b\n"
"incw x11, ALL, MUL #4\n"
- "mov x10, #0x0\n"
"cmp x11, x13\n"
+ "mov x10, #0x0\n"
"mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
index f384e0f491..f540d3fa24 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
class cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL
{
public:
- typedef uint8_t lhs_operand_type;
- typedef uint8_t rhs_operand_type;
+ typedef uint8_t operand_type;
typedef uint8_t result_type;
typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
@@ -61,7 +60,7 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
+ return false;
}
static constexpr bool supports_bias()
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_1VLx4VL;
- StdTransformsSME<lhs_operand_type, result_type, 1, 4, 4, true> transforms = {};
+ StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
index 0482a5ea0f..0a468e0ff7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
@@ -49,7 +49,7 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
C(C), ldcb(ldc * sizeof(uint8_t)),
M(M), N(N), K(K),
- min(0), max(0),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
bias(bias), n_0(n_0),
accumulator_buffer(accumulator_buffer),
@@ -74,14 +74,13 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
const long kstride_bytes;
uint8_t *const C;
const long ldcb;
- const long M, N, K;
- int32_t min;
- int32_t max;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
const int32_t *const bias;
const int n_0;
-
int32_t *const accumulator_buffer;
uint64_t flags;
};
@@ -90,131 +89,131 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ "ldr x14, [%x[args], %[offsetof_flags]]\n"
".inst 0xd503477f // SMSTART ZA\n"
"ptrue p1.b\n"
".inst 0x25207811 // ptrue pn9.b\n"
- "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
"ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
- "tbz x15, #0, 2f\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x14, #0, 2f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14]\n"
- ".inst 0xa041c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xa042c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x13, x13, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
- "ldr w11, [%x[args], %[offsetof_M]]\n"
- "mov x10, #0x0\n"
+ "ldr w10, [%x[args], %[offsetof_M]]\n"
"mov x9, #0x0\n"
- "ldr w28, [%x[args], %[offsetof_N]]\n"
- "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "mov x28, #0x0\n"
+ "ldr w27, [%x[args], %[offsetof_N]]\n"
+ "ldr x26, [%x[args], %[offsetof_A]]\n"
"3:" // M and N loop
- "mov x26, x27\n"
- ".inst 0x25bc6530 // whilelt pn8.s, x9, x28, VLx4\n"
- "tbnz x15, #0, 4f\n"
+ "mov x25, x26\n"
+ ".inst 0x25bb6790 // whilelt pn8.s, x28, x27, VLx4\n"
+ "tbnz x14, #0, 4f\n"
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa009c290 // ld1w { z16.s-z19.s }, p8/Z, [x20, x9, LSL #2]\n"
- ".inst 0xc0902600 // addha za0.s, p1/M, p1/M, z16.s\n"
- ".inst 0xc0902621 // addha za1.s, p1/M, p1/M, z17.s\n"
- ".inst 0xc0902642 // addha za2.s, p1/M, p1/M, z18.s\n"
- ".inst 0xc0902663 // addha za3.s, p1/M, p1/M, z19.s\n"
+ ".inst 0xa11cc289 // ldnt1w { z1.s, z5.s, z9.s, z13.s }, p8/Z, [x20, x28, LSL #2]\n"
+ ".inst 0xc0902420 // addha za0.s, p1/M, p1/M, z1.s\n"
+ ".inst 0xc09024a1 // addha za1.s, p1/M, p1/M, z5.s\n"
+ ".inst 0xc0902522 // addha za2.s, p1/M, p1/M, z9.s\n"
+ ".inst 0xc09025a3 // addha za3.s, p1/M, p1/M, z13.s\n"
"4:" // Prepare accumulators: Test for last block
- "mov x20, x9\n"
- "mov x21, x10\n"
+ "mov x20, x28\n"
+ "mov x21, x9\n"
"incw x20, ALL, MUL #4\n"
"incw x21\n"
- "cmp x20, x28\n"
- "mov x20, x15\n"
- "csel x21, x10, x21, LT\n"
- "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
- "cmp x21, x11\n"
- "csel x15, x20, x15, LT\n"
+ "cmp x20, x27\n"
+ "csel x21, x9, x21, LT\n"
+ "mov x20, x14\n"
+ "bfm x14, XZR, #0x0, #0x0 // bfc x14, #0x0, #0x1\n"
+ "cmp x21, x10\n"
+ "csel x14, x20, x14, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x3\n"
"lsr x20, x20, #0x2\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x9, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- "ld1b { z5.b }, p1/Z, [x26]\n"
- ".inst 0xa14086e0 // ld1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23]\n"
- "ld1b { z31.b }, p1/Z, [x26, #1, MUL VL]\n"
- ".inst 0xa14186f2 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- "ld1b { z1.b }, p1/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa14286f0 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- "ld1b { z6.b }, p1/Z, [x26, #3, MUL VL]\n"
- "addvl x26, x26, #4\n"
- ".inst 0xa14386e3 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x28, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa1a024a0 // umopa za0.s, p1/M, p1/M, z5.b, z0.b\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa1a424a1 // umopa za1.s, p1/M, p1/M, z5.b, z4.b\n"
- ".inst 0xa1a824a2 // umopa za2.s, p1/M, p1/M, z5.b, z8.b\n"
- ".inst 0xa1ac24a3 // umopa za3.s, p1/M, p1/M, z5.b, z12.b\n"
- "ld1b { z5.b }, p1/Z, [x26]\n"
- ".inst 0xa1b227e0 // umopa za0.s, p1/M, p1/M, z31.b, z18.b\n"
- ".inst 0xa14086e0 // ld1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa1b627e1 // umopa za1.s, p1/M, p1/M, z31.b, z22.b\n"
- ".inst 0xa1ba27e2 // umopa za2.s, p1/M, p1/M, z31.b, z26.b\n"
- ".inst 0xa1be27e3 // umopa za3.s, p1/M, p1/M, z31.b, z30.b\n"
- "ld1b { z31.b }, p1/Z, [x26, #1, MUL VL]\n"
- ".inst 0xa1b02420 // umopa za0.s, p1/M, p1/M, z1.b, z16.b\n"
- ".inst 0xa14186f2 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa1b42421 // umopa za1.s, p1/M, p1/M, z1.b, z20.b\n"
- ".inst 0xa1b82422 // umopa za2.s, p1/M, p1/M, z1.b, z24.b\n"
- ".inst 0xa1bc2423 // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
- "ld1b { z1.b }, p1/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa14286f0 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- ".inst 0xa1a324c0 // umopa za0.s, p1/M, p1/M, z6.b, z3.b\n"
- ".inst 0xa1a724c1 // umopa za1.s, p1/M, p1/M, z6.b, z7.b\n"
- ".inst 0xa1ab24c2 // umopa za2.s, p1/M, p1/M, z6.b, z11.b\n"
- ".inst 0xa1af24c3 // umopa za3.s, p1/M, p1/M, z6.b, z15.b\n"
- "ld1b { z6.b }, p1/Z, [x26, #3, MUL VL]\n"
- "addvl x26, x26, #4\n"
- ".inst 0xa14386e3 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa1a42680 // umopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1a52681 // umopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa1a62682 // umopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa1a72683 // umopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa1b82560 // umopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1b92561 // umopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1ba2562 // umopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa1bb2563 // umopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa1bc2440 // umopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1bd2441 // umopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa1be2442 // umopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa1bf2443 // umopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xa1b025c0 // umopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa1b125c1 // umopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa1b225c2 // umopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa1b325c3 // umopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa1a024a0 // umopa za0.s, p1/M, p1/M, z5.b, z0.b\n"
- ".inst 0xa1a424a1 // umopa za1.s, p1/M, p1/M, z5.b, z4.b\n"
- ".inst 0xa1a824a2 // umopa za2.s, p1/M, p1/M, z5.b, z8.b\n"
- ".inst 0xa1ac24a3 // umopa za3.s, p1/M, p1/M, z5.b, z12.b\n"
- ".inst 0xa1b227e0 // umopa za0.s, p1/M, p1/M, z31.b, z18.b\n"
- ".inst 0xa1b627e1 // umopa za1.s, p1/M, p1/M, z31.b, z22.b\n"
- ".inst 0xa1ba27e2 // umopa za2.s, p1/M, p1/M, z31.b, z26.b\n"
- ".inst 0xa1be27e3 // umopa za3.s, p1/M, p1/M, z31.b, z30.b\n"
- ".inst 0xa1b02420 // umopa za0.s, p1/M, p1/M, z1.b, z16.b\n"
- ".inst 0xa1b42421 // umopa za1.s, p1/M, p1/M, z1.b, z20.b\n"
- ".inst 0xa1b82422 // umopa za2.s, p1/M, p1/M, z1.b, z24.b\n"
- ".inst 0xa1bc2423 // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
- ".inst 0xa1a324c0 // umopa za0.s, p1/M, p1/M, z6.b, z3.b\n"
- ".inst 0xa1a724c1 // umopa za1.s, p1/M, p1/M, z6.b, z7.b\n"
- ".inst 0xa1ab24c2 // umopa za2.s, p1/M, p1/M, z6.b, z11.b\n"
- ".inst 0xa1af24c3 // umopa za3.s, p1/M, p1/M, z6.b, z15.b\n"
+ ".inst 0xa1a42680 // umopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+ ".inst 0xa1a52681 // umopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa1a62682 // umopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa1a72683 // umopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ ".inst 0xa1b82560 // umopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa1b92561 // umopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1ba2562 // umopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa1bb2563 // umopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ ".inst 0xa1bc2440 // umopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa1bd2441 // umopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa1be2442 // umopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa1bf2443 // umopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ ".inst 0xa1b025c0 // umopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa1b125c1 // umopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa1b225c2 // umopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa1b325c3 // umopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
- "ld1b { z16.b }, p1/Z, [x26]\n"
- "subs x20, x20, #0x1\n"
- "addvl x26, x26, #1\n"
+ "ld1b { z16.b }, p1/Z, [x25]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x25, x25, #1\n"
".inst 0xa04086e4 // ld1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #4\n"
".inst 0xa1a42600 // umopa za0.s, p1/M, p1/M, z16.b, z4.b\n"
@@ -223,182 +222,182 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
".inst 0xa1a72603 // umopa za3.s, p1/M, p1/M, z16.b, z7.b\n"
"bgt 9b\n"
"10:" // K oddments: End
- "ld1w { z15.s }, p1/Z, [x26]\n"
- "addvl x26, x26, #1\n"
+ "ld1w { z15.s }, p1/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
".inst 0xc09125e0 // addva za0.s, p1/M, p1/M, z15.s\n"
".inst 0xc09125e1 // addva za1.s, p1/M, p1/M, z15.s\n"
".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
- "tbz x15, #1, 14f\n"
- "tbz x15, #0, 12f\n"
+ "tbz x14, #1, 14f\n"
+ "tbz x14, #0, 12f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n"
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa040c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
- ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa060c5bc // st1w { z28.s-z31.s }, pn9.b, [x13]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa061c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
"addvl x13, x13, #16\n"
+ ".inst 0xa061c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa062c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
"blt 11b\n"
"b 21f\n"
"12:" // Store to partial result buffer: Store only
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa060c564 // st1w { z4.s-z7.s }, pn9.b, [x11]\n"
".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
- ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
- "addvl x13, x13, #16\n"
+ ".inst 0xa062c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
"blt 13b\n"
"b 21f\n"
"14:" // Store to output array
- "ldr x25, [%x[args], %[offsetof_C]]\n"
- "sub x24, x11, x10\n"
+ "ldr x24, [%x[args], %[offsetof_C]]\n"
+ "add x24, x24, x28\n" // C += n
+ "sub x23, x10, x9\n"
"ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x24, x9, x22, x24\n" // C += m * ldc
"ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "add x25, x25, x9\n" // C += n
"ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "madd x25, x10, x23, x25\n" // C += m * ldc
- "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
"ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
- "tbz x15, #2, 15f\n"
- "ldr w22, [%x[args], %[offsetof_n_0]]\n"
- "ldr x21, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "tbz x14, #2, 15f\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x28\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "add x22, x22, x9\n"
- "add x21, x21, x22, LSL #2\n"
- "add x20, x20, x22, LSL #2\n"
- ".inst 0xa040c2a4 // ld1w { z4.s-z7.s }, p8/Z, [x21]\n"
- ".inst 0xa040c280 // ld1w { z0.s-z3.s }, p8/Z, [x20]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x20\n"
- "whilelt p0.b, x9, x28\n"
- "cmp x24, x20\n"
- "mov x12, #0x0\n"
- "csel x20, x24, x20, LT\n"
+ "whilelt p0.b, x28, x27\n"
+ "cmp x23, x20\n"
+ "csel x20, x23, x20, LT\n"
"lsr x21, x20, #0x1\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x1\n"
"cbz x21, 17f\n"
"16:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860010 // mova { z16.s-z17.s }, za0h.s[x12, 0:1]\n"
- ".inst 0xc086005e // mova { z30.s-z31.s }, za1h.s[x12, 0:1]\n"
- ".inst 0xc086009a // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n"
- ".inst 0xc08600cc // mova { z12.s-z13.s }, za3h.s[x12, 0:1]\n"
- ".inst 0xc1a4a410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z4.s\n"
- ".inst 0xc1a5a41e // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z5.s\n"
+ ".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1a4a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
+ ".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1a5a41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+ ".inst 0xc1a6a416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
"add x12, x12, #0x2\n"
- ".inst 0xc1a6a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n"
"cmp x12, x21, LSL #1\n"
- ".inst 0xc1a7a40c // sqdmulh { z12.s-z13.s }, { z12.s-z13.s }, z7.s\n"
- ".inst 0xc1a0a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n"
- ".inst 0xc1a1a23e // srshl { z30.s-z31.s }, { z30.s-z31.s }, z1.s\n"
- ".inst 0xc1a2a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z2.s\n"
- ".inst 0xc1a3a22c // srshl { z12.s-z13.s }, { z12.s-z13.s }, z3.s\n"
- ".inst 0xc1a8a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z8.s\n"
- ".inst 0xc1a8a31e // add { z30.s-z31.s }, { z30.s-z31.s }, z8.s\n"
- ".inst 0xc1a8a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z8.s\n"
- ".inst 0xc1a8a30c // add { z12.s-z13.s }, { z12.s-z13.s }, z8.s\n"
- ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
- ".inst 0xc1b4c6be // sclamp { z30.s-z31.s }, z21.s, z20.s\n"
+ ".inst 0xc1a7a410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+ ".inst 0xc1aca23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc1ada23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+ ".inst 0xc1aea236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ ".inst 0xc1afa230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z0.s\n"
+ ".inst 0xc1a0a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z0.s\n"
+ ".inst 0xc1a0a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n"
".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
- ".inst 0xc1b4c6ac // sclamp { z12.s-z13.s }, z21.s, z20.s\n"
- "uzp1 z19.b, z16.b, z30.b\n"
- "uzp1 z18.b, z17.b, z31.b\n"
- "uzp1 z17.b, z26.b, z12.b\n"
- "uzp1 z16.b, z27.b, z13.b\n"
- "uzp1 z17.b, z19.b, z17.b\n"
- "uzp1 z16.b, z18.b, z16.b\n"
- "st1b { z17.b }, p0, [x25]\n"
- "add x25, x25, x23\n"
- "st1b { z16.b }, p0, [x25]\n"
- "add x25, x25, x23\n"
+ ".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
+ "uzp1 z19.b, z26.b, z28.b\n"
+ ".inst 0xc1b4c6b6 // sclamp { z22.s-z23.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z22.b, z16.b\n"
+ "uzp1 z18.b, z27.b, z29.b\n"
+ "uzp1 z17.b, z23.b, z17.b\n"
+ "uzp1 z16.b, z19.b, z16.b\n"
+ "st1b { z16.b }, p0, [x24]\n"
+ "add x24, x24, x22\n"
+ "uzp1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p0, [x24]\n"
+ "add x24, x24, x22\n"
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
".inst 0xc086000a // mova { z10.s-z11.s }, za0h.s[x12, 0:1]\n"
- ".inst 0xc086005a // mova { z26.s-z27.s }, za1h.s[x12, 0:1]\n"
- ".inst 0xc086008e // mova { z14.s-z15.s }, za2h.s[x12, 0:1]\n"
- ".inst 0xc08600d6 // mova { z22.s-z23.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
".inst 0xc1a4a40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z4.s\n"
- ".inst 0xc1a5a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z5.s\n"
- ".inst 0xc1a6a40e // sqdmulh { z14.s-z15.s }, { z14.s-z15.s }, z6.s\n"
- ".inst 0xc1a7a416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z7.s\n"
- ".inst 0xc1a0a22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n"
- ".inst 0xc1a1a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n"
- ".inst 0xc1a2a22e // srshl { z14.s-z15.s }, { z14.s-z15.s }, z2.s\n"
- ".inst 0xc1a3a236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z3.s\n"
- ".inst 0xc1a8a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z8.s\n"
- ".inst 0xc1a8a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z8.s\n"
- ".inst 0xc1a8a30e // add { z14.s-z15.s }, { z14.s-z15.s }, z8.s\n"
- ".inst 0xc1a8a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z8.s\n"
+ ".inst 0xc086009a // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600de // mova { z30.s-z31.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1a5a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+ ".inst 0xc1a6a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n"
+ ".inst 0xc1a7a41e // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z7.s\n"
+ ".inst 0xc1aca22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z12.s\n"
+ ".inst 0xc1ada238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+ ".inst 0xc1aea23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z14.s\n"
+ ".inst 0xc1afa23e // srshl { z30.s-z31.s }, { z30.s-z31.s }, z15.s\n"
+ ".inst 0xc1a0a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n"
+ ".inst 0xc1a0a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31e // add { z30.s-z31.s }, { z30.s-z31.s }, z0.s\n"
".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+ "uzp1 z17.b, z10.b, z24.b\n"
".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
- ".inst 0xc1b4c6ae // sclamp { z14.s-z15.s }, z21.s, z20.s\n"
- ".inst 0xc1b4c6b6 // sclamp { z22.s-z23.s }, z21.s, z20.s\n"
- "uzp1 z17.b, z10.b, z26.b\n"
- "uzp1 z16.b, z14.b, z22.b\n"
+ ".inst 0xc1b4c6be // sclamp { z30.s-z31.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z26.b, z30.b\n"
"uzp1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [x25]\n"
+ "st1b { z16.b }, p0, [x24]\n"
"18:" // Store to output array: Accumulator row 0 oddments: End
"19:" // Store to output array: End
- "tbz x15, #0, 21f\n"
+ "tbz x14, #0, 21f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"20:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14]\n"
- ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xa042c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xa043c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x14, x14, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x13, x13, #16\n"
"blt 20b\n"
"21:" // End block
- "incw x9, ALL, MUL #4\n"
- "cmp x9, x28\n"
+ "incw x28, ALL, MUL #4\n"
+ "cmp x28, x27\n"
"blt 3b\n"
- "incw x10\n"
- "mov x9, #0x0\n"
- "cmp x10, x11\n"
- "mov x27, x26\n"
+ "incw x9\n"
+ "cmp x9, x10\n"
+ "mov x28, #0x0\n"
+ "mov x26, x25\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
index a2621e85f4..8f5880bcea 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
class cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL
{
public:
- typedef uint8_t lhs_operand_type;
- typedef uint8_t rhs_operand_type;
+ typedef uint8_t operand_type;
typedef uint8_t result_type;
typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
@@ -61,7 +60,7 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
+ return false;
}
static constexpr bool supports_bias()
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_2VLx2VL;
- StdTransformsSME<lhs_operand_type, result_type, 2, 2, 4, true> transforms = {};
+ StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
index 51b93d3636..8e8524a780 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
@@ -49,7 +49,7 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
C(C), ldcb(ldc * sizeof(uint8_t)),
M(M), N(N), K(K),
- min(0), max(0),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
bias(bias), n_0(n_0),
accumulator_buffer(accumulator_buffer),
@@ -74,14 +74,13 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
const long kstride_bytes;
uint8_t *const C;
const long ldcb;
- const long M, N, K;
- int32_t min;
- int32_t max;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
const int32_t *const bias;
const int n_0;
-
int32_t *const accumulator_buffer;
uint64_t flags;
};
@@ -100,17 +99,17 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w13, [%x[args], %[offsetof_M]]\n"
@@ -125,108 +124,108 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa10a4294 // ld1w { z20.s, z28.s }, p8/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc0902680 // addha za0.s, p1/M, p1/M, z20.s\n"
- ".inst 0xc0902781 // addha za1.s, p1/M, p1/M, z28.s\n"
- ".inst 0xc0902682 // addha za2.s, p1/M, p1/M, z20.s\n"
- ".inst 0xc0902783 // addha za3.s, p1/M, p1/M, z28.s\n"
+ ".inst 0xa00a4299 // ldnt1w { z24.s-z25.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
+ ".inst 0xc0902702 // addha za2.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902723 // addha za3.s, p1/M, p1/M, z25.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
"incw x20, ALL, MUL #2\n"
"incw x21, ALL, MUL #2\n"
"cmp x20, x9\n"
- "mov x20, x16\n"
"csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
"bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
"cmp x21, x13\n"
"csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x3\n"
"lsr x20, x20, #0x2\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa0400778 // ld1b { z24.b-z25.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa14006f7 // ld1b { z23.b, z31.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa1410776 // ld1b { z22.b, z30.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa04106fa // ld1b { z26.b-z27.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa0420766 // ld1b { z6.b-z7.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14206e0 // ld1b { z0.b, z8.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa043077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa04306ec // ld1b { z12.b-z13.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa1b72700 // umopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa1bf2701 // umopa za1.s, p1/M, p1/M, z24.b, z31.b\n"
- ".inst 0xa1b72722 // umopa za2.s, p1/M, p1/M, z25.b, z23.b\n"
- ".inst 0xa1bf2723 // umopa za3.s, p1/M, p1/M, z25.b, z31.b\n"
- ".inst 0xa0400778 // ld1b { z24.b-z25.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa1ba26c0 // umopa za0.s, p1/M, p1/M, z22.b, z26.b\n"
- ".inst 0xa14006f7 // ld1b { z23.b, z31.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa1bb26c1 // umopa za1.s, p1/M, p1/M, z22.b, z27.b\n"
- ".inst 0xa1ba27c2 // umopa za2.s, p1/M, p1/M, z30.b, z26.b\n"
- ".inst 0xa1bb27c3 // umopa za3.s, p1/M, p1/M, z30.b, z27.b\n"
- ".inst 0xa1410776 // ld1b { z22.b, z30.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa1a024c0 // umopa za0.s, p1/M, p1/M, z6.b, z0.b\n"
- ".inst 0xa04106fa // ld1b { z26.b-z27.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa1a824c1 // umopa za1.s, p1/M, p1/M, z6.b, z8.b\n"
- ".inst 0xa1a024e2 // umopa za2.s, p1/M, p1/M, z7.b, z0.b\n"
- ".inst 0xa1a824e3 // umopa za3.s, p1/M, p1/M, z7.b, z8.b\n"
- ".inst 0xa0420766 // ld1b { z6.b-z7.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14206e0 // ld1b { z0.b, z8.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa1ac2780 // umopa za0.s, p1/M, p1/M, z28.b, z12.b\n"
- ".inst 0xa1ad2781 // umopa za1.s, p1/M, p1/M, z28.b, z13.b\n"
- ".inst 0xa1ac27a2 // umopa za2.s, p1/M, p1/M, z29.b, z12.b\n"
- ".inst 0xa1ad27a3 // umopa za3.s, p1/M, p1/M, z29.b, z13.b\n"
- ".inst 0xa043077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0xa1b12460 // umopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1b92461 // umopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa1b12562 // umopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa1b92563 // umopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa1b62680 // umopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1b72681 // umopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa1b62782 // umopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa1b72783 // umopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa1b026a0 // umopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa1b826a1 // umopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa1b027a2 // umopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa1b827a3 // umopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1a724a0 // umopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa1af24a1 // umopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa1a725a2 // umopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa1af25a3 // umopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa04306ec // ld1b { z12.b-z13.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa1b72700 // umopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
- ".inst 0xa1bf2701 // umopa za1.s, p1/M, p1/M, z24.b, z31.b\n"
- ".inst 0xa1b72722 // umopa za2.s, p1/M, p1/M, z25.b, z23.b\n"
- ".inst 0xa1bf2723 // umopa za3.s, p1/M, p1/M, z25.b, z31.b\n"
- ".inst 0xa1ba26c0 // umopa za0.s, p1/M, p1/M, z22.b, z26.b\n"
- ".inst 0xa1bb26c1 // umopa za1.s, p1/M, p1/M, z22.b, z27.b\n"
- ".inst 0xa1ba27c2 // umopa za2.s, p1/M, p1/M, z30.b, z26.b\n"
- ".inst 0xa1bb27c3 // umopa za3.s, p1/M, p1/M, z30.b, z27.b\n"
- ".inst 0xa1a024c0 // umopa za0.s, p1/M, p1/M, z6.b, z0.b\n"
- ".inst 0xa1a824c1 // umopa za1.s, p1/M, p1/M, z6.b, z8.b\n"
- ".inst 0xa1a024e2 // umopa za2.s, p1/M, p1/M, z7.b, z0.b\n"
- ".inst 0xa1a824e3 // umopa za3.s, p1/M, p1/M, z7.b, z8.b\n"
- ".inst 0xa1ac2780 // umopa za0.s, p1/M, p1/M, z28.b, z12.b\n"
- ".inst 0xa1ad2781 // umopa za1.s, p1/M, p1/M, z28.b, z13.b\n"
- ".inst 0xa1ac27a2 // umopa za2.s, p1/M, p1/M, z29.b, z12.b\n"
- ".inst 0xa1ad27a3 // umopa za3.s, p1/M, p1/M, z29.b, z13.b\n"
+ ".inst 0xa1b12460 // umopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+ ".inst 0xa1b92461 // umopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa1b12562 // umopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa1b92563 // umopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1b62680 // umopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa1b72681 // umopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa1b62782 // umopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa1b72783 // umopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa1b026a0 // umopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa1b826a1 // umopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa1b027a2 // umopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa1b827a3 // umopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa1a724a0 // umopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa1af24a1 // umopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa1a725a2 // umopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa1af25a3 // umopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa0400762 // ld1b { z2.b-z3.b }, pn9.b/Z, [x27]\n"
- "subs x20, x20, #0x1\n"
+ ".inst 0xa1400773 // ld1b { z19.b, z27.b }, pn9.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
"addvl x27, x27, #2\n"
".inst 0xa04006f0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #2\n"
- ".inst 0xa1b02440 // umopa za0.s, p1/M, p1/M, z2.b, z16.b\n"
- ".inst 0xa1b12441 // umopa za1.s, p1/M, p1/M, z2.b, z17.b\n"
- ".inst 0xa1b02462 // umopa za2.s, p1/M, p1/M, z3.b, z16.b\n"
- ".inst 0xa1b12463 // umopa za3.s, p1/M, p1/M, z3.b, z17.b\n"
+ ".inst 0xa1b02660 // umopa za0.s, p1/M, p1/M, z19.b, z16.b\n"
+ ".inst 0xa1b12661 // umopa za1.s, p1/M, p1/M, z19.b, z17.b\n"
+ ".inst 0xa1b02762 // umopa za2.s, p1/M, p1/M, z27.b, z16.b\n"
+ ".inst 0xa1b12763 // umopa za3.s, p1/M, p1/M, z27.b, z17.b\n"
"bgt 9b\n"
"10:" // K oddments: End
- ".inst 0xa1404767 // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040476e // ld1w { z14.s-z15.s }, pn9.b/Z, [x27]\n"
"addvl x27, x27, #2\n"
- ".inst 0xc09124e0 // addva za0.s, p1/M, p1/M, z7.s\n"
- ".inst 0xc09124e1 // addva za1.s, p1/M, p1/M, z7.s\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
"tbz x16, #1, 14f\n"
@@ -234,25 +233,25 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
- ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa060c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 24f\n"
@@ -260,71 +259,71 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c5dc // st1w { z28.s-z31.s }, pn9.b, [x14]\n"
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 24f\n"
"14:" // Store to output array
"ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10\n" // C += n
"sub x25, x13, x11\n"
- "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
- "ld1rw { z9.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z10.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "add x26, x26, x10\n" // C += n
- "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"madd x26, x11, x24, x26\n" // C += m * ldc
- "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
"tbz x16, #2, 15f\n"
- "ldr w22, [%x[args], %[offsetof_n_0]]\n"
- "ldr x21, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x10\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "add x22, x22, x10\n"
- "add x21, x21, x22, LSL #2\n"
- "add x20, x20, x22, LSL #2\n"
- ".inst 0xa04042a8 // ld1w { z8.s-z9.s }, p8/Z, [x21]\n"
- ".inst 0xa040428a // ld1w { z10.s-z11.s }, p8/Z, [x20]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x23\n"
"whilelt p0.h, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 17f\n"
"16:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a9ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1aaaa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
- ".inst 0xc1abaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z11.s\n"
- ".inst 0xc1afab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z15.s\n"
- ".inst 0xc1afab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z15.s\n"
- ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cea0 // sclamp { z0.s-z3.s }, z21.s, z20.s\n"
- "uzp1 z19.h, z28.h, z0.h\n"
- "uzp1 z18.h, z29.h, z1.h\n"
- "uzp1 z17.h, z30.h, z2.h\n"
- "uzp1 z16.h, z31.h, z3.h\n"
- "st1b { z19.h }, p0, [x26]\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z8.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z18.h }, p0, [x26]\n"
+ "uzp1 z16.h, z5.h, z9.h\n"
+ "uzp1 z17.h, z6.h, z10.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "uzp1 z16.h, z7.h, z11.h\n"
"st1b { z17.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1b { z16.h }, p0, [x26]\n"
@@ -332,59 +331,60 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
- ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
- ".inst 0xc1a8ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a9ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z9.s\n"
- ".inst 0xc1aaaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc1abaa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
- ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
- ".inst 0xc1afab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z15.s\n"
- ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
- "uzp1 z16.h, z4.h, z28.h\n"
+ ".inst 0xc1a2aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc1a3aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z8.h, z4.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
"subs x20, x20, #0x1\n"
- "uzp1 z16.h, z5.h, z29.h\n"
+ "uzp1 z16.h, z9.h, z5.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
- "uzp1 z16.h, z6.h, z30.h\n"
+ "uzp1 z16.h, z10.h, z6.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"18:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 22f\n"
+ "whilelt p0.h, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x20, x25, x23, LT\n"
"lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
- ".inst 0xc1a8ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a9ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z9.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1aaaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z10.s\n"
- ".inst 0xc1abaa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
- ".inst 0xc1afab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z15.s\n"
- ".inst 0xc1afab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z15.s\n"
- ".inst 0xc1b4cea0 // sclamp { z0.s-z3.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
- "uzp1 z19.h, z0.h, z28.h\n"
- "uzp1 z18.h, z1.h, z29.h\n"
- "uzp1 z17.h, z2.h, z30.h\n"
- "uzp1 z16.h, z3.h, z31.h\n"
- "st1b { z19.h }, p0, [x26]\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z20.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z18.h }, p0, [x26]\n"
+ "uzp1 z16.h, z5.h, z21.h\n"
+ "uzp1 z17.h, z6.h, z22.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
+ "uzp1 z16.h, z7.h, z23.h\n"
"st1b { z17.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1b { z16.h }, p0, [x26]\n"
@@ -394,15 +394,15 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"cbz x20, 21f\n"
".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xc1a8ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a9ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- ".inst 0xc1aaaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc1abaa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
- ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
- ".inst 0xc1afab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z15.s\n"
- ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
- ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
"uzp1 z16.h, z4.h, z16.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
@@ -420,25 +420,25 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"23:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
- ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 23b\n"
"24:" // End block
"incw x10, ALL, MUL #2\n"
"cmp x10, x9\n"
"blt 3b\n"
"incw x11, ALL, MUL #2\n"
- "mov x10, #0x0\n"
"cmp x11, x13\n"
+ "mov x10, #0x0\n"
"mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
index dbf62cbb8a..0665468517 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
@@ -37,8 +37,7 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
class cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL
{
public:
- typedef uint8_t lhs_operand_type;
- typedef uint8_t rhs_operand_type;
+ typedef uint8_t operand_type;
typedef uint8_t result_type;
typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
@@ -61,7 +60,7 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
+ return false;
}
static constexpr bool supports_bias()
@@ -82,7 +81,7 @@ public:
// Default to the generic kernel
kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_4VLx1VL;
- StdTransformsSME<lhs_operand_type, result_type, 4, 1, 4, true> transforms = {};
+ StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const CPUInfo *)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
index 775a3bf3d2..2239b3f1be 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
@@ -49,7 +49,7 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
C(C), ldcb(ldc * sizeof(uint8_t)),
M(M), N(N), K(K),
- min(0), max(0),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
bias(bias), n_0(n_0),
accumulator_buffer(accumulator_buffer),
@@ -74,14 +74,13 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
const long kstride_bytes;
uint8_t *const C;
const long ldcb;
- const long M, N, K;
- int32_t min;
- int32_t max;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
const int32_t *const bias;
const int n_0;
-
int32_t *const accumulator_buffer;
uint64_t flags;
};
@@ -100,17 +99,17 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15]\n"
- ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 1b\n"
"2:" // Initial accumulator load from buffer: End
"ldr w13, [%x[args], %[offsetof_M]]\n"
@@ -125,95 +124,95 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "ld1w { z6.s }, p0/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc09024c0 // addha za0.s, p1/M, p1/M, z6.s\n"
- ".inst 0xc09024c1 // addha za1.s, p1/M, p1/M, z6.s\n"
- ".inst 0xc09024c2 // addha za2.s, p1/M, p1/M, z6.s\n"
- ".inst 0xc09024c3 // addha za3.s, p1/M, p1/M, z6.s\n"
+ "ldnt1w { z8.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902500 // addha za0.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902501 // addha za1.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902502 // addha za2.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902503 // addha za3.s, p1/M, p1/M, z8.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
"incw x20\n"
"incw x21, ALL, MUL #4\n"
"cmp x20, x9\n"
- "mov x20, x16\n"
"csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
"bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
"cmp x21, x13\n"
"csel x16, x20, x16, LT\n"
"5:" // Prepare accumulators: End
"ldr x20, [%x[args], %[offsetof_K]]\n"
- "ldr x23, [%x[args], %[offsetof_B]]\n"
- "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
"add x20, x20, #0x3\n"
"lsr x20, x20, #0x2\n"
- "lsr x21, x20, #0x2\n"
- "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
- "and x20, x20, #0x3\n"
- "cbz x21, 8f\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa1408360 // ld1b { z0.b, z4.b, z8.b, z12.b }, pn8.b/Z, [x27]\n"
- "ld1b { z29.b }, p1/Z, [x23]\n"
- ".inst 0xa1418361 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "ld1b { z19.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa0438378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1b { z31.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa1bd2400 // umopa za0.s, p1/M, p1/M, z0.b, z29.b\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa1bd2481 // umopa za1.s, p1/M, p1/M, z4.b, z29.b\n"
- ".inst 0xa1bd2502 // umopa za2.s, p1/M, p1/M, z8.b, z29.b\n"
- ".inst 0xa1bd2583 // umopa za3.s, p1/M, p1/M, z12.b, z29.b\n"
- ".inst 0xa1408360 // ld1b { z0.b, z4.b, z8.b, z12.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa1b32420 // umopa za0.s, p1/M, p1/M, z1.b, z19.b\n"
- "ld1b { z29.b }, p1/Z, [x23]\n"
- ".inst 0xa1b324a1 // umopa za1.s, p1/M, p1/M, z5.b, z19.b\n"
- ".inst 0xa1b32522 // umopa za2.s, p1/M, p1/M, z9.b, z19.b\n"
- ".inst 0xa1b325a3 // umopa za3.s, p1/M, p1/M, z13.b, z19.b\n"
- ".inst 0xa1418361 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa1b42460 // umopa za0.s, p1/M, p1/M, z3.b, z20.b\n"
- "ld1b { z19.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa1b424e1 // umopa za1.s, p1/M, p1/M, z7.b, z20.b\n"
- ".inst 0xa1b42562 // umopa za2.s, p1/M, p1/M, z11.b, z20.b\n"
- ".inst 0xa1b425e3 // umopa za3.s, p1/M, p1/M, z15.b, z20.b\n"
- ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa1bf2700 // umopa za0.s, p1/M, p1/M, z24.b, z31.b\n"
- ".inst 0xa1bf2721 // umopa za1.s, p1/M, p1/M, z25.b, z31.b\n"
- ".inst 0xa1bf2742 // umopa za2.s, p1/M, p1/M, z26.b, z31.b\n"
- ".inst 0xa1bf2763 // umopa za3.s, p1/M, p1/M, z27.b, z31.b\n"
- ".inst 0xa0438378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa1ae2480 // umopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1ae24a1 // umopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa1ae24c2 // umopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa1ae24e3 // umopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa1bf2680 // umopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa1bf26a1 // umopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa1bf26c2 // umopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa1bf26e3 // umopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa1ad2700 // umopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa1ad2721 // umopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa1ad2742 // umopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa1ad2763 // umopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa1bd2500 // umopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa1bd2521 // umopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa1bd2542 // umopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1bd2563 // umopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ld1b { z31.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa1bd2400 // umopa za0.s, p1/M, p1/M, z0.b, z29.b\n"
- ".inst 0xa1bd2481 // umopa za1.s, p1/M, p1/M, z4.b, z29.b\n"
- ".inst 0xa1bd2502 // umopa za2.s, p1/M, p1/M, z8.b, z29.b\n"
- ".inst 0xa1bd2583 // umopa za3.s, p1/M, p1/M, z12.b, z29.b\n"
- ".inst 0xa1b32420 // umopa za0.s, p1/M, p1/M, z1.b, z19.b\n"
- ".inst 0xa1b324a1 // umopa za1.s, p1/M, p1/M, z5.b, z19.b\n"
- ".inst 0xa1b32522 // umopa za2.s, p1/M, p1/M, z9.b, z19.b\n"
- ".inst 0xa1b325a3 // umopa za3.s, p1/M, p1/M, z13.b, z19.b\n"
- ".inst 0xa1b42460 // umopa za0.s, p1/M, p1/M, z3.b, z20.b\n"
- ".inst 0xa1b424e1 // umopa za1.s, p1/M, p1/M, z7.b, z20.b\n"
- ".inst 0xa1b42562 // umopa za2.s, p1/M, p1/M, z11.b, z20.b\n"
- ".inst 0xa1b425e3 // umopa za3.s, p1/M, p1/M, z15.b, z20.b\n"
- ".inst 0xa1bf2700 // umopa za0.s, p1/M, p1/M, z24.b, z31.b\n"
- ".inst 0xa1bf2721 // umopa za1.s, p1/M, p1/M, z25.b, z31.b\n"
- ".inst 0xa1bf2742 // umopa za2.s, p1/M, p1/M, z26.b, z31.b\n"
- ".inst 0xa1bf2763 // umopa za3.s, p1/M, p1/M, z27.b, z31.b\n"
+ ".inst 0xa1ae2480 // umopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+ ".inst 0xa1ae24a1 // umopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa1ae24c2 // umopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa1ae24e3 // umopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa1bf2680 // umopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ ".inst 0xa1bf26a1 // umopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa1bf26c2 // umopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa1bf26e3 // umopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa1ad2700 // umopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ ".inst 0xa1ad2721 // umopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa1ad2742 // umopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa1ad2763 // umopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa1bd2500 // umopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa1bd2521 // umopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa1bd2542 // umopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1bd2563 // umopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
"8:" // K oddments
- "cbz x20, 10f\n"
+ "cbz x21, 10f\n"
"9:" // K oddments: Loop
".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
"addvl x27, x27, #4\n"
"ld1b { z15.b }, p1/Z, [x23]\n"
"addvl x23, x23, #1\n"
@@ -234,25 +233,25 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
- ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xa043c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa060c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14]\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa061c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
- ".inst 0xa062c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x8, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa063c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 30f\n"
@@ -260,56 +259,56 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa060c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
- ".inst 0xa061c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"cmp x12, x20\n"
- ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 30f\n"
"14:" // Store to output array
"ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10\n" // C += n
"sub x25, x13, x11\n"
"ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x26, x11, x24, x26\n" // C += m * ldc
"ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
- "add x26, x26, x10\n" // C += n
- "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
- "madd x26, x11, x24, x26\n" // C += m * ldc
- "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
"tbz x16, #2, 15f\n"
- "ldr w22, [%x[args], %[offsetof_n_0]]\n"
- "ldr x21, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x10\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "add x22, x22, x10\n"
- "add x21, x21, x22, LSL #2\n"
- "add x20, x20, x22, LSL #2\n"
- "ld1w { z2.s }, p0/Z, [x21]\n"
+ "add x20, x20, x21, LSL #2\n"
"ld1w { z1.s }, p0/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x23\n"
"whilelt p0.s, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 17f\n"
"16:" // Store to output array: Accumulator row 0 loop
".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- "add x12, x12, #0x4\n"
".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
- "cmp x12, x21, LSL #2\n"
+ "add x12, x12, #0x4\n"
".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ "cmp x12, x21, LSL #2\n"
".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
- ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1b { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1b { z17.s }, p0, [x26]\n"
@@ -321,55 +320,56 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
- ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
- "st1b { z4.s }, p0, [x26]\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1b { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
"subs x20, x20, #0x1\n"
- "st1b { z5.s }, p0, [x26]\n"
+ "st1b { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
- "st1b { z6.s }, p0, [x26]\n"
+ "st1b { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"18:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
"beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
- ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "st1b { z16.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1b { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z17.s }, p0, [x26]\n"
+ "st1b { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z18.s }, p0, [x26]\n"
+ "st1b { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z19.s }, p0, [x26]\n"
+ "st1b { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 19b\n"
"20:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 21f\n"
".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- "subs x20, x20, #0x1\n"
".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ "subs x20, x20, #0x1\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
- ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
"st1b { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 21f\n"
@@ -382,113 +382,115 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"21:" // Store to output array: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
"beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 23f\n"
"22:" // Store to output array: Accumulator row 2 loop
- ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
- ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
- "st1b { z12.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1b4cea8 // sclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ "st1b { z8.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z13.s }, p0, [x26]\n"
+ "st1b { z9.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z14.s }, p0, [x26]\n"
+ "st1b { z10.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z15.s }, p0, [x26]\n"
+ "st1b { z11.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 22b\n"
"23:" // Store to output array: Accumulator row 2 oddments
"cbz x20, 24f\n"
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
- ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "st1b { z16.s }, p0, [x26]\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1b4ceac // sclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ "st1b { z12.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
"subs x20, x20, #0x1\n"
- "st1b { z17.s }, p0, [x26]\n"
+ "st1b { z13.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
- "st1b { z18.s }, p0, [x26]\n"
+ "st1b { z14.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"24:" // Store to output array: Accumulator row 2 oddments: End
"subs x25, x25, x22\n"
"beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
"cmp x25, x23\n"
- "mov x12, #0x0\n"
"csel x20, x25, x23, LT\n"
"lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 26f\n"
"25:" // Store to output array: Accumulator row 3 loop
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
- ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "st1b { z16.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z17.s }, p0, [x26]\n"
+ "st1b { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z18.s }, p0, [x26]\n"
+ "st1b { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z19.s }, p0, [x26]\n"
+ "st1b { z31.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 25b\n"
"26:" // Store to output array: Accumulator row 3 oddments
"cbz x20, 27f\n"
- ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
- ".inst 0xc1a1aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n"
- ".inst 0xc1a0ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n"
- ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
- "st1b { z20.s }, p0, [x26]\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 27f\n"
"subs x20, x20, #0x1\n"
- "st1b { z21.s }, p0, [x26]\n"
+ "st1b { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 27f\n"
- "st1b { z22.s }, p0, [x26]\n"
+ "st1b { z30.s }, p0, [x26]\n"
"27:" // Store to output array: Accumulator row 3 oddments: End
"28:" // Store to output array: End
"tbz x16, #0, 30f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"29:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15]\n"
- ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
".inst 0xa043c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- "addvl x15, x15, #16\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
"blt 29b\n"
"30:" // End block
"incw x10\n"
"cmp x10, x9\n"
"blt 3b\n"
"incw x11, ALL, MUL #4\n"
- "mov x10, #0x0\n"
"cmp x11, x13\n"
+ "mov x10, #0x0\n"
"mov x28, x27\n"
"blt 3b\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
index ac21a980d3..4cf20bef91 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
@@ -82,7 +82,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 8, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
index 61c38db3cb..7fc723ecad 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
@@ -50,19 +50,18 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -83,7 +82,6 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
ka.B_stride = B_stride;
switch(act.type) {
default:
@@ -109,15 +107,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"bgt 29f\n"
"beq 15f\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -125,12 +123,12 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 3f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 3f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 3f\n"
"mov x11, x12\n"
"3:" // Height 1: B setup done
@@ -145,26 +143,26 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"cbz x15, 4f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"b 6f\n"
"4:" // Height 1: no bias
"tbz %x[flags], #0, 5f\n"
- "ld1w { z19.s }, p4/Z, [x13]\n"
+ "ld1w { z16.s }, p4/Z, [x13]\n"
"ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "zip1 z8.d, z16.d, z12.d\n"
+ "zip2 z12.d, z16.d, z12.d\n"
"ld1w { z17.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
- "zip1 z8.d, z19.d, z12.d\n"
- "zip2 z12.d, z19.d, z12.d\n"
"zip1 z9.d, z18.d, z13.d\n"
"zip2 z13.d, z18.d, z13.d\n"
"zip1 z10.d, z17.d, z14.d\n"
@@ -185,8 +183,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -202,96 +200,96 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"ble 11f\n"
"10:" // Height 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
- "ld1rqh { z20.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "trn1 z19.d, z20.d, z18.d\n"
- "trn2 z20.d, z20.d, z18.d\n"
- ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n"
- "ld1h { z1.h }, p5/Z, [x11]\n"
- ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
"ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6461e669 // bfmmla z9.s, z19.h, z1.h\n"
- "ld1h { z18.h }, p5/Z, [x10]\n"
- ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6472e66a // bfmmla z10.s, z19.h, z18.h\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x9]\n"
- ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
- ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n"
".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
- ".inst 0x6471e68a // bfmmla z10.s, z20.h, z17.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6470e68a // bfmmla z10.s, z20.h, z16.h\n"
+ ".inst 0x6471e68e // bfmmla z14.s, z20.h, z17.h\n"
"ld1h { z17.h }, p5/Z, [x9, #2, MUL VL]\n"
- ".inst 0x6470e68e // bfmmla z14.s, z20.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "addvl x12, x12, #4\n"
+ "addvl x11, x11, #4\n"
+ "addvl x10, x10, #4\n"
+ "addvl x9, x9, #4\n"
"bgt 10b\n"
"11:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x12]\n"
- "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "addvl x12, x12, #2\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
"trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x11]\n"
".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
"ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
- "trn2 z1.d, z1.d, z19.d\n"
".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x9]\n"
".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
"ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
- "addvl x9, x9, #2\n"
+ "subs x27, x27, #0x4\n"
+ "trn2 z1.d, z1.d, z19.d\n"
".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
"ble 12f\n"
- "ld1h { z16.h }, p5/Z, [x12]\n"
- "ld1h { z17.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6471e42b // bfmmla z11.s, z1.h, z17.h\n"
+ ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
"addvl x12, x12, #2\n"
- ".inst 0x6470e428 // bfmmla z8.s, z1.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x11]\n"
- ".inst 0x6471e42c // bfmmla z12.s, z1.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x11, #1, MUL VL]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6470e429 // bfmmla z9.s, z1.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x10]\n"
- ".inst 0x6471e42d // bfmmla z13.s, z1.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #1, MUL VL]\n"
"addvl x10, x10, #2\n"
- ".inst 0x6470e42a // bfmmla z10.s, z1.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x9]\n"
- ".inst 0x6471e42e // bfmmla z14.s, z1.h, z17.h\n"
- "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
"addvl x9, x9, #2\n"
- ".inst 0x6470e42b // bfmmla z11.s, z1.h, z16.h\n"
- ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n"
"12:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -302,14 +300,14 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"uzp1 z10.d, z10.d, z14.d\n"
"uzp1 z11.d, z11.d, z15.d\n"
"tbz %x[flags], #1, 13f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z17.s\n"
- "fmin z9.s, p5/M, z9.s, z17.s\n"
- "fmin z10.s, p5/M, z10.s, z17.s\n"
- "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
"fmax z8.s, p5/M, z8.s, z16.s\n"
"fmax z9.s, p5/M, z9.s, z16.s\n"
"fmax z10.s, p5/M, z10.s, z16.s\n"
@@ -327,15 +325,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 86f\n"
"15:" // Height 2
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"16:" // Height 2: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -343,12 +341,12 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 17f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 17f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 17f\n"
"mov x11, x12\n"
"17:" // Height 2: B setup done
@@ -363,38 +361,38 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"cbz x15, 18f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"b 20f\n"
"18:" // Height 2: no bias
"tbz %x[flags], #0, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z18.s }, p4/Z, [x13]\n"
- "ld1w { z16.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z17.s }, p1/Z, [x13, #3, MUL VL]\n"
"add x20, x13, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
+ "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z8.d, z18.d, z12.d\n"
- "zip2 z12.d, z18.d, z12.d\n"
- "zip1 z9.d, z16.d, z13.d\n"
- "zip2 z13.d, z16.d, z13.d\n"
- "zip1 z10.d, z5.d, z14.d\n"
- "zip2 z14.d, z5.d, z14.d\n"
- "zip1 z11.d, z17.d, z15.d\n"
- "zip2 z15.d, z17.d, z15.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
"b 20f\n"
"19:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -409,8 +407,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"21:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 22f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -429,99 +427,99 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"ble 25f\n"
"24:" // Height 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "ld1rqh { z19.h }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
- "ld1h { z20.h }, p5/Z, [x12, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
- "ld1rqh { z19.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqh { z16.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "trn1 z18.d, z19.d, z16.d\n"
- "trn2 z19.d, z19.d, z16.d\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x11]\n"
- ".inst 0x6474e64c // bfmmla z12.s, z18.h, z20.h\n"
"ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x9]\n"
".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
"ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
- ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n"
+ ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
+ ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n"
- ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- ".inst 0x6471e669 // bfmmla z9.s, z19.h, z17.h\n"
+ ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
+ ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
- ".inst 0x6471e66a // bfmmla z10.s, z19.h, z17.h\n"
+ ".inst 0x6471e68a // bfmmla z10.s, z20.h, z17.h\n"
+ ".inst 0x6470e68e // bfmmla z14.s, z20.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x9, #2, MUL VL]\n"
- ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
+ ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "addvl x12, x12, #4\n"
+ "addvl x11, x11, #4\n"
+ "addvl x10, x10, #4\n"
"addvl x9, x9, #4\n"
- ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n"
- ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n"
"bgt 24b\n"
"25:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x12]\n"
- "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "addvl x12, x12, #2\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
"ld1rqh { z19.h }, p0/Z, [x25]\n"
"trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x11]\n"
".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
"ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
- "trn2 z1.d, z1.d, z19.d\n"
".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x9]\n"
".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
"ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
- "addvl x9, x9, #2\n"
+ "subs x27, x27, #0x4\n"
+ "trn2 z1.d, z1.d, z19.d\n"
".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
- "ble 26f\n"
- "ld1h { z16.h }, p5/Z, [x12]\n"
- "ld1h { z17.h }, p5/Z, [x12, #1, MUL VL]\n"
"addvl x12, x12, #2\n"
- ".inst 0x6470e428 // bfmmla z8.s, z1.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x11]\n"
- ".inst 0x6471e42c // bfmmla z12.s, z1.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x11, #1, MUL VL]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6470e429 // bfmmla z9.s, z1.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x10]\n"
- ".inst 0x6471e42d // bfmmla z13.s, z1.h, z17.h\n"
- "ld1h { z26.h }, p5/Z, [x10, #1, MUL VL]\n"
"addvl x10, x10, #2\n"
- ".inst 0x6470e42a // bfmmla z10.s, z1.h, z16.h\n"
- "ld1h { z17.h }, p5/Z, [x9]\n"
- ".inst 0x647ae42e // bfmmla z14.s, z1.h, z26.h\n"
- "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
"addvl x9, x9, #2\n"
- ".inst 0x6471e42b // bfmmla z11.s, z1.h, z17.h\n"
+ "ble 26f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z22.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6476e42b // bfmmla z11.s, z1.h, z22.h\n"
".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
"26:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -530,17 +528,17 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z7.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
+ "add x25, x13, x20, LSL #2\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x26, x13, x20, LSL #2\n"
"tbz %x[flags], #1, 27f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
"fmin z7.s, p5/M, z7.s, z17.s\n"
"fmin z12.s, p5/M, z12.s, z17.s\n"
@@ -564,10 +562,10 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x13, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
"28:" // Height 2: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -575,15 +573,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 86f\n"
"29:" // Height 3
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"30:" // Height 3: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -591,12 +589,12 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 31f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 31f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 31f\n"
"mov x11, x12\n"
"31:" // Height 3: B setup done
@@ -611,15 +609,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"cbz x15, 32f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -634,36 +632,36 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"32:" // Height 3: no bias
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z26.s }, p4/Z, [x13]\n"
- "ld1w { z25.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z24.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
"add x21, x13, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
+ "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
- "zip1 z8.d, z26.d, z12.d\n"
- "zip2 z12.d, z26.d, z12.d\n"
- "ld1w { z2.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z9.d, z25.d, z13.d\n"
- "zip2 z13.d, z25.d, z13.d\n"
- "zip1 z10.d, z24.d, z14.d\n"
- "zip2 z14.d, z24.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
- "zip1 z19.d, z2.d, z23.d\n"
- "zip2 z23.d, z2.d, z23.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
"b 34f\n"
"33:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -686,8 +684,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"35:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -709,130 +707,130 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"ble 39f\n"
"38:" // Height 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x12]\n"
- "ld1h { z30.h }, p5/Z, [x12, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
- "ld1rqh { z29.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
"ld1rqh { z24.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
"ld1rqh { z28.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "trn1 z27.d, z29.d, z24.d\n"
- "trn2 z29.d, z29.d, z24.d\n"
- "trn1 z26.d, z28.d, z31.d\n"
- "trn2 z28.d, z28.d, z31.d\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
- ".inst 0x647ee76c // bfmmla z12.s, z27.h, z30.h\n"
".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x11]\n"
- ".inst 0x647ee754 // bfmmla z20.s, z26.h, z30.h\n"
"ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x9]\n"
+ "cmp x27, #0x8\n"
".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "add x26, x26, #0x10\n"
".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
- ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x11, #2, MUL VL]\n"
- ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ "addvl x12, x12, #4\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
"addvl x11, x11, #4\n"
- ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x9, #2, MUL VL]\n"
- ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
- ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
"bgt 38b\n"
"39:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x12]\n"
- "ld1h { z28.h }, p5/Z, [x12, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "addvl x12, x12, #2\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
"ld1rqh { z24.h }, p0/Z, [x25]\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
"trn1 z27.d, z1.d, z24.d\n"
"trn2 z1.d, z1.d, z24.d\n"
- "trn1 z26.d, z3.d, z29.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
- ".inst 0x647ce76c // bfmmla z12.s, z27.h, z28.h\n"
- "trn2 z3.d, z3.d, z29.d\n"
".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x11]\n"
- ".inst 0x647ce754 // bfmmla z20.s, z26.h, z28.h\n"
"ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x4\n"
".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
+ "trn2 z3.d, z3.d, z28.d\n"
".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x9]\n"
+ "addvl x12, x12, #2\n"
".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
- "addvl x9, x9, #2\n"
+ "addvl x11, x11, #2\n"
".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ble 40f\n"
"ld1h { z25.h }, p5/Z, [x12]\n"
"ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
- "ld1h { z25.h }, p5/Z, [x11]\n"
".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
"ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
+ "addvl x12, x12, #2\n"
".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n"
".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
+ "addvl x11, x11, #2\n"
".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n"
".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x9]\n"
+ "addvl x10, x10, #2\n"
".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n"
".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
@@ -847,24 +845,24 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 35b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
"uzp1 z7.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x13, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x25, x26, x20, LSL #2\n"
"uzp1 z16.d, z16.d, z20.d\n"
"uzp1 z17.d, z17.d, z21.d\n"
"uzp1 z18.d, z18.d, z22.d\n"
"uzp1 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 41f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z25.s }, p5/Z, [x21]\n"
"ld1rw { z24.s }, p5/Z, [x20]\n"
"fmin z7.s, p5/M, z7.s, z25.s\n"
"fmin z12.s, p5/M, z12.s, z25.s\n"
@@ -896,14 +894,14 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x13, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
"42:" // Height 3: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -911,15 +909,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 86f\n"
"43:" // Height 4
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"44:" // Height 4: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -927,12 +925,12 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 45f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 45f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 45f\n"
"mov x11, x12\n"
"45:" // Height 4: B setup done
@@ -947,15 +945,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"cbz x15, 46f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -970,37 +968,37 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"46:" // Height 4: no bias
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x13]\n"
- "ld1w { z22.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
"add x22, x13, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x20]\n"
- "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
@@ -1027,8 +1025,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"49:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1053,133 +1051,133 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"ble 53f\n"
"52:" // Height 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z31.h }, p5/Z, [x12]\n"
- "ld1h { z30.h }, p5/Z, [x12, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
- "ld1rqh { z29.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqh { z25.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
"ld1rqh { z28.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqh { z24.h }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- "trn1 z27.d, z29.d, z25.d\n"
- "trn2 z29.d, z29.d, z25.d\n"
- "trn1 z26.d, z28.d, z24.d\n"
- "trn2 z28.d, z28.d, z24.d\n"
- ".inst 0x647fe768 // bfmmla z8.s, z27.h, z31.h\n"
- ".inst 0x647ee76c // bfmmla z12.s, z27.h, z30.h\n"
- ".inst 0x647fe750 // bfmmla z16.s, z26.h, z31.h\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x11]\n"
- ".inst 0x647ee754 // bfmmla z20.s, z26.h, z30.h\n"
"ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
- ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x9]\n"
- ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
- ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
- ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x11, #2, MUL VL]\n"
- ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ "addvl x12, x12, #4\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ "addvl x11, x11, #4\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x9, #2, MUL VL]\n"
- ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
- ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
"bgt 52b\n"
"53:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z29.h }, p5/Z, [x12]\n"
- "ld1h { z28.h }, p5/Z, [x12, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "addvl x12, x12, #2\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z25.h }, p0/Z, [x25]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z24.h }, p0/Z, [x23]\n"
- "trn1 z27.d, z1.d, z25.d\n"
- "trn2 z1.d, z1.d, z25.d\n"
- "trn1 z26.d, z3.d, z24.d\n"
- ".inst 0x647de768 // bfmmla z8.s, z27.h, z29.h\n"
- ".inst 0x647ce76c // bfmmla z12.s, z27.h, z28.h\n"
- "trn2 z3.d, z3.d, z24.d\n"
- ".inst 0x647de750 // bfmmla z16.s, z26.h, z29.h\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x11]\n"
- ".inst 0x647ce754 // bfmmla z20.s, z26.h, z28.h\n"
"ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
- ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
- ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ "subs x27, x27, #0x4\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
- ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x9]\n"
- ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
- "addvl x9, x9, #2\n"
- ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
- ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ble 54f\n"
"ld1h { z25.h }, p5/Z, [x12]\n"
"ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
- "ld1h { z25.h }, p5/Z, [x11]\n"
".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
"ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
+ "addvl x12, x12, #2\n"
".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n"
".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
+ "addvl x11, x11, #2\n"
".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n"
".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x9]\n"
+ "addvl x10, x10, #2\n"
".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n"
".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
@@ -1194,17 +1192,17 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z7.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
@@ -1214,9 +1212,9 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 55f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z24.s }, p5/Z, [x21]\n"
"ld1rw { z23.s }, p5/Z, [x20]\n"
"fmin z7.s, p5/M, z7.s, z24.s\n"
"fmin z12.s, p5/M, z12.s, z24.s\n"
@@ -1256,18 +1254,18 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x13, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x25]\n"
- "st1w { z20.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x24]\n"
- "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
"56:" // Height 4: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -1275,15 +1273,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 86f\n"
"57:" // Height 5
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"58:" // Height 5: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -1291,12 +1289,12 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 59f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 59f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 59f\n"
"mov x11, x12\n"
"59:" // Height 5: B setup done
@@ -1311,15 +1309,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"cbz x15, 60f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -1342,46 +1340,46 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"60:" // Height 5: no bias
"tbz %x[flags], #0, 61f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x13]\n"
- "ld1w { z22.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
"add x23, x13, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x21]\n"
- "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x20]\n"
- "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z19.d, z24.d, z23.d\n"
"zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z24.d, z25.d, z28.d\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
@@ -1420,8 +1418,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"63:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 64f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1449,120 +1447,120 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"ble 67f\n"
"66:" // Height 5: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z1.h }, p5/Z, [x12]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z6.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqh { z3.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
"ld1rqh { z7.h }, p0/Z, [x24]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "ld1rqh { z5.h }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
"trn1 z3.d, z7.d, z2.d\n"
"trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z1.h }, p5/Z, [x12]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
"ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6461e488 // bfmmla z8.s, z4.h, z1.h\n"
+ ".inst 0x6461e4a8 // bfmmla z8.s, z5.h, z1.h\n"
".inst 0x6461e470 // bfmmla z16.s, z3.h, z1.h\n"
".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x11]\n"
- ".inst 0x6460e48c // bfmmla z12.s, z4.h, z0.h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6460e4ac // bfmmla z12.s, z5.h, z0.h\n"
".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6461e489 // bfmmla z9.s, z4.h, z1.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10]\n"
- ".inst 0x6460e48d // bfmmla z13.s, z4.h, z0.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4ad // bfmmla z13.s, z5.h, z0.h\n"
".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6461e48a // bfmmla z10.s, z4.h, z1.h\n"
+ ".inst 0x6461e4aa // bfmmla z10.s, z5.h, z1.h\n"
".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x9]\n"
- ".inst 0x6460e48e // bfmmla z14.s, z4.h, z0.h\n"
+ ".inst 0x6460e4ae // bfmmla z14.s, z5.h, z0.h\n"
".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6461e48b // bfmmla z11.s, z4.h, z1.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
- ".inst 0x6460e48f // bfmmla z15.s, z4.h, z0.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ "addvl x12, x12, #4\n"
".inst 0x6461e4f0 // bfmmla z16.s, z7.h, z1.h\n"
- ".inst 0x6461e4b8 // bfmmla z24.s, z5.h, z1.h\n"
+ ".inst 0x6461e498 // bfmmla z24.s, z4.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x11, #2, MUL VL]\n"
".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
".inst 0x6460e4f4 // bfmmla z20.s, z7.h, z0.h\n"
- ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ "addvl x11, x11, #4\n"
".inst 0x6461e4f1 // bfmmla z17.s, z7.h, z1.h\n"
- ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ "addvl x10, x10, #4\n"
".inst 0x6461e4f2 // bfmmla z18.s, z7.h, z1.h\n"
- ".inst 0x6461e4ba // bfmmla z26.s, z5.h, z1.h\n"
+ ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x9, #2, MUL VL]\n"
".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
".inst 0x6460e4f6 // bfmmla z22.s, z7.h, z0.h\n"
- ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ "addvl x9, x9, #4\n"
".inst 0x6461e4f3 // bfmmla z19.s, z7.h, z1.h\n"
- ".inst 0x6461e4bb // bfmmla z27.s, z5.h, z1.h\n"
+ ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n"
".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
".inst 0x6460e4f7 // bfmmla z23.s, z7.h, z0.h\n"
- ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"bgt 66b\n"
"67:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z2.h }, p5/Z, [x12]\n"
- "subs x27, x27, #0x4\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z6.h }, p0/Z, [x25]\n"
+ "ld1rqh { z4.h }, p0/Z, [x25]\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
"ld1rqh { z5.h }, p0/Z, [x22]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1h { z2.h }, p5/Z, [x12]\n"
"trn1 z4.d, z5.d, z0.d\n"
"trn2 z5.d, z5.d, z0.d\n"
"ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
"ld1h { z2.h }, p5/Z, [x11]\n"
+ "subs x27, x27, #0x4\n"
".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
+ "addvl x12, x12, #2\n"
".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n"
+ "addvl x11, x11, #2\n"
".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n"
".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n"
"ld1h { z2.h }, p5/Z, [x10]\n"
@@ -1570,8 +1568,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n"
".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n"
+ "addvl x10, x10, #2\n"
".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
"ld1h { z2.h }, p5/Z, [x9]\n"
@@ -1579,8 +1577,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
- "addvl x9, x9, #2\n"
".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
+ "addvl x9, x9, #2\n"
".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
@@ -1589,12 +1587,12 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"ble 68f\n"
"ld1h { z2.h }, p5/Z, [x12]\n"
"ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x11]\n"
".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x11]\n"
+ "addvl x12, x12, #2\n"
".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
@@ -1602,8 +1600,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x10]\n"
".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
@@ -1611,8 +1609,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x9]\n"
".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x9]\n"
".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
@@ -1629,20 +1627,20 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 63b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z7.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
@@ -1654,9 +1652,9 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"uzp1 z26.d, z26.d, z30.d\n"
"uzp1 z27.d, z27.d, z31.d\n"
"tbz %x[flags], #1, 69f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x21]\n"
"ld1rw { z23.s }, p5/Z, [x20]\n"
"fmin z7.s, p5/M, z7.s, z0.s\n"
"fmin z12.s, p5/M, z12.s, z0.s\n"
@@ -1704,22 +1702,22 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x13, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x25]\n"
- "st1w { z20.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x24]\n"
- "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
"70:" // Height 5: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -1727,19 +1725,18 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 86f\n"
"71:" // Height 6
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
- "mov x21, #0x18\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x18\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "madd x21, x20, x21, x13\n"
- "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
"72:" // Height 6: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -1747,12 +1744,12 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 73f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 73f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 73f\n"
"mov x11, x12\n"
"73:" // Height 6: B setup done
@@ -1767,15 +1764,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"cbz x15, 74f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -1798,54 +1795,54 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"74:" // Height 6: no bias
"tbz %x[flags], #0, 75f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x13]\n"
- "ld1w { z22.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
"add x24, x13, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x13]\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x13, #2, MUL VL]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
"ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
"ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x21]\n"
- "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z28.s }, p4/Z, [x20]\n"
- "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
"zip2 z23.d, z24.d, z23.d\n"
"zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
@@ -1881,8 +1878,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"77:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 78f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1913,123 +1910,123 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"ble 81f\n"
"80:" // Height 6: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z1.h }, p5/Z, [x12]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
- "ld1rqh { z6.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqh { z3.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqh { z7.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
- "ld1rqh { z5.h }, p0/Z, [x22]\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
"ld1rqh { z0.h }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
- "add x21, x21, #0x10\n"
- "trn1 z3.d, z7.d, z2.d\n"
- "trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1h { z1.h }, p5/Z, [x12]\n"
"ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6461e488 // bfmmla z8.s, z4.h, z1.h\n"
- ".inst 0x6461e470 // bfmmla z16.s, z3.h, z1.h\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x11]\n"
- ".inst 0x6460e48c // bfmmla z12.s, z4.h, z0.h\n"
- ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6461e489 // bfmmla z9.s, z4.h, z1.h\n"
- ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10]\n"
- ".inst 0x6460e48d // bfmmla z13.s, z4.h, z0.h\n"
- ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6461e48a // bfmmla z10.s, z4.h, z1.h\n"
- ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x9]\n"
- ".inst 0x6460e48e // bfmmla z14.s, z4.h, z0.h\n"
- ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6461e48b // bfmmla z11.s, z4.h, z1.h\n"
- ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
- ".inst 0x6460e48f // bfmmla z15.s, z4.h, z0.h\n"
- ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
"addvl x12, x12, #4\n"
- ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
- ".inst 0x6461e4f0 // bfmmla z16.s, z7.h, z1.h\n"
- ".inst 0x6461e4b8 // bfmmla z24.s, z5.h, z1.h\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x11, #2, MUL VL]\n"
- ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
- ".inst 0x6460e4f4 // bfmmla z20.s, z7.h, z0.h\n"
- ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
"addvl x11, x11, #4\n"
- ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
- ".inst 0x6461e4f1 // bfmmla z17.s, z7.h, z1.h\n"
- ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
"addvl x10, x10, #4\n"
- ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
- ".inst 0x6461e4f2 // bfmmla z18.s, z7.h, z1.h\n"
- ".inst 0x6461e4ba // bfmmla z26.s, z5.h, z1.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x9, #2, MUL VL]\n"
- ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
- ".inst 0x6460e4f6 // bfmmla z22.s, z7.h, z0.h\n"
- ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
"addvl x9, x9, #4\n"
- ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
- ".inst 0x6461e4f3 // bfmmla z19.s, z7.h, z1.h\n"
- ".inst 0x6461e4bb // bfmmla z27.s, z5.h, z1.h\n"
- ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
- ".inst 0x6460e4f7 // bfmmla z23.s, z7.h, z0.h\n"
- ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
"bgt 80b\n"
"81:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z2.h }, p5/Z, [x12]\n"
- "subs x27, x27, #0x4\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z6.h }, p0/Z, [x25]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
"ld1rqh { z5.h }, p0/Z, [x22]\n"
"ld1rqh { z0.h }, p0/Z, [x21]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
"trn1 z4.d, z5.d, z0.d\n"
"trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z2.h }, p5/Z, [x12]\n"
"ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
"ld1h { z2.h }, p5/Z, [x11]\n"
+ "subs x27, x27, #0x4\n"
".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
+ "addvl x12, x12, #2\n"
".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n"
+ "addvl x11, x11, #2\n"
".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n"
".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n"
"ld1h { z2.h }, p5/Z, [x10]\n"
@@ -2037,8 +2034,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n"
".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n"
+ "addvl x10, x10, #2\n"
".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
"ld1h { z2.h }, p5/Z, [x9]\n"
@@ -2046,8 +2043,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
- "addvl x9, x9, #2\n"
".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
+ "addvl x9, x9, #2\n"
".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
@@ -2056,12 +2053,12 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"ble 82f\n"
"ld1h { z2.h }, p5/Z, [x12]\n"
"ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x11]\n"
".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x11]\n"
+ "addvl x12, x12, #2\n"
".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
@@ -2069,8 +2066,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x10]\n"
".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
@@ -2078,8 +2075,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x9]\n"
".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x9]\n"
".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
@@ -2096,21 +2093,21 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 77b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z7.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
@@ -2126,9 +2123,9 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"uzp1 z30.d, z27.d, z31.d\n"
"uzp2 z27.d, z27.d, z31.d\n"
"tbz %x[flags], #1, 83f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x21]\n"
"ld1rw { z0.s }, p5/Z, [x20]\n"
"fmin z7.s, p5/M, z7.s, z1.s\n"
"fmin z12.s, p5/M, z12.s, z1.s\n"
@@ -2184,26 +2181,26 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x13, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x25]\n"
- "st1w { z20.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x24]\n"
- "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z23.s }, p4, [x23]\n"
- "st1w { z28.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z29.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z30.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x22]\n"
- "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z23.s }, p4, [x22]\n"
+ "st1w { z28.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z29.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
"84:" // Height 6: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -2220,8 +2217,8 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"86:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
index 44b766c2d5..8e83f1cb2c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
@@ -82,7 +82,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 1> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
index 406f20bad7..ffad168b44 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
@@ -49,19 +49,18 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const __fp16 *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -82,7 +81,6 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
ka.B_stride = B_stride;
switch(act.type) {
default:
@@ -108,15 +106,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"bgt 27f\n"
"beq 14f\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -124,12 +122,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 3f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 3f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 3f\n"
"mov x11, x12\n"
"3:" // Height 1: B setup done
@@ -164,8 +162,8 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -183,41 +181,41 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"ld1h { z7.h }, p4/Z, [x11]\n"
"ble 11f\n"
"10:" // Height 1: Multiply loop: Main loop
- "addvl x12, x12, #1\n"
"fmla z8.h, p4/M, z6.h, z0.h\n"
- "ld1h { z17.h }, p4/Z, [x10]\n"
- "addvl x11, x11, #1\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z17.h }, p4/Z, [x10]\n"
"ld1h { z16.h }, p4/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
"add x26, x26, #0x2\n"
"subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, p4/M, z17.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x12]\n"
- "fmla z11.h, p4/M, z16.h, z0.h\n"
- "ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1h { z7.h }, p4/Z, [x11]\n"
"bgt 10b\n"
"11:" // Height 1: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.h, p4/M, z6.h, z0.h\n"
- "ld1h { z17.h }, p4/Z, [x10]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z17.h }, p4/Z, [x10]\n"
"ld1h { z16.h }, p4/Z, [x9]\n"
"add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "cmp x28, x20\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, p4/M, z17.h, z0.h\n"
- "fmla z11.h, p4/M, z16.h, z0.h\n"
"bne 7b\n"
"tbz %x[flags], #1, 12f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z17.h }, p4/Z, [x21]\n"
"ld1rh { z16.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z17.h\n"
"fmin z9.h, p4/M, z9.h, z17.h\n"
@@ -240,15 +238,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"b 80f\n"
"14:" // Height 2
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"15:" // Height 2: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -256,12 +254,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 16f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 16f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 16f\n"
"mov x11, x12\n"
"16:" // Height 2: B setup done
@@ -276,22 +274,22 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"cbz x15, 17f\n"
"ld1h { z8.h }, p4/Z, [x15]\n"
"ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
- "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
- "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x15, x15, #4\n"
"b 19f\n"
"17:" // Height 2: no bias
"tbz %x[flags], #0, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x13, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x13]\n"
"ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
- "add x20, x13, x20, LSL #1\n"
"ld1h { z12.h }, p3/Z, [x20]\n"
"ld1h { z13.h }, p2/Z, [x20, #1, MUL VL]\n"
"ld1h { z14.h }, p1/Z, [x20, #2, MUL VL]\n"
@@ -310,8 +308,8 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"20:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 21f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -333,26 +331,26 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"ld1h { z7.h }, p4/Z, [x11]\n"
"ble 24f\n"
"23:" // Height 2: Multiply loop: Main loop
- "addvl x12, x12, #1\n"
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z12.h, p4/M, z6.h, z1.h\n"
"ld1h { z17.h }, p4/Z, [x10]\n"
- "addvl x11, x11, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"ld1h { z16.h }, p4/Z, [x9]\n"
+ "addvl x11, x11, #1\n"
"add x26, x26, #0x2\n"
"subs x27, x27, #0x1\n"
- "add x25, x25, #0x2\n"
- "addvl x10, x10, #1\n"
"fmla z10.h, p4/M, z17.h, z0.h\n"
"fmla z14.h, p4/M, z17.h, z1.h\n"
- "addvl x9, x9, #1\n"
- "ld1h { z6.h }, p4/Z, [x12]\n"
+ "add x25, x25, #0x2\n"
"fmla z11.h, p4/M, z16.h, z0.h\n"
- "ld1rh { z0.h }, p4/Z, [x26]\n"
"fmla z15.h, p4/M, z16.h, z1.h\n"
+ "addvl x10, x10, #1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
+ "addvl x9, x9, #1\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
"ld1h { z7.h }, p4/Z, [x11]\n"
"bgt 23b\n"
"24:" // Height 2: Multiply loop: Main loop skip
@@ -364,22 +362,22 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"fmla z13.h, p4/M, z7.h, z1.h\n"
"ld1h { z16.h }, p4/Z, [x9]\n"
"add x28, x28, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"cmp x28, x20\n"
- "addvl x10, x10, #1\n"
"fmla z10.h, p4/M, z17.h, z0.h\n"
"fmla z14.h, p4/M, z17.h, z1.h\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z11.h, p4/M, z16.h, z0.h\n"
"fmla z15.h, p4/M, z16.h, z1.h\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"bne 20b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z17.h }, p4/Z, [x21]\n"
"ld1rh { z16.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z17.h\n"
"fmin z9.h, p4/M, z9.h, z17.h\n"
@@ -403,10 +401,10 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
"st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1h { z12.h }, p3, [x26]\n"
- "st1h { z13.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p1, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p0, [x26, #3, MUL VL]\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
"26:" // Height 2: Writeback done
"dech x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -414,15 +412,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"b 80f\n"
"27:" // Height 3
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"28:" // Height 3: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -430,12 +428,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 29f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 29f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 29f\n"
"mov x11, x12\n"
"29:" // Height 3: B setup done
@@ -450,27 +448,27 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"cbz x15, 30f\n"
"ld1h { z8.h }, p4/Z, [x15]\n"
"ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
- "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
- "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 32f\n"
"30:" // Height 3: no bias
"tbz %x[flags], #0, 31f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x13, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x13]\n"
"ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
- "add x21, x13, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z12.h }, p3/Z, [x21]\n"
"ld1h { z13.h }, p2/Z, [x21, #1, MUL VL]\n"
"ld1h { z14.h }, p1/Z, [x21, #2, MUL VL]\n"
@@ -497,8 +495,8 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"33:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 34f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -524,13 +522,13 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"ld1h { z7.h }, p4/Z, [x11]\n"
"ble 37f\n"
"36:" // Height 3: Multiply loop: Main loop
- "addvl x12, x12, #1\n"
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z12.h, p4/M, z6.h, z1.h\n"
+ "addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
- "ld1h { z21.h }, p4/Z, [x10]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z21.h }, p4/Z, [x10]\n"
"add x26, x26, #0x2\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
@@ -538,18 +536,18 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"subs x27, x27, #0x1\n"
"add x25, x25, #0x2\n"
"add x24, x24, #0x2\n"
- "addvl x10, x10, #1\n"
- "addvl x9, x9, #1\n"
"fmla z10.h, p4/M, z21.h, z0.h\n"
"fmla z14.h, p4/M, z21.h, z1.h\n"
"fmla z18.h, p4/M, z21.h, z2.h\n"
- "ld1h { z6.h }, p4/Z, [x12]\n"
"fmla z11.h, p4/M, z20.h, z0.h\n"
- "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"fmla z15.h, p4/M, z20.h, z1.h\n"
"fmla z19.h, p4/M, z20.h, z2.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
"ld1h { z7.h }, p4/Z, [x11]\n"
"bgt 36b\n"
"37:" // Height 3: Multiply loop: Main loop skip
@@ -558,30 +556,30 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"fmla z12.h, p4/M, z6.h, z1.h\n"
"add x28, x28, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
- "ld1h { z21.h }, p4/Z, [x10]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "addvl x12, x12, #1\n"
+ "ld1h { z21.h }, p4/Z, [x10]\n"
+ "cmp x28, x20\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"ld1h { z20.h }, p4/Z, [x9]\n"
- "addvl x11, x11, #1\n"
- "cmp x28, x20\n"
- "addvl x10, x10, #1\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.h, p4/M, z21.h, z0.h\n"
"fmla z14.h, p4/M, z21.h, z1.h\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.h, p4/M, z21.h, z2.h\n"
"fmla z11.h, p4/M, z20.h, z0.h\n"
+ "addvl x9, x9, #1\n"
"fmla z15.h, p4/M, z20.h, z1.h\n"
"fmla z19.h, p4/M, z20.h, z2.h\n"
"bne 33b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z21.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z21.h }, p4/Z, [x21]\n"
"ld1rh { z20.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z21.h\n"
"fmin z9.h, p4/M, z9.h, z21.h\n"
@@ -613,14 +611,14 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
"st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1h { z12.h }, p3, [x26]\n"
- "st1h { z13.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p1, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p0, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p3, [x25]\n"
- "st1h { z17.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p1, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
"39:" // Height 3: Writeback done
"dech x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -628,15 +626,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"b 80f\n"
"40:" // Height 4
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"41:" // Height 4: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -644,12 +642,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 42f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 42f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 42f\n"
"mov x11, x12\n"
"42:" // Height 4: B setup done
@@ -664,18 +662,18 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"cbz x15, 43f\n"
"ld1h { z8.h }, p4/Z, [x15]\n"
"ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
- "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
- "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -683,13 +681,13 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"43:" // Height 4: no bias
"tbz %x[flags], #0, 44f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x13, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x13]\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
- "add x22, x13, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z12.h }, p3/Z, [x22]\n"
"ld1h { z13.h }, p2/Z, [x22, #1, MUL VL]\n"
"ld1h { z14.h }, p1/Z, [x22, #2, MUL VL]\n"
@@ -724,8 +722,8 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -755,9 +753,9 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"ld1h { z7.h }, p4/Z, [x11]\n"
"ble 50f\n"
"49:" // Height 4: Multiply loop: Main loop
- "addvl x12, x12, #1\n"
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z12.h, p4/M, z6.h, z1.h\n"
+ "addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
@@ -775,9 +773,9 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"fmla z10.h, p4/M, z25.h, z0.h\n"
"fmla z14.h, p4/M, z25.h, z1.h\n"
"addvl x10, x10, #1\n"
- "addvl x9, x9, #1\n"
"fmla z18.h, p4/M, z25.h, z2.h\n"
"fmla z22.h, p4/M, z25.h, z3.h\n"
+ "addvl x9, x9, #1\n"
"ld1h { z6.h }, p4/Z, [x12]\n"
"fmla z11.h, p4/M, z24.h, z0.h\n"
"fmla z15.h, p4/M, z24.h, z1.h\n"
@@ -797,18 +795,18 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
"ld1h { z25.h }, p4/Z, [x10]\n"
- "addvl x12, x12, #1\n"
+ "cmp x28, x20\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
+ "addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "addvl x10, x10, #1\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
"ld1h { z24.h }, p4/Z, [x9]\n"
- "cmp x28, x20\n"
- "addvl x9, x9, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z10.h, p4/M, z25.h, z0.h\n"
"fmla z14.h, p4/M, z25.h, z1.h\n"
+ "addvl x9, x9, #1\n"
"fmla z18.h, p4/M, z25.h, z2.h\n"
"fmla z22.h, p4/M, z25.h, z3.h\n"
"fmla z11.h, p4/M, z24.h, z0.h\n"
@@ -817,13 +815,13 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"fmla z23.h, p4/M, z24.h, z3.h\n"
"bne 46b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 51f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z25.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z25.h }, p4/Z, [x21]\n"
"ld1rh { z24.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z25.h\n"
"fmin z9.h, p4/M, z9.h, z25.h\n"
@@ -863,18 +861,18 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
"st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1h { z12.h }, p3, [x26]\n"
- "st1h { z13.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p1, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p0, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p3, [x25]\n"
- "st1h { z17.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p1, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p0, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p3, [x24]\n"
- "st1h { z21.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p1, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
"52:" // Height 4: Writeback done
"dech x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -882,15 +880,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"b 80f\n"
"53:" // Height 5
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"54:" // Height 5: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -898,12 +896,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 55f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 55f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 55f\n"
"mov x11, x12\n"
"55:" // Height 5: B setup done
@@ -918,18 +916,18 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"cbz x15, 56f\n"
"ld1h { z8.h }, p4/Z, [x15]\n"
"ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
- "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
- "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -941,16 +939,16 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"56:" // Height 5: no bias
"tbz %x[flags], #0, 57f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x13]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
- "add x23, x13, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
"ld1h { z12.h }, p3/Z, [x23]\n"
"ld1h { z13.h }, p2/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z14.h }, p1/Z, [x23, #2, MUL VL]\n"
"ld1h { z15.h }, p0/Z, [x23, #3, MUL VL]\n"
"ld1h { z16.h }, p3/Z, [x22]\n"
@@ -991,8 +989,8 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"59:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 60f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1035,8 +1033,8 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"add x26, x26, #0x2\n"
"subs x27, x27, #0x1\n"
"fmla z24.h, p4/M, z6.h, z4.h\n"
- "ld1h { z29.h }, p4/Z, [x10]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z29.h }, p4/Z, [x10]\n"
"add x25, x25, #0x2\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
@@ -1046,23 +1044,23 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"fmla z25.h, p4/M, z7.h, z4.h\n"
"ld1h { z28.h }, p4/Z, [x9]\n"
"add x22, x22, #0x2\n"
- "addvl x10, x10, #1\n"
- "addvl x9, x9, #1\n"
"fmla z10.h, p4/M, z29.h, z0.h\n"
"fmla z14.h, p4/M, z29.h, z1.h\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"fmla z18.h, p4/M, z29.h, z2.h\n"
"fmla z22.h, p4/M, z29.h, z3.h\n"
"fmla z26.h, p4/M, z29.h, z4.h\n"
- "ld1h { z6.h }, p4/Z, [x12]\n"
"fmla z11.h, p4/M, z28.h, z0.h\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
"fmla z15.h, p4/M, z28.h, z1.h\n"
- "ld1rh { z1.h }, p4/Z, [x25]\n"
"fmla z19.h, p4/M, z28.h, z2.h\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
"fmla z23.h, p4/M, z28.h, z3.h\n"
- "ld1rh { z3.h }, p4/Z, [x23]\n"
"fmla z27.h, p4/M, z28.h, z4.h\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
"ld1rh { z4.h }, p4/Z, [x22]\n"
"ld1h { z7.h }, p4/Z, [x11]\n"
"bgt 62b\n"
@@ -1073,15 +1071,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
+ "cmp x28, x20\n"
"addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z24.h, p4/M, z6.h, z4.h\n"
- "ld1h { z29.h }, p4/Z, [x10]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "addvl x10, x10, #1\n"
+ "ld1h { z29.h }, p4/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
- "cmp x28, x20\n"
+ "addvl x10, x10, #1\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
"fmla z25.h, p4/M, z7.h, z4.h\n"
"ld1h { z28.h }, p4/Z, [x9]\n"
@@ -1098,14 +1096,14 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"fmla z27.h, p4/M, z28.h, z4.h\n"
"bne 59b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 64f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z29.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z29.h }, p4/Z, [x21]\n"
"ld1rh { z28.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z29.h\n"
"fmin z9.h, p4/M, z9.h, z29.h\n"
@@ -1153,22 +1151,22 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
"st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1h { z12.h }, p3, [x26]\n"
- "st1h { z13.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p1, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p0, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p3, [x25]\n"
- "st1h { z17.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p1, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p0, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p3, [x24]\n"
- "st1h { z21.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p1, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p0, [x24, #3, MUL VL]\n"
- "st1h { z24.h }, p3, [x23]\n"
- "st1h { z25.h }, p2, [x23, #1, MUL VL]\n"
- "st1h { z26.h }, p1, [x23, #2, MUL VL]\n"
- "st1h { z27.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [x22, #3, MUL VL]\n"
"65:" // Height 5: Writeback done
"dech x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -1176,19 +1174,18 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"b 80f\n"
"66:" // Height 6
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
- "mov x21, #0xc\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0xc\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "madd x21, x20, x21, x13\n"
- "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
"67:" // Height 6: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -1196,12 +1193,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 68f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 68f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 68f\n"
"mov x11, x12\n"
"68:" // Height 6: B setup done
@@ -1216,18 +1213,18 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"cbz x15, 69f\n"
"ld1h { z8.h }, p4/Z, [x15]\n"
"ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
- "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
- "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1243,17 +1240,17 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"69:" // Height 6: no bias
"tbz %x[flags], #0, 70f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x13, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x13]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
- "add x24, x13, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
"ld1h { z12.h }, p3/Z, [x24]\n"
"ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
"ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
"ld1h { z16.h }, p3/Z, [x23]\n"
@@ -1302,8 +1299,8 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"72:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1365,9 +1362,9 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"fmla z29.h, p4/M, z7.h, z5.h\n"
"ld1h { z7.h }, p4/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "addvl x9, x9, #1\n"
"fmla z10.h, p4/M, z6.h, z0.h\n"
"fmla z14.h, p4/M, z6.h, z1.h\n"
+ "addvl x9, x9, #1\n"
"fmla z18.h, p4/M, z6.h, z2.h\n"
"fmla z22.h, p4/M, z6.h, z3.h\n"
"fmla z26.h, p4/M, z6.h, z4.h\n"
@@ -1394,15 +1391,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
+ "cmp x28, x20\n"
"addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z24.h, p4/M, z6.h, z4.h\n"
"fmla z28.h, p4/M, z6.h, z5.h\n"
"ld1h { z6.h }, p4/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
- "cmp x28, x20\n"
+ "addvl x10, x10, #1\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
"fmla z25.h, p4/M, z7.h, z4.h\n"
@@ -1423,15 +1420,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"fmla z31.h, p4/M, z7.h, z5.h\n"
"bne 72b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"tbz %x[flags], #1, 77f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z1.h }, p4/Z, [x21]\n"
"ld1rh { z0.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z1.h\n"
"fmin z9.h, p4/M, z9.h, z1.h\n"
@@ -1487,26 +1484,26 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
"st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1h { z12.h }, p3, [x26]\n"
- "st1h { z13.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p1, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p0, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p3, [x25]\n"
- "st1h { z17.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p1, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p0, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p3, [x24]\n"
- "st1h { z21.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p1, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p0, [x24, #3, MUL VL]\n"
- "st1h { z24.h }, p3, [x23]\n"
- "st1h { z25.h }, p2, [x23, #1, MUL VL]\n"
- "st1h { z26.h }, p1, [x23, #2, MUL VL]\n"
- "st1h { z27.h }, p0, [x23, #3, MUL VL]\n"
- "st1h { z28.h }, p3, [x22]\n"
- "st1h { z29.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z30.h }, p1, [x22, #2, MUL VL]\n"
- "st1h { z31.h }, p0, [x22, #3, MUL VL]\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [x22, #3, MUL VL]\n"
+ "st1h { z28.h }, p3, [x21]\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z30.h }, p1, [x21, #2, MUL VL]\n"
+ "st1h { z31.h }, p0, [x21, #3, MUL VL]\n"
"78:" // Height 6: Writeback done
"dech x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -1523,8 +1520,8 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
index 78a84fd89b..d5ccf3476c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
@@ -49,19 +49,18 @@ void sve_ffhybrid_fp16_mla_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const __fp16 *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -82,7 +81,6 @@ void sve_ffhybrid_fp16_mla_6x4VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
ka.B_stride = B_stride;
switch(act.type) {
default:
@@ -108,15 +106,15 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"bgt 29f\n"
"beq 15f\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -124,12 +122,12 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 3f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 3f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 3f\n"
"mov x11, x12\n"
"3:" // Height 1: B setup done
@@ -164,8 +162,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -181,201 +179,201 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 11f\n"
"10:" // Height 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x12]\n"
- "ld1h { z16.h }, p5/Z, [x11]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "fmla z8.h, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
"fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ "fmla z10.h, z16.h, z0.h[0]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "fmla z10.h, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x12, #1, MUL VL]\n"
"fmla z11.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[1]\n"
"ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z8.h, z17.h, z0.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[1]\n"
"ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.h, z17.h, z0.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
"fmla z11.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[2]\n"
"ld1h { z16.h }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.h, z17.h, z0.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[2]\n"
"ld1h { z16.h }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.h, z17.h, z0.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x12, #3, MUL VL]\n"
"fmla z11.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[3]\n"
"ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
- "fmla z8.h, z17.h, z0.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[3]\n"
"ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
- "fmla z10.h, z17.h, z0.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x12, #4, MUL VL]\n"
"fmla z11.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[4]\n"
"ld1h { z16.h }, p5/Z, [x11, #4, MUL VL]\n"
- "fmla z8.h, z17.h, z0.h[4]\n"
- "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[4]\n"
"ld1h { z16.h }, p5/Z, [x9, #4, MUL VL]\n"
- "fmla z10.h, z17.h, z0.h[4]\n"
- "ld1h { z17.h }, p5/Z, [x12, #5, MUL VL]\n"
"fmla z11.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[5]\n"
"ld1h { z16.h }, p5/Z, [x11, #5, MUL VL]\n"
- "fmla z8.h, z17.h, z0.h[5]\n"
- "ld1h { z17.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[5]\n"
"ld1h { z16.h }, p5/Z, [x9, #5, MUL VL]\n"
- "fmla z10.h, z17.h, z0.h[5]\n"
- "ld1h { z17.h }, p5/Z, [x12, #6, MUL VL]\n"
"fmla z11.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[6]\n"
"ld1h { z16.h }, p5/Z, [x11, #6, MUL VL]\n"
- "fmla z8.h, z17.h, z0.h[6]\n"
- "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[6]\n"
"ld1h { z16.h }, p5/Z, [x9, #6, MUL VL]\n"
- "fmla z10.h, z17.h, z0.h[6]\n"
- "ld1h { z17.h }, p5/Z, [x12, #7, MUL VL]\n"
- "addvl x12, x12, #8\n"
"fmla z11.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[7]\n"
"ld1h { z16.h }, p5/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #8\n"
- "fmla z8.h, z17.h, z0.h[7]\n"
- "ld1h { z17.h }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
"fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
"ld1h { z16.h }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #8\n"
+ "cmp x27, #0x8\n"
"fmla z10.h, z17.h, z0.h[7]\n"
"fmla z11.h, z16.h, z0.h[7]\n"
+ "add x26, x26, #0x10\n"
+ "addvl x12, x12, #8\n"
+ "addvl x11, x11, #8\n"
+ "addvl x10, x10, #8\n"
+ "addvl x9, x9, #8\n"
"bgt 10b\n"
"11:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1h { z16.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z0.h[0]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "fmla z8.h, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
"addvl x10, x10, #1\n"
- "fmla z9.h, z16.h, z0.h[0]\n"
- "ld1h { z16.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z17.h, z0.h[0]\n"
- "fmla z11.h, z16.h, z0.h[0]\n"
"ble 12f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[1]\n"
"fmla z11.h, z16.h, z0.h[1]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 12f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[2]\n"
"fmla z11.h, z16.h, z0.h[2]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 12f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[3]\n"
"fmla z11.h, z16.h, z0.h[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 12f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[4]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[4]\n"
"fmla z11.h, z16.h, z0.h[4]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 12f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[5]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[5]\n"
"fmla z11.h, z16.h, z0.h[5]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 12f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[6]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[6]\n"
"fmla z11.h, z16.h, z0.h[6]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 12f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[7]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
"fmla z10.h, z17.h, z0.h[7]\n"
"fmla z11.h, z16.h, z0.h[7]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"12:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 7b\n"
"tbz %x[flags], #1, 13f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z17.h }, p5/Z, [x21]\n"
"ld1rh { z16.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z17.h\n"
"fmin z9.h, p5/M, z9.h, z17.h\n"
@@ -398,15 +396,15 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"b 86f\n"
"15:" // Height 2
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"16:" // Height 2: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -414,12 +412,12 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 17f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 17f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 17f\n"
"mov x11, x12\n"
"17:" // Height 2: B setup done
@@ -434,22 +432,22 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"cbz x15, 18f\n"
"ld1h { z8.h }, p5/Z, [x15]\n"
"ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x15, x15, #4\n"
"b 20f\n"
"18:" // Height 2: no bias
"tbz %x[flags], #0, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x13, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x13]\n"
"ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "add x20, x13, x20, LSL #1\n"
"ld1h { z12.h }, p4/Z, [x20]\n"
"ld1h { z13.h }, p3/Z, [x20, #1, MUL VL]\n"
"ld1h { z14.h }, p2/Z, [x20, #2, MUL VL]\n"
@@ -468,8 +466,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"21:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 22f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -488,29 +486,29 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 25f\n"
"24:" // Height 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x12]\n"
- "ld1h { z16.h }, p5/Z, [x11]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z0.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
"fmla z8.h, z17.h, z1.h[0]\n"
"fmla z12.h, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
"fmla z9.h, z16.h, z1.h[0]\n"
"fmla z13.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
"fmla z10.h, z17.h, z1.h[0]\n"
"fmla z14.h, z17.h, z0.h[0]\n"
"ld1h { z17.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "cmp x27, #0x8\n"
"fmla z11.h, z16.h, z1.h[0]\n"
"fmla z15.h, z16.h, z0.h[0]\n"
"ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "add x26, x26, #0x10\n"
"fmla z8.h, z17.h, z1.h[1]\n"
"fmla z12.h, z17.h, z0.h[1]\n"
"ld1h { z17.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z16.h, z1.h[1]\n"
"fmla z13.h, z16.h, z0.h[1]\n"
"ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
@@ -597,161 +595,161 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"bgt 24b\n"
"25:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x12]\n"
- "ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
"fmla z8.h, z17.h, z0.h[0]\n"
"fmla z12.h, z17.h, z1.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[0]\n"
"fmla z13.h, z16.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
"fmla z10.h, z17.h, z0.h[0]\n"
"fmla z14.h, z17.h, z1.h[0]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z11.h, z16.h, z0.h[0]\n"
"fmla z15.h, z16.h, z1.h[0]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 26f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[1]\n"
"fmla z12.h, z17.h, z1.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[1]\n"
"fmla z13.h, z16.h, z1.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[1]\n"
"fmla z14.h, z17.h, z1.h[1]\n"
+ "addvl x12, x12, #1\n"
"fmla z11.h, z16.h, z0.h[1]\n"
"fmla z15.h, z16.h, z1.h[1]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 26f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[2]\n"
"fmla z12.h, z17.h, z1.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[2]\n"
"fmla z13.h, z16.h, z1.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[2]\n"
"fmla z14.h, z17.h, z1.h[2]\n"
+ "addvl x12, x12, #1\n"
"fmla z11.h, z16.h, z0.h[2]\n"
"fmla z15.h, z16.h, z1.h[2]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 26f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[3]\n"
"fmla z12.h, z17.h, z1.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[3]\n"
"fmla z13.h, z16.h, z1.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[3]\n"
"fmla z14.h, z17.h, z1.h[3]\n"
+ "addvl x12, x12, #1\n"
"fmla z11.h, z16.h, z0.h[3]\n"
"fmla z15.h, z16.h, z1.h[3]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 26f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[4]\n"
"fmla z12.h, z17.h, z1.h[4]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[4]\n"
"fmla z13.h, z16.h, z1.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[4]\n"
"fmla z14.h, z17.h, z1.h[4]\n"
+ "addvl x12, x12, #1\n"
"fmla z11.h, z16.h, z0.h[4]\n"
"fmla z15.h, z16.h, z1.h[4]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 26f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[5]\n"
"fmla z12.h, z17.h, z1.h[5]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[5]\n"
"fmla z13.h, z16.h, z1.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[5]\n"
"fmla z14.h, z17.h, z1.h[5]\n"
+ "addvl x12, x12, #1\n"
"fmla z11.h, z16.h, z0.h[5]\n"
"fmla z15.h, z16.h, z1.h[5]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 26f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[6]\n"
"fmla z12.h, z17.h, z1.h[6]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[6]\n"
"fmla z13.h, z16.h, z1.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[6]\n"
"fmla z14.h, z17.h, z1.h[6]\n"
+ "addvl x12, x12, #1\n"
"fmla z11.h, z16.h, z0.h[6]\n"
"fmla z15.h, z16.h, z1.h[6]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 26f\n"
"ld1h { z17.h }, p5/Z, [x12]\n"
"ld1h { z16.h }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z17.h, z0.h[7]\n"
"fmla z12.h, z17.h, z1.h[7]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z16.h, z0.h[7]\n"
"fmla z13.h, z16.h, z1.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
"fmla z10.h, z17.h, z0.h[7]\n"
"fmla z14.h, z17.h, z1.h[7]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z11.h, z16.h, z0.h[7]\n"
"fmla z15.h, z16.h, z1.h[7]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"26:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 21b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"tbz %x[flags], #1, 27f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z17.h }, p5/Z, [x21]\n"
"ld1rh { z16.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z17.h\n"
"fmin z9.h, p5/M, z9.h, z17.h\n"
@@ -775,10 +773,10 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
"st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1h { z12.h }, p4, [x26]\n"
- "st1h { z13.h }, p3, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x26, #3, MUL VL]\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
"28:" // Height 2: Writeback done
"dech x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -786,15 +784,15 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"b 86f\n"
"29:" // Height 3
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"30:" // Height 3: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -802,12 +800,12 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 31f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 31f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 31f\n"
"mov x11, x12\n"
"31:" // Height 3: B setup done
@@ -822,27 +820,27 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"cbz x15, 32f\n"
"ld1h { z8.h }, p5/Z, [x15]\n"
"ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 34f\n"
"32:" // Height 3: no bias
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x13, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x13]\n"
"ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "add x21, x13, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z12.h }, p4/Z, [x21]\n"
"ld1h { z13.h }, p3/Z, [x21, #1, MUL VL]\n"
"ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n"
@@ -869,8 +867,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"35:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -892,126 +890,126 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 39f\n"
"38:" // Height 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z21.h }, p5/Z, [x12]\n"
- "ld1h { z20.h }, p5/Z, [x11]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z0.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
"fmla z8.h, z21.h, z2.h[0]\n"
"fmla z12.h, z21.h, z1.h[0]\n"
- "fmla z9.h, z20.h, z2.h[0]\n"
- "fmla z13.h, z20.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
"fmla z16.h, z21.h, z0.h[0]\n"
+ "fmla z9.h, z20.h, z2.h[0]\n"
"ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
"fmla z17.h, z20.h, z0.h[0]\n"
"ld1h { z20.h }, p5/Z, [x9]\n"
+ "cmp x27, #0x8\n"
"fmla z10.h, z21.h, z2.h[0]\n"
"fmla z14.h, z21.h, z1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z18.h, z21.h, z0.h[0]\n"
- "ld1h { z21.h }, p5/Z, [x12, #1, MUL VL]\n"
"fmla z11.h, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "add x24, x24, #0x10\n"
"fmla z15.h, z20.h, z1.h[0]\n"
"fmla z19.h, z20.h, z0.h[0]\n"
"ld1h { z20.h }, p5/Z, [x11, #1, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[1]\n"
"fmla z12.h, z21.h, z1.h[1]\n"
"fmla z16.h, z21.h, z0.h[1]\n"
- "ld1h { z21.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[1]\n"
"fmla z17.h, z20.h, z0.h[1]\n"
"ld1h { z20.h }, p5/Z, [x9, #1, MUL VL]\n"
"fmla z10.h, z21.h, z2.h[1]\n"
"fmla z14.h, z21.h, z1.h[1]\n"
"fmla z18.h, z21.h, z0.h[1]\n"
- "ld1h { z21.h }, p5/Z, [x12, #2, MUL VL]\n"
"fmla z11.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #2, MUL VL]\n"
"fmla z15.h, z20.h, z1.h[1]\n"
"fmla z19.h, z20.h, z0.h[1]\n"
"ld1h { z20.h }, p5/Z, [x11, #2, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[2]\n"
"fmla z12.h, z21.h, z1.h[2]\n"
"fmla z16.h, z21.h, z0.h[2]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[2]\n"
"fmla z17.h, z20.h, z0.h[2]\n"
"ld1h { z20.h }, p5/Z, [x9, #2, MUL VL]\n"
"fmla z10.h, z21.h, z2.h[2]\n"
"fmla z14.h, z21.h, z1.h[2]\n"
"fmla z18.h, z21.h, z0.h[2]\n"
- "ld1h { z21.h }, p5/Z, [x12, #3, MUL VL]\n"
"fmla z11.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #3, MUL VL]\n"
"fmla z15.h, z20.h, z1.h[2]\n"
"fmla z19.h, z20.h, z0.h[2]\n"
"ld1h { z20.h }, p5/Z, [x11, #3, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[3]\n"
"fmla z12.h, z21.h, z1.h[3]\n"
"fmla z16.h, z21.h, z0.h[3]\n"
- "ld1h { z21.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[3]\n"
"fmla z17.h, z20.h, z0.h[3]\n"
"ld1h { z20.h }, p5/Z, [x9, #3, MUL VL]\n"
"fmla z10.h, z21.h, z2.h[3]\n"
"fmla z14.h, z21.h, z1.h[3]\n"
"fmla z18.h, z21.h, z0.h[3]\n"
- "ld1h { z21.h }, p5/Z, [x12, #4, MUL VL]\n"
"fmla z11.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #4, MUL VL]\n"
"fmla z15.h, z20.h, z1.h[3]\n"
"fmla z19.h, z20.h, z0.h[3]\n"
"ld1h { z20.h }, p5/Z, [x11, #4, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[4]\n"
"fmla z12.h, z21.h, z1.h[4]\n"
"fmla z16.h, z21.h, z0.h[4]\n"
- "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[4]\n"
"fmla z17.h, z20.h, z0.h[4]\n"
"ld1h { z20.h }, p5/Z, [x9, #4, MUL VL]\n"
"fmla z10.h, z21.h, z2.h[4]\n"
"fmla z14.h, z21.h, z1.h[4]\n"
"fmla z18.h, z21.h, z0.h[4]\n"
- "ld1h { z21.h }, p5/Z, [x12, #5, MUL VL]\n"
"fmla z11.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #5, MUL VL]\n"
"fmla z15.h, z20.h, z1.h[4]\n"
"fmla z19.h, z20.h, z0.h[4]\n"
"ld1h { z20.h }, p5/Z, [x11, #5, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[5]\n"
"fmla z12.h, z21.h, z1.h[5]\n"
"fmla z16.h, z21.h, z0.h[5]\n"
- "ld1h { z21.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[5]\n"
"fmla z17.h, z20.h, z0.h[5]\n"
"ld1h { z20.h }, p5/Z, [x9, #5, MUL VL]\n"
"fmla z10.h, z21.h, z2.h[5]\n"
"fmla z14.h, z21.h, z1.h[5]\n"
"fmla z18.h, z21.h, z0.h[5]\n"
- "ld1h { z21.h }, p5/Z, [x12, #6, MUL VL]\n"
"fmla z11.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #6, MUL VL]\n"
"fmla z15.h, z20.h, z1.h[5]\n"
"fmla z19.h, z20.h, z0.h[5]\n"
"ld1h { z20.h }, p5/Z, [x11, #6, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[6]\n"
"fmla z12.h, z21.h, z1.h[6]\n"
"fmla z16.h, z21.h, z0.h[6]\n"
- "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[6]\n"
"fmla z17.h, z20.h, z0.h[6]\n"
"ld1h { z20.h }, p5/Z, [x9, #6, MUL VL]\n"
"fmla z10.h, z21.h, z2.h[6]\n"
"fmla z14.h, z21.h, z1.h[6]\n"
"fmla z18.h, z21.h, z0.h[6]\n"
+ "fmla z11.h, z20.h, z2.h[6]\n"
"ld1h { z21.h }, p5/Z, [x12, #7, MUL VL]\n"
"addvl x12, x12, #8\n"
- "fmla z11.h, z20.h, z2.h[6]\n"
"fmla z15.h, z20.h, z1.h[6]\n"
"fmla z19.h, z20.h, z0.h[6]\n"
"ld1h { z20.h }, p5/Z, [x11, #7, MUL VL]\n"
@@ -1019,9 +1017,9 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z8.h, z21.h, z2.h[7]\n"
"fmla z12.h, z21.h, z1.h[7]\n"
"fmla z16.h, z21.h, z0.h[7]\n"
+ "fmla z9.h, z20.h, z2.h[7]\n"
"ld1h { z21.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- "fmla z9.h, z20.h, z2.h[7]\n"
"fmla z13.h, z20.h, z1.h[7]\n"
"fmla z17.h, z20.h, z0.h[7]\n"
"ld1h { z20.h }, p5/Z, [x9, #7, MUL VL]\n"
@@ -1035,179 +1033,179 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"bgt 38b\n"
"39:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z21.h }, p5/Z, [x12]\n"
- "ld1h { z20.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
"fmla z8.h, z21.h, z0.h[0]\n"
"fmla z12.h, z21.h, z1.h[0]\n"
- "fmla z9.h, z20.h, z0.h[0]\n"
- "fmla z13.h, z20.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
"fmla z16.h, z21.h, z2.h[0]\n"
+ "fmla z9.h, z20.h, z0.h[0]\n"
"ld1h { z21.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
"fmla z17.h, z20.h, z2.h[0]\n"
"ld1h { z20.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.h, z21.h, z0.h[0]\n"
"fmla z14.h, z21.h, z1.h[0]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.h, z21.h, z2.h[0]\n"
"fmla z11.h, z20.h, z0.h[0]\n"
+ "addvl x9, x9, #1\n"
"fmla z15.h, z20.h, z1.h[0]\n"
"fmla z19.h, z20.h, z2.h[0]\n"
"ble 40f\n"
"ld1h { z21.h }, p5/Z, [x12]\n"
"ld1h { z20.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z21.h, z0.h[1]\n"
"fmla z12.h, z21.h, z1.h[1]\n"
"fmla z16.h, z21.h, z2.h[1]\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z20.h, z0.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[1]\n"
"fmla z17.h, z20.h, z2.h[1]\n"
"ld1h { z20.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.h, z21.h, z0.h[1]\n"
"fmla z14.h, z21.h, z1.h[1]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.h, z21.h, z2.h[1]\n"
"fmla z11.h, z20.h, z0.h[1]\n"
+ "addvl x9, x9, #1\n"
"fmla z15.h, z20.h, z1.h[1]\n"
"fmla z19.h, z20.h, z2.h[1]\n"
"ble 40f\n"
"ld1h { z21.h }, p5/Z, [x12]\n"
"ld1h { z20.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z21.h, z0.h[2]\n"
"fmla z12.h, z21.h, z1.h[2]\n"
"fmla z16.h, z21.h, z2.h[2]\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z20.h, z0.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[2]\n"
"fmla z17.h, z20.h, z2.h[2]\n"
"ld1h { z20.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.h, z21.h, z0.h[2]\n"
"fmla z14.h, z21.h, z1.h[2]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.h, z21.h, z2.h[2]\n"
"fmla z11.h, z20.h, z0.h[2]\n"
+ "addvl x9, x9, #1\n"
"fmla z15.h, z20.h, z1.h[2]\n"
"fmla z19.h, z20.h, z2.h[2]\n"
"ble 40f\n"
"ld1h { z21.h }, p5/Z, [x12]\n"
"ld1h { z20.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z21.h, z0.h[3]\n"
"fmla z12.h, z21.h, z1.h[3]\n"
"fmla z16.h, z21.h, z2.h[3]\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z20.h, z0.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[3]\n"
"fmla z17.h, z20.h, z2.h[3]\n"
"ld1h { z20.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.h, z21.h, z0.h[3]\n"
"fmla z14.h, z21.h, z1.h[3]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.h, z21.h, z2.h[3]\n"
"fmla z11.h, z20.h, z0.h[3]\n"
+ "addvl x9, x9, #1\n"
"fmla z15.h, z20.h, z1.h[3]\n"
"fmla z19.h, z20.h, z2.h[3]\n"
"ble 40f\n"
"ld1h { z21.h }, p5/Z, [x12]\n"
"ld1h { z20.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z21.h, z0.h[4]\n"
"fmla z12.h, z21.h, z1.h[4]\n"
"fmla z16.h, z21.h, z2.h[4]\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z20.h, z0.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[4]\n"
"fmla z17.h, z20.h, z2.h[4]\n"
"ld1h { z20.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.h, z21.h, z0.h[4]\n"
"fmla z14.h, z21.h, z1.h[4]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.h, z21.h, z2.h[4]\n"
"fmla z11.h, z20.h, z0.h[4]\n"
+ "addvl x9, x9, #1\n"
"fmla z15.h, z20.h, z1.h[4]\n"
"fmla z19.h, z20.h, z2.h[4]\n"
"ble 40f\n"
"ld1h { z21.h }, p5/Z, [x12]\n"
"ld1h { z20.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z21.h, z0.h[5]\n"
"fmla z12.h, z21.h, z1.h[5]\n"
"fmla z16.h, z21.h, z2.h[5]\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z20.h, z0.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[5]\n"
"fmla z17.h, z20.h, z2.h[5]\n"
"ld1h { z20.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.h, z21.h, z0.h[5]\n"
"fmla z14.h, z21.h, z1.h[5]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.h, z21.h, z2.h[5]\n"
"fmla z11.h, z20.h, z0.h[5]\n"
+ "addvl x9, x9, #1\n"
"fmla z15.h, z20.h, z1.h[5]\n"
"fmla z19.h, z20.h, z2.h[5]\n"
"ble 40f\n"
"ld1h { z21.h }, p5/Z, [x12]\n"
"ld1h { z20.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z21.h, z0.h[6]\n"
"fmla z12.h, z21.h, z1.h[6]\n"
"fmla z16.h, z21.h, z2.h[6]\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z20.h, z0.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[6]\n"
"fmla z17.h, z20.h, z2.h[6]\n"
"ld1h { z20.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.h, z21.h, z0.h[6]\n"
"fmla z14.h, z21.h, z1.h[6]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.h, z21.h, z2.h[6]\n"
"fmla z11.h, z20.h, z0.h[6]\n"
+ "addvl x9, x9, #1\n"
"fmla z15.h, z20.h, z1.h[6]\n"
"fmla z19.h, z20.h, z2.h[6]\n"
"ble 40f\n"
"ld1h { z21.h }, p5/Z, [x12]\n"
"ld1h { z20.h }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z21.h, z0.h[7]\n"
"fmla z12.h, z21.h, z1.h[7]\n"
"fmla z16.h, z21.h, z2.h[7]\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.h, z20.h, z0.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
"fmla z13.h, z20.h, z1.h[7]\n"
"fmla z17.h, z20.h, z2.h[7]\n"
"ld1h { z20.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z10.h, z21.h, z0.h[7]\n"
"fmla z14.h, z21.h, z1.h[7]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"fmla z18.h, z21.h, z2.h[7]\n"
"fmla z11.h, z20.h, z0.h[7]\n"
"fmla z15.h, z20.h, z1.h[7]\n"
@@ -1218,12 +1216,12 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"cmp x28, x20\n"
"bne 35b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 41f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z21.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z21.h }, p5/Z, [x21]\n"
"ld1rh { z20.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z21.h\n"
"fmin z9.h, p5/M, z9.h, z21.h\n"
@@ -1255,14 +1253,14 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
"st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1h { z12.h }, p4, [x26]\n"
- "st1h { z13.h }, p3, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x25]\n"
- "st1h { z17.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
"42:" // Height 3: Writeback done
"dech x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -1270,15 +1268,15 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"b 86f\n"
"43:" // Height 4
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"44:" // Height 4: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -1286,12 +1284,12 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 45f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 45f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 45f\n"
"mov x11, x12\n"
"45:" // Height 4: B setup done
@@ -1306,18 +1304,18 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"cbz x15, 46f\n"
"ld1h { z8.h }, p5/Z, [x15]\n"
"ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1325,13 +1323,13 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"46:" // Height 4: no bias
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x13, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x13]\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "add x22, x13, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z12.h }, p4/Z, [x22]\n"
"ld1h { z13.h }, p3/Z, [x22, #1, MUL VL]\n"
"ld1h { z14.h }, p2/Z, [x22, #2, MUL VL]\n"
@@ -1366,8 +1364,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"49:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1392,25 +1390,25 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 53f\n"
"52:" // Height 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x12]\n"
- "ld1h { z24.h }, p5/Z, [x11]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z3.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z2.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqh { z0.h }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
"fmla z8.h, z25.h, z3.h[0]\n"
"fmla z12.h, z25.h, z2.h[0]\n"
- "fmla z9.h, z24.h, z3.h[0]\n"
- "fmla z13.h, z24.h, z2.h[0]\n"
"fmla z16.h, z25.h, z1.h[0]\n"
"fmla z20.h, z25.h, z0.h[0]\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z9.h, z24.h, z3.h[0]\n"
+ "fmla z13.h, z24.h, z2.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z17.h, z24.h, z1.h[0]\n"
"fmla z21.h, z24.h, z0.h[0]\n"
"ld1h { z24.h }, p5/Z, [x9]\n"
@@ -1569,22 +1567,22 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"bgt 52b\n"
"53:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x12]\n"
- "ld1h { z24.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
"fmla z8.h, z25.h, z0.h[0]\n"
"fmla z12.h, z25.h, z1.h[0]\n"
- "fmla z9.h, z24.h, z0.h[0]\n"
- "fmla z13.h, z24.h, z1.h[0]\n"
"fmla z16.h, z25.h, z2.h[0]\n"
"fmla z20.h, z25.h, z3.h[0]\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z9.h, z24.h, z0.h[0]\n"
+ "fmla z13.h, z24.h, z1.h[0]\n"
+ "addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"fmla z17.h, z24.h, z2.h[0]\n"
"fmla z21.h, z24.h, z3.h[0]\n"
@@ -1601,23 +1599,23 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 54f\n"
"ld1h { z25.h }, p5/Z, [x12]\n"
"ld1h { z24.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z25.h, z0.h[1]\n"
"fmla z12.h, z25.h, z1.h[1]\n"
"fmla z16.h, z25.h, z2.h[1]\n"
"fmla z20.h, z25.h, z3.h[1]\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[1]\n"
"fmla z13.h, z24.h, z1.h[1]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z17.h, z24.h, z2.h[1]\n"
"fmla z21.h, z24.h, z3.h[1]\n"
"ld1h { z24.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z10.h, z25.h, z0.h[1]\n"
"fmla z14.h, z25.h, z1.h[1]\n"
+ "addvl x9, x9, #1\n"
"fmla z18.h, z25.h, z2.h[1]\n"
"fmla z22.h, z25.h, z3.h[1]\n"
"fmla z11.h, z24.h, z0.h[1]\n"
@@ -1627,23 +1625,23 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 54f\n"
"ld1h { z25.h }, p5/Z, [x12]\n"
"ld1h { z24.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z25.h, z0.h[2]\n"
"fmla z12.h, z25.h, z1.h[2]\n"
"fmla z16.h, z25.h, z2.h[2]\n"
"fmla z20.h, z25.h, z3.h[2]\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[2]\n"
"fmla z13.h, z24.h, z1.h[2]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z17.h, z24.h, z2.h[2]\n"
"fmla z21.h, z24.h, z3.h[2]\n"
"ld1h { z24.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z10.h, z25.h, z0.h[2]\n"
"fmla z14.h, z25.h, z1.h[2]\n"
+ "addvl x9, x9, #1\n"
"fmla z18.h, z25.h, z2.h[2]\n"
"fmla z22.h, z25.h, z3.h[2]\n"
"fmla z11.h, z24.h, z0.h[2]\n"
@@ -1653,23 +1651,23 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 54f\n"
"ld1h { z25.h }, p5/Z, [x12]\n"
"ld1h { z24.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z25.h, z0.h[3]\n"
"fmla z12.h, z25.h, z1.h[3]\n"
"fmla z16.h, z25.h, z2.h[3]\n"
"fmla z20.h, z25.h, z3.h[3]\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[3]\n"
"fmla z13.h, z24.h, z1.h[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z17.h, z24.h, z2.h[3]\n"
"fmla z21.h, z24.h, z3.h[3]\n"
"ld1h { z24.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z10.h, z25.h, z0.h[3]\n"
"fmla z14.h, z25.h, z1.h[3]\n"
+ "addvl x9, x9, #1\n"
"fmla z18.h, z25.h, z2.h[3]\n"
"fmla z22.h, z25.h, z3.h[3]\n"
"fmla z11.h, z24.h, z0.h[3]\n"
@@ -1679,23 +1677,23 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 54f\n"
"ld1h { z25.h }, p5/Z, [x12]\n"
"ld1h { z24.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z25.h, z0.h[4]\n"
"fmla z12.h, z25.h, z1.h[4]\n"
"fmla z16.h, z25.h, z2.h[4]\n"
"fmla z20.h, z25.h, z3.h[4]\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[4]\n"
"fmla z13.h, z24.h, z1.h[4]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z17.h, z24.h, z2.h[4]\n"
"fmla z21.h, z24.h, z3.h[4]\n"
"ld1h { z24.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z10.h, z25.h, z0.h[4]\n"
"fmla z14.h, z25.h, z1.h[4]\n"
+ "addvl x9, x9, #1\n"
"fmla z18.h, z25.h, z2.h[4]\n"
"fmla z22.h, z25.h, z3.h[4]\n"
"fmla z11.h, z24.h, z0.h[4]\n"
@@ -1705,23 +1703,23 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 54f\n"
"ld1h { z25.h }, p5/Z, [x12]\n"
"ld1h { z24.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z25.h, z0.h[5]\n"
"fmla z12.h, z25.h, z1.h[5]\n"
"fmla z16.h, z25.h, z2.h[5]\n"
"fmla z20.h, z25.h, z3.h[5]\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[5]\n"
"fmla z13.h, z24.h, z1.h[5]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z17.h, z24.h, z2.h[5]\n"
"fmla z21.h, z24.h, z3.h[5]\n"
"ld1h { z24.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z10.h, z25.h, z0.h[5]\n"
"fmla z14.h, z25.h, z1.h[5]\n"
+ "addvl x9, x9, #1\n"
"fmla z18.h, z25.h, z2.h[5]\n"
"fmla z22.h, z25.h, z3.h[5]\n"
"fmla z11.h, z24.h, z0.h[5]\n"
@@ -1731,23 +1729,23 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 54f\n"
"ld1h { z25.h }, p5/Z, [x12]\n"
"ld1h { z24.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z25.h, z0.h[6]\n"
"fmla z12.h, z25.h, z1.h[6]\n"
"fmla z16.h, z25.h, z2.h[6]\n"
"fmla z20.h, z25.h, z3.h[6]\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[6]\n"
"fmla z13.h, z24.h, z1.h[6]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z17.h, z24.h, z2.h[6]\n"
"fmla z21.h, z24.h, z3.h[6]\n"
"ld1h { z24.h }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z10.h, z25.h, z0.h[6]\n"
"fmla z14.h, z25.h, z1.h[6]\n"
+ "addvl x9, x9, #1\n"
"fmla z18.h, z25.h, z2.h[6]\n"
"fmla z22.h, z25.h, z3.h[6]\n"
"fmla z11.h, z24.h, z0.h[6]\n"
@@ -1757,16 +1755,16 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 54f\n"
"ld1h { z25.h }, p5/Z, [x12]\n"
"ld1h { z24.h }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z25.h, z0.h[7]\n"
"fmla z12.h, z25.h, z1.h[7]\n"
"fmla z16.h, z25.h, z2.h[7]\n"
"fmla z20.h, z25.h, z3.h[7]\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z9.h, z24.h, z0.h[7]\n"
"fmla z13.h, z24.h, z1.h[7]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z17.h, z24.h, z2.h[7]\n"
"fmla z21.h, z24.h, z3.h[7]\n"
"ld1h { z24.h }, p5/Z, [x9]\n"
@@ -1785,13 +1783,13 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 55f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z25.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z25.h }, p5/Z, [x21]\n"
"ld1rh { z24.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z25.h\n"
"fmin z9.h, p5/M, z9.h, z25.h\n"
@@ -1831,18 +1829,18 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
"st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1h { z12.h }, p4, [x26]\n"
- "st1h { z13.h }, p3, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x25]\n"
- "st1h { z17.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p4, [x24]\n"
- "st1h { z21.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
"56:" // Height 4: Writeback done
"dech x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -1850,15 +1848,15 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"b 86f\n"
"57:" // Height 5
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"58:" // Height 5: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -1866,12 +1864,12 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 59f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 59f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 59f\n"
"mov x11, x12\n"
"59:" // Height 5: B setup done
@@ -1886,18 +1884,18 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"cbz x15, 60f\n"
"ld1h { z8.h }, p5/Z, [x15]\n"
"ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1909,16 +1907,16 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"60:" // Height 5: no bias
"tbz %x[flags], #0, 61f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x13]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "add x23, x13, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
"ld1h { z12.h }, p4/Z, [x23]\n"
"ld1h { z13.h }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z14.h }, p2/Z, [x23, #2, MUL VL]\n"
"ld1h { z15.h }, p1/Z, [x23, #3, MUL VL]\n"
"ld1h { z16.h }, p4/Z, [x22]\n"
@@ -1959,8 +1957,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"63:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 64f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1988,29 +1986,29 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 67f\n"
"66:" // Height 5: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z29.h }, p5/Z, [x12]\n"
- "ld1h { z28.h }, p5/Z, [x11]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z4.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z3.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"ld1rqh { z0.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
"fmla z8.h, z29.h, z4.h[0]\n"
"fmla z12.h, z29.h, z3.h[0]\n"
- "fmla z9.h, z28.h, z4.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
"fmla z16.h, z29.h, z2.h[0]\n"
"fmla z20.h, z29.h, z1.h[0]\n"
+ "add x25, x25, #0x10\n"
"fmla z24.h, z29.h, z0.h[0]\n"
- "fmla z13.h, z28.h, z3.h[0]\n"
+ "fmla z9.h, z28.h, z4.h[0]\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z28.h, z3.h[0]\n"
"fmla z17.h, z28.h, z2.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z21.h, z28.h, z1.h[0]\n"
"fmla z25.h, z28.h, z0.h[0]\n"
"ld1h { z28.h }, p5/Z, [x9]\n"
@@ -2019,8 +2017,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[0]\n"
"fmla z22.h, z29.h, z1.h[0]\n"
"fmla z26.h, z29.h, z0.h[0]\n"
- "ld1h { z29.h }, p5/Z, [x12, #1, MUL VL]\n"
"fmla z11.h, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #1, MUL VL]\n"
"fmla z15.h, z28.h, z3.h[0]\n"
"fmla z19.h, z28.h, z2.h[0]\n"
"fmla z23.h, z28.h, z1.h[0]\n"
@@ -2031,8 +2029,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[1]\n"
"fmla z20.h, z29.h, z1.h[1]\n"
"fmla z24.h, z29.h, z0.h[1]\n"
- "ld1h { z29.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[1]\n"
"fmla z17.h, z28.h, z2.h[1]\n"
"fmla z21.h, z28.h, z1.h[1]\n"
@@ -2043,8 +2041,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[1]\n"
"fmla z22.h, z29.h, z1.h[1]\n"
"fmla z26.h, z29.h, z0.h[1]\n"
- "ld1h { z29.h }, p5/Z, [x12, #2, MUL VL]\n"
"fmla z11.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #2, MUL VL]\n"
"fmla z15.h, z28.h, z3.h[1]\n"
"fmla z19.h, z28.h, z2.h[1]\n"
"fmla z23.h, z28.h, z1.h[1]\n"
@@ -2055,8 +2053,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[2]\n"
"fmla z20.h, z29.h, z1.h[2]\n"
"fmla z24.h, z29.h, z0.h[2]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[2]\n"
"fmla z17.h, z28.h, z2.h[2]\n"
"fmla z21.h, z28.h, z1.h[2]\n"
@@ -2067,8 +2065,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[2]\n"
"fmla z22.h, z29.h, z1.h[2]\n"
"fmla z26.h, z29.h, z0.h[2]\n"
- "ld1h { z29.h }, p5/Z, [x12, #3, MUL VL]\n"
"fmla z11.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #3, MUL VL]\n"
"fmla z15.h, z28.h, z3.h[2]\n"
"fmla z19.h, z28.h, z2.h[2]\n"
"fmla z23.h, z28.h, z1.h[2]\n"
@@ -2079,8 +2077,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[3]\n"
"fmla z20.h, z29.h, z1.h[3]\n"
"fmla z24.h, z29.h, z0.h[3]\n"
- "ld1h { z29.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[3]\n"
"fmla z17.h, z28.h, z2.h[3]\n"
"fmla z21.h, z28.h, z1.h[3]\n"
@@ -2091,8 +2089,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[3]\n"
"fmla z22.h, z29.h, z1.h[3]\n"
"fmla z26.h, z29.h, z0.h[3]\n"
- "ld1h { z29.h }, p5/Z, [x12, #4, MUL VL]\n"
"fmla z11.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #4, MUL VL]\n"
"fmla z15.h, z28.h, z3.h[3]\n"
"fmla z19.h, z28.h, z2.h[3]\n"
"fmla z23.h, z28.h, z1.h[3]\n"
@@ -2103,8 +2101,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[4]\n"
"fmla z20.h, z29.h, z1.h[4]\n"
"fmla z24.h, z29.h, z0.h[4]\n"
- "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[4]\n"
"fmla z17.h, z28.h, z2.h[4]\n"
"fmla z21.h, z28.h, z1.h[4]\n"
@@ -2115,8 +2113,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[4]\n"
"fmla z22.h, z29.h, z1.h[4]\n"
"fmla z26.h, z29.h, z0.h[4]\n"
- "ld1h { z29.h }, p5/Z, [x12, #5, MUL VL]\n"
"fmla z11.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #5, MUL VL]\n"
"fmla z15.h, z28.h, z3.h[4]\n"
"fmla z19.h, z28.h, z2.h[4]\n"
"fmla z23.h, z28.h, z1.h[4]\n"
@@ -2127,8 +2125,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[5]\n"
"fmla z20.h, z29.h, z1.h[5]\n"
"fmla z24.h, z29.h, z0.h[5]\n"
- "ld1h { z29.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[5]\n"
"fmla z17.h, z28.h, z2.h[5]\n"
"fmla z21.h, z28.h, z1.h[5]\n"
@@ -2139,8 +2137,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[5]\n"
"fmla z22.h, z29.h, z1.h[5]\n"
"fmla z26.h, z29.h, z0.h[5]\n"
- "ld1h { z29.h }, p5/Z, [x12, #6, MUL VL]\n"
"fmla z11.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #6, MUL VL]\n"
"fmla z15.h, z28.h, z3.h[5]\n"
"fmla z19.h, z28.h, z2.h[5]\n"
"fmla z23.h, z28.h, z1.h[5]\n"
@@ -2151,8 +2149,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[6]\n"
"fmla z20.h, z29.h, z1.h[6]\n"
"fmla z24.h, z29.h, z0.h[6]\n"
- "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[6]\n"
"fmla z17.h, z28.h, z2.h[6]\n"
"fmla z21.h, z28.h, z1.h[6]\n"
@@ -2163,30 +2161,30 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[6]\n"
"fmla z22.h, z29.h, z1.h[6]\n"
"fmla z26.h, z29.h, z0.h[6]\n"
+ "fmla z11.h, z28.h, z4.h[6]\n"
"ld1h { z29.h }, p5/Z, [x12, #7, MUL VL]\n"
"addvl x12, x12, #8\n"
- "fmla z11.h, z28.h, z4.h[6]\n"
"fmla z15.h, z28.h, z3.h[6]\n"
"fmla z19.h, z28.h, z2.h[6]\n"
"fmla z23.h, z28.h, z1.h[6]\n"
"fmla z27.h, z28.h, z0.h[6]\n"
"ld1h { z28.h }, p5/Z, [x11, #7, MUL VL]\n"
- "fmla z8.h, z29.h, z4.h[7]\n"
"addvl x11, x11, #8\n"
+ "fmla z8.h, z29.h, z4.h[7]\n"
"fmla z12.h, z29.h, z3.h[7]\n"
"fmla z16.h, z29.h, z2.h[7]\n"
"fmla z20.h, z29.h, z1.h[7]\n"
"fmla z24.h, z29.h, z0.h[7]\n"
+ "fmla z9.h, z28.h, z4.h[7]\n"
"ld1h { z29.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- "fmla z9.h, z28.h, z4.h[7]\n"
"fmla z13.h, z28.h, z3.h[7]\n"
"fmla z17.h, z28.h, z2.h[7]\n"
"fmla z21.h, z28.h, z1.h[7]\n"
"fmla z25.h, z28.h, z0.h[7]\n"
"ld1h { z28.h }, p5/Z, [x9, #7, MUL VL]\n"
- "fmla z10.h, z29.h, z4.h[7]\n"
"addvl x9, x9, #8\n"
+ "fmla z10.h, z29.h, z4.h[7]\n"
"fmla z14.h, z29.h, z3.h[7]\n"
"fmla z18.h, z29.h, z2.h[7]\n"
"fmla z22.h, z29.h, z1.h[7]\n"
@@ -2199,25 +2197,25 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"bgt 66b\n"
"67:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z29.h }, p5/Z, [x12]\n"
- "ld1h { z28.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
"fmla z8.h, z29.h, z0.h[0]\n"
"fmla z12.h, z29.h, z1.h[0]\n"
- "fmla z9.h, z28.h, z0.h[0]\n"
- "fmla z13.h, z28.h, z1.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
"fmla z16.h, z29.h, z2.h[0]\n"
"fmla z20.h, z29.h, z3.h[0]\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z29.h, z4.h[0]\n"
- "fmla z17.h, z28.h, z2.h[0]\n"
+ "fmla z9.h, z28.h, z0.h[0]\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.h, z28.h, z1.h[0]\n"
+ "fmla z17.h, z28.h, z2.h[0]\n"
"addvl x10, x10, #1\n"
"fmla z21.h, z28.h, z3.h[0]\n"
"fmla z25.h, z28.h, z4.h[0]\n"
@@ -2236,19 +2234,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 68f\n"
"ld1h { z29.h }, p5/Z, [x12]\n"
"ld1h { z28.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z29.h, z0.h[1]\n"
"fmla z12.h, z29.h, z1.h[1]\n"
"fmla z16.h, z29.h, z2.h[1]\n"
"fmla z20.h, z29.h, z3.h[1]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z29.h, z4.h[1]\n"
"fmla z9.h, z28.h, z0.h[1]\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z13.h, z28.h, z1.h[1]\n"
"fmla z17.h, z28.h, z2.h[1]\n"
+ "addvl x10, x10, #1\n"
"fmla z21.h, z28.h, z3.h[1]\n"
"fmla z25.h, z28.h, z4.h[1]\n"
"ld1h { z28.h }, p5/Z, [x9]\n"
@@ -2266,19 +2264,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 68f\n"
"ld1h { z29.h }, p5/Z, [x12]\n"
"ld1h { z28.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z29.h, z0.h[2]\n"
"fmla z12.h, z29.h, z1.h[2]\n"
"fmla z16.h, z29.h, z2.h[2]\n"
"fmla z20.h, z29.h, z3.h[2]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z29.h, z4.h[2]\n"
"fmla z9.h, z28.h, z0.h[2]\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z13.h, z28.h, z1.h[2]\n"
"fmla z17.h, z28.h, z2.h[2]\n"
+ "addvl x10, x10, #1\n"
"fmla z21.h, z28.h, z3.h[2]\n"
"fmla z25.h, z28.h, z4.h[2]\n"
"ld1h { z28.h }, p5/Z, [x9]\n"
@@ -2296,19 +2294,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 68f\n"
"ld1h { z29.h }, p5/Z, [x12]\n"
"ld1h { z28.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z29.h, z0.h[3]\n"
"fmla z12.h, z29.h, z1.h[3]\n"
"fmla z16.h, z29.h, z2.h[3]\n"
"fmla z20.h, z29.h, z3.h[3]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z29.h, z4.h[3]\n"
"fmla z9.h, z28.h, z0.h[3]\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z13.h, z28.h, z1.h[3]\n"
"fmla z17.h, z28.h, z2.h[3]\n"
+ "addvl x10, x10, #1\n"
"fmla z21.h, z28.h, z3.h[3]\n"
"fmla z25.h, z28.h, z4.h[3]\n"
"ld1h { z28.h }, p5/Z, [x9]\n"
@@ -2326,19 +2324,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 68f\n"
"ld1h { z29.h }, p5/Z, [x12]\n"
"ld1h { z28.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z29.h, z0.h[4]\n"
"fmla z12.h, z29.h, z1.h[4]\n"
"fmla z16.h, z29.h, z2.h[4]\n"
"fmla z20.h, z29.h, z3.h[4]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z29.h, z4.h[4]\n"
"fmla z9.h, z28.h, z0.h[4]\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z13.h, z28.h, z1.h[4]\n"
"fmla z17.h, z28.h, z2.h[4]\n"
+ "addvl x10, x10, #1\n"
"fmla z21.h, z28.h, z3.h[4]\n"
"fmla z25.h, z28.h, z4.h[4]\n"
"ld1h { z28.h }, p5/Z, [x9]\n"
@@ -2356,19 +2354,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 68f\n"
"ld1h { z29.h }, p5/Z, [x12]\n"
"ld1h { z28.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z29.h, z0.h[5]\n"
"fmla z12.h, z29.h, z1.h[5]\n"
"fmla z16.h, z29.h, z2.h[5]\n"
"fmla z20.h, z29.h, z3.h[5]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z29.h, z4.h[5]\n"
"fmla z9.h, z28.h, z0.h[5]\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z13.h, z28.h, z1.h[5]\n"
"fmla z17.h, z28.h, z2.h[5]\n"
+ "addvl x10, x10, #1\n"
"fmla z21.h, z28.h, z3.h[5]\n"
"fmla z25.h, z28.h, z4.h[5]\n"
"ld1h { z28.h }, p5/Z, [x9]\n"
@@ -2386,19 +2384,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 68f\n"
"ld1h { z29.h }, p5/Z, [x12]\n"
"ld1h { z28.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z29.h, z0.h[6]\n"
"fmla z12.h, z29.h, z1.h[6]\n"
"fmla z16.h, z29.h, z2.h[6]\n"
"fmla z20.h, z29.h, z3.h[6]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z29.h, z4.h[6]\n"
"fmla z9.h, z28.h, z0.h[6]\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z13.h, z28.h, z1.h[6]\n"
"fmla z17.h, z28.h, z2.h[6]\n"
+ "addvl x10, x10, #1\n"
"fmla z21.h, z28.h, z3.h[6]\n"
"fmla z25.h, z28.h, z4.h[6]\n"
"ld1h { z28.h }, p5/Z, [x9]\n"
@@ -2416,12 +2414,12 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 68f\n"
"ld1h { z29.h }, p5/Z, [x12]\n"
"ld1h { z28.h }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z29.h, z0.h[7]\n"
"fmla z12.h, z29.h, z1.h[7]\n"
"fmla z16.h, z29.h, z2.h[7]\n"
"fmla z20.h, z29.h, z3.h[7]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z24.h, z29.h, z4.h[7]\n"
"fmla z9.h, z28.h, z0.h[7]\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
@@ -2448,14 +2446,14 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"cmp x28, x20\n"
"bne 63b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 69f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z29.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z29.h }, p5/Z, [x21]\n"
"ld1rh { z28.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z29.h\n"
"fmin z9.h, p5/M, z9.h, z29.h\n"
@@ -2503,22 +2501,22 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
"st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1h { z12.h }, p4, [x26]\n"
- "st1h { z13.h }, p3, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x25]\n"
- "st1h { z17.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p4, [x24]\n"
- "st1h { z21.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x24, #3, MUL VL]\n"
- "st1h { z24.h }, p4, [x23]\n"
- "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
- "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
- "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p4, [x22]\n"
+ "st1h { z25.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x22, #3, MUL VL]\n"
"70:" // Height 5: Writeback done
"dech x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -2526,19 +2524,18 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"b 86f\n"
"71:" // Height 6
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
- "mov x21, #0xc\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0xc\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "madd x21, x20, x21, x13\n"
- "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
"72:" // Height 6: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cnth x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
"add x20, x9, x20, LSL #1\n"
@@ -2546,12 +2543,12 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 73f\n"
"dech x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 73f\n"
"dech x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 73f\n"
"mov x11, x12\n"
"73:" // Height 6: B setup done
@@ -2566,18 +2563,18 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"cbz x15, 74f\n"
"ld1h { z8.h }, p5/Z, [x15]\n"
"ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -2593,17 +2590,17 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"74:" // Height 6: no bias
"tbz %x[flags], #0, 75f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x13, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x13]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "add x24, x13, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
"ld1h { z12.h }, p4/Z, [x24]\n"
"ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
"ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
"ld1h { z16.h }, p4/Z, [x23]\n"
@@ -2652,8 +2649,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"77:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 78f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2684,29 +2681,29 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 81f\n"
"80:" // Height 6: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z1.h }, p5/Z, [x12]\n"
- "ld1h { z0.h }, p5/Z, [x11]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z7.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z6.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z5.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqh { z4.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
"ld1rqh { z2.h }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1h { z1.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x11]\n"
"fmla z8.h, z1.h, z7.h[0]\n"
"fmla z12.h, z1.h, z6.h[0]\n"
- "add x21, x21, #0x10\n"
"fmla z16.h, z1.h, z5.h[0]\n"
"fmla z20.h, z1.h, z4.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z24.h, z1.h, z3.h[0]\n"
"fmla z28.h, z1.h, z2.h[0]\n"
"ld1h { z1.h }, p5/Z, [x10]\n"
+ "add x21, x21, #0x10\n"
"fmla z9.h, z0.h, z7.h[0]\n"
"fmla z13.h, z0.h, z6.h[0]\n"
"fmla z17.h, z0.h, z5.h[0]\n"
@@ -2929,27 +2926,27 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"bgt 80b\n"
"81:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
"ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
"fmla z8.h, z7.h, z0.h[0]\n"
"fmla z12.h, z7.h, z1.h[0]\n"
- "fmla z9.h, z6.h, z0.h[0]\n"
- "fmla z13.h, z6.h, z1.h[0]\n"
"fmla z16.h, z7.h, z2.h[0]\n"
"fmla z20.h, z7.h, z3.h[0]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z24.h, z7.h, z4.h[0]\n"
"fmla z28.h, z7.h, z5.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
"addvl x10, x10, #1\n"
+ "fmla z9.h, z6.h, z0.h[0]\n"
+ "fmla z13.h, z6.h, z1.h[0]\n"
"fmla z17.h, z6.h, z2.h[0]\n"
"fmla z21.h, z6.h, z3.h[0]\n"
"fmla z25.h, z6.h, z4.h[0]\n"
@@ -2971,19 +2968,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 82f\n"
"ld1h { z7.h }, p5/Z, [x12]\n"
"ld1h { z6.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z7.h, z0.h[1]\n"
"fmla z12.h, z7.h, z1.h[1]\n"
"fmla z16.h, z7.h, z2.h[1]\n"
"fmla z20.h, z7.h, z3.h[1]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z7.h, z4.h[1]\n"
"fmla z28.h, z7.h, z5.h[1]\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z9.h, z6.h, z0.h[1]\n"
"fmla z13.h, z6.h, z1.h[1]\n"
+ "addvl x10, x10, #1\n"
"fmla z17.h, z6.h, z2.h[1]\n"
"fmla z21.h, z6.h, z3.h[1]\n"
"fmla z25.h, z6.h, z4.h[1]\n"
@@ -3005,19 +3002,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 82f\n"
"ld1h { z7.h }, p5/Z, [x12]\n"
"ld1h { z6.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z7.h, z0.h[2]\n"
"fmla z12.h, z7.h, z1.h[2]\n"
"fmla z16.h, z7.h, z2.h[2]\n"
"fmla z20.h, z7.h, z3.h[2]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z7.h, z4.h[2]\n"
"fmla z28.h, z7.h, z5.h[2]\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z9.h, z6.h, z0.h[2]\n"
"fmla z13.h, z6.h, z1.h[2]\n"
+ "addvl x10, x10, #1\n"
"fmla z17.h, z6.h, z2.h[2]\n"
"fmla z21.h, z6.h, z3.h[2]\n"
"fmla z25.h, z6.h, z4.h[2]\n"
@@ -3039,19 +3036,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 82f\n"
"ld1h { z7.h }, p5/Z, [x12]\n"
"ld1h { z6.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z7.h, z0.h[3]\n"
"fmla z12.h, z7.h, z1.h[3]\n"
"fmla z16.h, z7.h, z2.h[3]\n"
"fmla z20.h, z7.h, z3.h[3]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z7.h, z4.h[3]\n"
"fmla z28.h, z7.h, z5.h[3]\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z9.h, z6.h, z0.h[3]\n"
"fmla z13.h, z6.h, z1.h[3]\n"
+ "addvl x10, x10, #1\n"
"fmla z17.h, z6.h, z2.h[3]\n"
"fmla z21.h, z6.h, z3.h[3]\n"
"fmla z25.h, z6.h, z4.h[3]\n"
@@ -3073,19 +3070,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 82f\n"
"ld1h { z7.h }, p5/Z, [x12]\n"
"ld1h { z6.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z7.h, z0.h[4]\n"
"fmla z12.h, z7.h, z1.h[4]\n"
"fmla z16.h, z7.h, z2.h[4]\n"
"fmla z20.h, z7.h, z3.h[4]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z7.h, z4.h[4]\n"
"fmla z28.h, z7.h, z5.h[4]\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z9.h, z6.h, z0.h[4]\n"
"fmla z13.h, z6.h, z1.h[4]\n"
+ "addvl x10, x10, #1\n"
"fmla z17.h, z6.h, z2.h[4]\n"
"fmla z21.h, z6.h, z3.h[4]\n"
"fmla z25.h, z6.h, z4.h[4]\n"
@@ -3107,19 +3104,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 82f\n"
"ld1h { z7.h }, p5/Z, [x12]\n"
"ld1h { z6.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z7.h, z0.h[5]\n"
"fmla z12.h, z7.h, z1.h[5]\n"
"fmla z16.h, z7.h, z2.h[5]\n"
"fmla z20.h, z7.h, z3.h[5]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z7.h, z4.h[5]\n"
"fmla z28.h, z7.h, z5.h[5]\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z9.h, z6.h, z0.h[5]\n"
"fmla z13.h, z6.h, z1.h[5]\n"
+ "addvl x10, x10, #1\n"
"fmla z17.h, z6.h, z2.h[5]\n"
"fmla z21.h, z6.h, z3.h[5]\n"
"fmla z25.h, z6.h, z4.h[5]\n"
@@ -3141,19 +3138,19 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 82f\n"
"ld1h { z7.h }, p5/Z, [x12]\n"
"ld1h { z6.h }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z7.h, z0.h[6]\n"
"fmla z12.h, z7.h, z1.h[6]\n"
"fmla z16.h, z7.h, z2.h[6]\n"
"fmla z20.h, z7.h, z3.h[6]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.h, z7.h, z4.h[6]\n"
"fmla z28.h, z7.h, z5.h[6]\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z9.h, z6.h, z0.h[6]\n"
"fmla z13.h, z6.h, z1.h[6]\n"
+ "addvl x10, x10, #1\n"
"fmla z17.h, z6.h, z2.h[6]\n"
"fmla z21.h, z6.h, z3.h[6]\n"
"fmla z25.h, z6.h, z4.h[6]\n"
@@ -3175,12 +3172,12 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ble 82f\n"
"ld1h { z7.h }, p5/Z, [x12]\n"
"ld1h { z6.h }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.h, z7.h, z0.h[7]\n"
"fmla z12.h, z7.h, z1.h[7]\n"
"fmla z16.h, z7.h, z2.h[7]\n"
"fmla z20.h, z7.h, z3.h[7]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z24.h, z7.h, z4.h[7]\n"
"fmla z28.h, z7.h, z5.h[7]\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
@@ -3211,15 +3208,15 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"cmp x28, x20\n"
"bne 77b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"tbz %x[flags], #1, 83f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z1.h }, p5/Z, [x21]\n"
"ld1rh { z0.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z1.h\n"
"fmin z9.h, p5/M, z9.h, z1.h\n"
@@ -3275,26 +3272,26 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
"st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1h { z12.h }, p4, [x26]\n"
- "st1h { z13.h }, p3, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x25]\n"
- "st1h { z17.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p4, [x24]\n"
- "st1h { z21.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x24, #3, MUL VL]\n"
- "st1h { z24.h }, p4, [x23]\n"
- "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
- "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
- "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
- "st1h { z28.h }, p4, [x22]\n"
- "st1h { z29.h }, p3, [x22, #1, MUL VL]\n"
- "st1h { z30.h }, p2, [x22, #2, MUL VL]\n"
- "st1h { z31.h }, p1, [x22, #3, MUL VL]\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p4, [x22]\n"
+ "st1h { z25.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x22, #3, MUL VL]\n"
+ "st1h { z28.h }, p4, [x21]\n"
+ "st1h { z29.h }, p3, [x21, #1, MUL VL]\n"
+ "st1h { z30.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z31.h }, p1, [x21, #3, MUL VL]\n"
"84:" // Height 6: Writeback done
"dech x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -3311,8 +3308,8 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"86:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
index 847103646c..ffa365b8a0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
@@ -82,7 +82,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 1> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
index 51e2b3722a..53cd52fe56 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
@@ -49,19 +49,18 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -82,7 +81,6 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
ka.B_stride = B_stride;
switch(act.type) {
default:
@@ -108,15 +106,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"bgt 27f\n"
"beq 14f\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -124,12 +122,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 3f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 3f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 3f\n"
"mov x11, x12\n"
"3:" // Height 1: B setup done
@@ -164,8 +162,8 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -183,41 +181,41 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"ld1w { z7.s }, p4/Z, [x11]\n"
"ble 11f\n"
"10:" // Height 1: Multiply loop: Main loop
- "addvl x12, x12, #1\n"
"fmla z8.s, p4/M, z6.s, z0.s\n"
- "ld1w { z17.s }, p4/Z, [x10]\n"
- "addvl x11, x11, #1\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z17.s }, p4/Z, [x10]\n"
"ld1w { z16.s }, p4/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
"add x26, x26, #0x4\n"
"subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, p4/M, z17.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x12]\n"
- "fmla z11.s, p4/M, z16.s, z0.s\n"
- "ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1w { z7.s }, p4/Z, [x11]\n"
"bgt 10b\n"
"11:" // Height 1: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.s, p4/M, z6.s, z0.s\n"
- "ld1w { z17.s }, p4/Z, [x10]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z17.s }, p4/Z, [x10]\n"
"ld1w { z16.s }, p4/Z, [x9]\n"
"add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "cmp x28, x20\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, p4/M, z17.s, z0.s\n"
- "fmla z11.s, p4/M, z16.s, z0.s\n"
"bne 7b\n"
"tbz %x[flags], #1, 12f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p4/Z, [x21]\n"
"ld1rw { z16.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z17.s\n"
"fmin z9.s, p4/M, z9.s, z17.s\n"
@@ -240,15 +238,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"b 80f\n"
"14:" // Height 2
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"15:" // Height 2: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -256,12 +254,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 16f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 16f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 16f\n"
"mov x11, x12\n"
"16:" // Height 2: B setup done
@@ -276,22 +274,22 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"cbz x15, 17f\n"
"ld1w { z8.s }, p4/Z, [x15]\n"
"ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x15, x15, #4\n"
"b 19f\n"
"17:" // Height 2: no bias
"tbz %x[flags], #0, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x13, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x13]\n"
"ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
- "add x20, x13, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x20]\n"
"ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
@@ -310,8 +308,8 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"20:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 21f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -333,26 +331,26 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"ld1w { z7.s }, p4/Z, [x11]\n"
"ble 24f\n"
"23:" // Height 2: Multiply loop: Main loop
- "addvl x12, x12, #1\n"
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z12.s, p4/M, z6.s, z1.s\n"
"ld1w { z17.s }, p4/Z, [x10]\n"
- "addvl x11, x11, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"ld1w { z16.s }, p4/Z, [x9]\n"
+ "addvl x11, x11, #1\n"
"add x26, x26, #0x4\n"
"subs x27, x27, #0x1\n"
- "add x25, x25, #0x4\n"
- "addvl x10, x10, #1\n"
"fmla z10.s, p4/M, z17.s, z0.s\n"
"fmla z14.s, p4/M, z17.s, z1.s\n"
- "addvl x9, x9, #1\n"
- "ld1w { z6.s }, p4/Z, [x12]\n"
+ "add x25, x25, #0x4\n"
"fmla z11.s, p4/M, z16.s, z0.s\n"
- "ld1rw { z0.s }, p4/Z, [x26]\n"
"fmla z15.s, p4/M, z16.s, z1.s\n"
+ "addvl x10, x10, #1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
+ "addvl x9, x9, #1\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
"ld1w { z7.s }, p4/Z, [x11]\n"
"bgt 23b\n"
"24:" // Height 2: Multiply loop: Main loop skip
@@ -364,22 +362,22 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"fmla z13.s, p4/M, z7.s, z1.s\n"
"ld1w { z16.s }, p4/Z, [x9]\n"
"add x28, x28, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"cmp x28, x20\n"
- "addvl x10, x10, #1\n"
"fmla z10.s, p4/M, z17.s, z0.s\n"
"fmla z14.s, p4/M, z17.s, z1.s\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z11.s, p4/M, z16.s, z0.s\n"
"fmla z15.s, p4/M, z16.s, z1.s\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"bne 20b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p4/Z, [x21]\n"
"ld1rw { z16.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z17.s\n"
"fmin z9.s, p4/M, z9.s, z17.s\n"
@@ -403,10 +401,10 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z12.s }, p3, [x26]\n"
- "st1w { z13.s }, p2, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x26, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
"26:" // Height 2: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -414,15 +412,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"b 80f\n"
"27:" // Height 3
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"28:" // Height 3: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -430,12 +428,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 29f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 29f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 29f\n"
"mov x11, x12\n"
"29:" // Height 3: B setup done
@@ -450,27 +448,27 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"cbz x15, 30f\n"
"ld1w { z8.s }, p4/Z, [x15]\n"
"ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 32f\n"
"30:" // Height 3: no bias
"tbz %x[flags], #0, 31f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x13, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x13]\n"
"ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
- "add x21, x13, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x21]\n"
"ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
@@ -497,8 +495,8 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"33:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 34f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -524,13 +522,13 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"ld1w { z7.s }, p4/Z, [x11]\n"
"ble 37f\n"
"36:" // Height 3: Multiply loop: Main loop
- "addvl x12, x12, #1\n"
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z12.s, p4/M, z6.s, z1.s\n"
+ "addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
- "ld1w { z21.s }, p4/Z, [x10]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z21.s }, p4/Z, [x10]\n"
"add x26, x26, #0x4\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
@@ -538,18 +536,18 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"subs x27, x27, #0x1\n"
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
- "addvl x10, x10, #1\n"
- "addvl x9, x9, #1\n"
"fmla z10.s, p4/M, z21.s, z0.s\n"
"fmla z14.s, p4/M, z21.s, z1.s\n"
"fmla z18.s, p4/M, z21.s, z2.s\n"
- "ld1w { z6.s }, p4/Z, [x12]\n"
"fmla z11.s, p4/M, z20.s, z0.s\n"
- "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"fmla z15.s, p4/M, z20.s, z1.s\n"
"fmla z19.s, p4/M, z20.s, z2.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
"ld1w { z7.s }, p4/Z, [x11]\n"
"bgt 36b\n"
"37:" // Height 3: Multiply loop: Main loop skip
@@ -558,30 +556,30 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"fmla z12.s, p4/M, z6.s, z1.s\n"
"add x28, x28, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
- "ld1w { z21.s }, p4/Z, [x10]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "addvl x12, x12, #1\n"
+ "ld1w { z21.s }, p4/Z, [x10]\n"
+ "cmp x28, x20\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"ld1w { z20.s }, p4/Z, [x9]\n"
- "addvl x11, x11, #1\n"
- "cmp x28, x20\n"
- "addvl x10, x10, #1\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.s, p4/M, z21.s, z0.s\n"
"fmla z14.s, p4/M, z21.s, z1.s\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.s, p4/M, z21.s, z2.s\n"
"fmla z11.s, p4/M, z20.s, z0.s\n"
+ "addvl x9, x9, #1\n"
"fmla z15.s, p4/M, z20.s, z1.s\n"
"fmla z19.s, p4/M, z20.s, z2.s\n"
"bne 33b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z21.s }, p4/Z, [x21]\n"
"ld1rw { z20.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z21.s\n"
"fmin z9.s, p4/M, z9.s, z21.s\n"
@@ -613,14 +611,14 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z12.s }, p3, [x26]\n"
- "st1w { z13.s }, p2, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x25]\n"
- "st1w { z17.s }, p2, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
"39:" // Height 3: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -628,15 +626,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"b 80f\n"
"40:" // Height 4
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"41:" // Height 4: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -644,12 +642,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 42f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 42f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 42f\n"
"mov x11, x12\n"
"42:" // Height 4: B setup done
@@ -664,18 +662,18 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"cbz x15, 43f\n"
"ld1w { z8.s }, p4/Z, [x15]\n"
"ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -683,13 +681,13 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"43:" // Height 4: no bias
"tbz %x[flags], #0, 44f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x13, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x13]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
- "add x22, x13, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x22]\n"
"ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
@@ -724,8 +722,8 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -755,9 +753,9 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"ld1w { z7.s }, p4/Z, [x11]\n"
"ble 50f\n"
"49:" // Height 4: Multiply loop: Main loop
- "addvl x12, x12, #1\n"
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z12.s, p4/M, z6.s, z1.s\n"
+ "addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
@@ -775,9 +773,9 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"fmla z10.s, p4/M, z25.s, z0.s\n"
"fmla z14.s, p4/M, z25.s, z1.s\n"
"addvl x10, x10, #1\n"
- "addvl x9, x9, #1\n"
"fmla z18.s, p4/M, z25.s, z2.s\n"
"fmla z22.s, p4/M, z25.s, z3.s\n"
+ "addvl x9, x9, #1\n"
"ld1w { z6.s }, p4/Z, [x12]\n"
"fmla z11.s, p4/M, z24.s, z0.s\n"
"fmla z15.s, p4/M, z24.s, z1.s\n"
@@ -797,18 +795,18 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
"ld1w { z25.s }, p4/Z, [x10]\n"
- "addvl x12, x12, #1\n"
+ "cmp x28, x20\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
+ "addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "addvl x10, x10, #1\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
"ld1w { z24.s }, p4/Z, [x9]\n"
- "cmp x28, x20\n"
- "addvl x9, x9, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z10.s, p4/M, z25.s, z0.s\n"
"fmla z14.s, p4/M, z25.s, z1.s\n"
+ "addvl x9, x9, #1\n"
"fmla z18.s, p4/M, z25.s, z2.s\n"
"fmla z22.s, p4/M, z25.s, z3.s\n"
"fmla z11.s, p4/M, z24.s, z0.s\n"
@@ -817,13 +815,13 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"fmla z23.s, p4/M, z24.s, z3.s\n"
"bne 46b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 51f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z25.s }, p4/Z, [x21]\n"
"ld1rw { z24.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z25.s\n"
"fmin z9.s, p4/M, z9.s, z25.s\n"
@@ -863,18 +861,18 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z12.s }, p3, [x26]\n"
- "st1w { z13.s }, p2, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x25]\n"
- "st1w { z17.s }, p2, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p3, [x24]\n"
- "st1w { z21.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
"52:" // Height 4: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -882,15 +880,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"b 80f\n"
"53:" // Height 5
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"54:" // Height 5: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -898,12 +896,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 55f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 55f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 55f\n"
"mov x11, x12\n"
"55:" // Height 5: B setup done
@@ -918,18 +916,18 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"cbz x15, 56f\n"
"ld1w { z8.s }, p4/Z, [x15]\n"
"ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -941,16 +939,16 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"56:" // Height 5: no bias
"tbz %x[flags], #0, 57f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x13]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
- "add x23, x13, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x23]\n"
"ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
"ld1w { z16.s }, p3/Z, [x22]\n"
@@ -991,8 +989,8 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"59:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 60f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1035,8 +1033,8 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"add x26, x26, #0x4\n"
"subs x27, x27, #0x1\n"
"fmla z24.s, p4/M, z6.s, z4.s\n"
- "ld1w { z29.s }, p4/Z, [x10]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z29.s }, p4/Z, [x10]\n"
"add x25, x25, #0x4\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
@@ -1046,23 +1044,23 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"fmla z25.s, p4/M, z7.s, z4.s\n"
"ld1w { z28.s }, p4/Z, [x9]\n"
"add x22, x22, #0x4\n"
- "addvl x10, x10, #1\n"
- "addvl x9, x9, #1\n"
"fmla z10.s, p4/M, z29.s, z0.s\n"
"fmla z14.s, p4/M, z29.s, z1.s\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"fmla z18.s, p4/M, z29.s, z2.s\n"
"fmla z22.s, p4/M, z29.s, z3.s\n"
"fmla z26.s, p4/M, z29.s, z4.s\n"
- "ld1w { z6.s }, p4/Z, [x12]\n"
"fmla z11.s, p4/M, z28.s, z0.s\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
"fmla z15.s, p4/M, z28.s, z1.s\n"
- "ld1rw { z1.s }, p4/Z, [x25]\n"
"fmla z19.s, p4/M, z28.s, z2.s\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"fmla z23.s, p4/M, z28.s, z3.s\n"
- "ld1rw { z3.s }, p4/Z, [x23]\n"
"fmla z27.s, p4/M, z28.s, z4.s\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1rw { z4.s }, p4/Z, [x22]\n"
"ld1w { z7.s }, p4/Z, [x11]\n"
"bgt 62b\n"
@@ -1073,15 +1071,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
+ "cmp x28, x20\n"
"addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z24.s, p4/M, z6.s, z4.s\n"
- "ld1w { z29.s }, p4/Z, [x10]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "addvl x10, x10, #1\n"
+ "ld1w { z29.s }, p4/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
- "cmp x28, x20\n"
+ "addvl x10, x10, #1\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
"fmla z25.s, p4/M, z7.s, z4.s\n"
"ld1w { z28.s }, p4/Z, [x9]\n"
@@ -1098,14 +1096,14 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"fmla z27.s, p4/M, z28.s, z4.s\n"
"bne 59b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 64f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z29.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z29.s }, p4/Z, [x21]\n"
"ld1rw { z28.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z29.s\n"
"fmin z9.s, p4/M, z9.s, z29.s\n"
@@ -1153,22 +1151,22 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z12.s }, p3, [x26]\n"
- "st1w { z13.s }, p2, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x25]\n"
- "st1w { z17.s }, p2, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p3, [x24]\n"
- "st1w { z21.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p0, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p3, [x23]\n"
- "st1w { z25.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x22]\n"
+ "st1w { z25.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x22, #3, MUL VL]\n"
"65:" // Height 5: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -1176,19 +1174,18 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"b 80f\n"
"66:" // Height 6
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
- "mov x21, #0x18\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x18\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "madd x21, x20, x21, x13\n"
- "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
"67:" // Height 6: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -1196,12 +1193,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 68f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 68f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 68f\n"
"mov x11, x12\n"
"68:" // Height 6: B setup done
@@ -1216,18 +1213,18 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"cbz x15, 69f\n"
"ld1w { z8.s }, p4/Z, [x15]\n"
"ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1243,17 +1240,17 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"69:" // Height 6: no bias
"tbz %x[flags], #0, 70f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x13, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x13]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
- "add x24, x13, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x24]\n"
"ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p3/Z, [x23]\n"
@@ -1302,8 +1299,8 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"72:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1365,9 +1362,9 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"fmla z29.s, p4/M, z7.s, z5.s\n"
"ld1w { z7.s }, p4/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "addvl x9, x9, #1\n"
"fmla z10.s, p4/M, z6.s, z0.s\n"
"fmla z14.s, p4/M, z6.s, z1.s\n"
+ "addvl x9, x9, #1\n"
"fmla z18.s, p4/M, z6.s, z2.s\n"
"fmla z22.s, p4/M, z6.s, z3.s\n"
"fmla z26.s, p4/M, z6.s, z4.s\n"
@@ -1394,15 +1391,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
+ "cmp x28, x20\n"
"addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z24.s, p4/M, z6.s, z4.s\n"
"fmla z28.s, p4/M, z6.s, z5.s\n"
"ld1w { z6.s }, p4/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
- "cmp x28, x20\n"
+ "addvl x10, x10, #1\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
"fmla z25.s, p4/M, z7.s, z4.s\n"
@@ -1423,15 +1420,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"fmla z31.s, p4/M, z7.s, z5.s\n"
"bne 72b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 77f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p4/Z, [x21]\n"
"ld1rw { z0.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z1.s\n"
"fmin z9.s, p4/M, z9.s, z1.s\n"
@@ -1487,26 +1484,26 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z12.s }, p3, [x26]\n"
- "st1w { z13.s }, p2, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x25]\n"
- "st1w { z17.s }, p2, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p3, [x24]\n"
- "st1w { z21.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p0, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p3, [x23]\n"
- "st1w { z25.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p0, [x23, #3, MUL VL]\n"
- "st1w { z28.s }, p3, [x22]\n"
- "st1w { z29.s }, p2, [x22, #1, MUL VL]\n"
- "st1w { z30.s }, p1, [x22, #2, MUL VL]\n"
- "st1w { z31.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x22]\n"
+ "st1w { z25.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z28.s }, p3, [x21]\n"
+ "st1w { z29.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p0, [x21, #3, MUL VL]\n"
"78:" // Height 6: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -1523,8 +1520,8 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
index 299dec5b3c..8f12e9ee62 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
@@ -49,19 +49,18 @@ void sve_ffhybrid_fp32_mla_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -82,7 +81,6 @@ void sve_ffhybrid_fp32_mla_6x4VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
ka.B_stride = B_stride;
switch(act.type) {
default:
@@ -108,15 +106,15 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"bgt 29f\n"
"beq 15f\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -124,12 +122,12 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 3f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 3f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 3f\n"
"mov x11, x12\n"
"3:" // Height 1: B setup done
@@ -164,8 +162,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -181,113 +179,113 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 11f\n"
"10:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z17.s }, p5/Z, [x12]\n"
- "ld1w { z16.s }, p5/Z, [x11]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "fmla z8.s, z17.s, z0.s[0]\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
"fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10]\n"
+ "fmla z10.s, z16.s, z0.s[0]\n"
"ld1w { z16.s }, p5/Z, [x9]\n"
- "fmla z10.s, z17.s, z0.s[0]\n"
- "ld1w { z17.s }, p5/Z, [x12, #1, MUL VL]\n"
"fmla z11.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[1]\n"
"ld1w { z16.s }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z8.s, z17.s, z0.s[1]\n"
- "ld1w { z17.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[1]\n"
"ld1w { z16.s }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.s, z17.s, z0.s[1]\n"
- "ld1w { z17.s }, p5/Z, [x12, #2, MUL VL]\n"
"fmla z11.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[2]\n"
"ld1w { z16.s }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.s, z17.s, z0.s[2]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[2]\n"
"ld1w { z16.s }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.s, z17.s, z0.s[2]\n"
- "ld1w { z17.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"fmla z11.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[3]\n"
"ld1w { z16.s }, p5/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "fmla z8.s, z17.s, z0.s[3]\n"
- "ld1w { z17.s }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
"ld1w { z16.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
+ "cmp x27, #0x4\n"
"fmla z10.s, z17.s, z0.s[3]\n"
"fmla z11.s, z16.s, z0.s[3]\n"
+ "add x26, x26, #0x10\n"
+ "addvl x12, x12, #4\n"
+ "addvl x11, x11, #4\n"
+ "addvl x10, x10, #4\n"
+ "addvl x9, x9, #4\n"
"bgt 10b\n"
"11:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1w { z16.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
"ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z0.s[0]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "fmla z8.s, z17.s, z0.s[0]\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
"addvl x10, x10, #1\n"
- "fmla z9.s, z16.s, z0.s[0]\n"
- "ld1w { z16.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z17.s, z0.s[0]\n"
- "fmla z11.s, z16.s, z0.s[0]\n"
"ble 12f\n"
"ld1w { z17.s }, p5/Z, [x12]\n"
"ld1w { z16.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z17.s, z0.s[1]\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.s, z17.s, z0.s[1]\n"
"fmla z11.s, z16.s, z0.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 12f\n"
"ld1w { z17.s }, p5/Z, [x12]\n"
"ld1w { z16.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z17.s, z0.s[2]\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.s, z17.s, z0.s[2]\n"
"fmla z11.s, z16.s, z0.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 12f\n"
"ld1w { z17.s }, p5/Z, [x12]\n"
"ld1w { z16.s }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z17.s, z0.s[3]\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
"fmla z10.s, z17.s, z0.s[3]\n"
"fmla z11.s, z16.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"12:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 7b\n"
"tbz %x[flags], #1, 13f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z17.s\n"
"fmin z9.s, p5/M, z9.s, z17.s\n"
@@ -310,15 +308,15 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"b 86f\n"
"15:" // Height 2
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"16:" // Height 2: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -326,12 +324,12 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 17f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 17f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 17f\n"
"mov x11, x12\n"
"17:" // Height 2: B setup done
@@ -346,22 +344,22 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"cbz x15, 18f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x15, x15, #4\n"
"b 20f\n"
"18:" // Height 2: no bias
"tbz %x[flags], #0, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x13, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x13]\n"
"ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "add x20, x13, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x20]\n"
"ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
@@ -380,8 +378,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"21:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 22f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -400,29 +398,29 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 25f\n"
"24:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z17.s }, p5/Z, [x12]\n"
- "ld1w { z16.s }, p5/Z, [x11]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqw { z0.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x4\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
"fmla z8.s, z17.s, z1.s[0]\n"
"fmla z12.s, z17.s, z0.s[0]\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
"fmla z9.s, z16.s, z1.s[0]\n"
"fmla z13.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x9]\n"
"fmla z10.s, z17.s, z1.s[0]\n"
"fmla z14.s, z17.s, z0.s[0]\n"
"ld1w { z17.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "cmp x27, #0x4\n"
"fmla z11.s, z16.s, z1.s[0]\n"
"fmla z15.s, z16.s, z0.s[0]\n"
"ld1w { z16.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "add x26, x26, #0x10\n"
"fmla z8.s, z17.s, z1.s[1]\n"
"fmla z12.s, z17.s, z0.s[1]\n"
"ld1w { z17.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z16.s, z1.s[1]\n"
"fmla z13.s, z16.s, z0.s[1]\n"
"ld1w { z16.s }, p5/Z, [x9, #1, MUL VL]\n"
@@ -461,89 +459,89 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"bgt 24b\n"
"25:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z17.s }, p5/Z, [x12]\n"
- "ld1w { z16.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
"fmla z8.s, z17.s, z0.s[0]\n"
"fmla z12.s, z17.s, z1.s[0]\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.s, z16.s, z0.s[0]\n"
"fmla z13.s, z16.s, z1.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
"fmla z10.s, z17.s, z0.s[0]\n"
"fmla z14.s, z17.s, z1.s[0]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z11.s, z16.s, z0.s[0]\n"
"fmla z15.s, z16.s, z1.s[0]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 26f\n"
"ld1w { z17.s }, p5/Z, [x12]\n"
"ld1w { z16.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z17.s, z0.s[1]\n"
"fmla z12.s, z17.s, z1.s[1]\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.s, z16.s, z0.s[1]\n"
"fmla z13.s, z16.s, z1.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.s, z17.s, z0.s[1]\n"
"fmla z14.s, z17.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
"fmla z11.s, z16.s, z0.s[1]\n"
"fmla z15.s, z16.s, z1.s[1]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 26f\n"
"ld1w { z17.s }, p5/Z, [x12]\n"
"ld1w { z16.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z17.s, z0.s[2]\n"
"fmla z12.s, z17.s, z1.s[2]\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.s, z16.s, z0.s[2]\n"
"fmla z13.s, z16.s, z1.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.s, z17.s, z0.s[2]\n"
"fmla z14.s, z17.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
"fmla z11.s, z16.s, z0.s[2]\n"
"fmla z15.s, z16.s, z1.s[2]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"ble 26f\n"
"ld1w { z17.s }, p5/Z, [x12]\n"
"ld1w { z16.s }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z17.s, z0.s[3]\n"
"fmla z12.s, z17.s, z1.s[3]\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.s, z16.s, z0.s[3]\n"
"fmla z13.s, z16.s, z1.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
"fmla z10.s, z17.s, z0.s[3]\n"
"fmla z14.s, z17.s, z1.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z11.s, z16.s, z0.s[3]\n"
"fmla z15.s, z16.s, z1.s[3]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"26:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 21b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"tbz %x[flags], #1, 27f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z17.s\n"
"fmin z9.s, p5/M, z9.s, z17.s\n"
@@ -567,10 +565,10 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
"28:" // Height 2: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -578,15 +576,15 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"b 86f\n"
"29:" // Height 3
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"30:" // Height 3: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -594,12 +592,12 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 31f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 31f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 31f\n"
"mov x11, x12\n"
"31:" // Height 3: B setup done
@@ -614,27 +612,27 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"cbz x15, 32f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 34f\n"
"32:" // Height 3: no bias
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x13, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x13]\n"
"ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "add x21, x13, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x21]\n"
"ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
@@ -661,8 +659,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"35:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -684,62 +682,62 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 39f\n"
"38:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z21.s }, p5/Z, [x12]\n"
- "ld1w { z20.s }, p5/Z, [x11]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x4\n"
"ld1rqw { z0.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
"fmla z8.s, z21.s, z2.s[0]\n"
"fmla z12.s, z21.s, z1.s[0]\n"
- "fmla z9.s, z20.s, z2.s[0]\n"
- "fmla z13.s, z20.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
"fmla z16.s, z21.s, z0.s[0]\n"
+ "fmla z9.s, z20.s, z2.s[0]\n"
"ld1w { z21.s }, p5/Z, [x10]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
"fmla z17.s, z20.s, z0.s[0]\n"
"ld1w { z20.s }, p5/Z, [x9]\n"
+ "cmp x27, #0x4\n"
"fmla z10.s, z21.s, z2.s[0]\n"
"fmla z14.s, z21.s, z1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z18.s, z21.s, z0.s[0]\n"
- "ld1w { z21.s }, p5/Z, [x12, #1, MUL VL]\n"
"fmla z11.s, z20.s, z2.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "add x24, x24, #0x10\n"
"fmla z15.s, z20.s, z1.s[0]\n"
"fmla z19.s, z20.s, z0.s[0]\n"
"ld1w { z20.s }, p5/Z, [x11, #1, MUL VL]\n"
"fmla z8.s, z21.s, z2.s[1]\n"
"fmla z12.s, z21.s, z1.s[1]\n"
"fmla z16.s, z21.s, z0.s[1]\n"
- "ld1w { z21.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z9.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z13.s, z20.s, z1.s[1]\n"
"fmla z17.s, z20.s, z0.s[1]\n"
"ld1w { z20.s }, p5/Z, [x9, #1, MUL VL]\n"
"fmla z10.s, z21.s, z2.s[1]\n"
"fmla z14.s, z21.s, z1.s[1]\n"
"fmla z18.s, z21.s, z0.s[1]\n"
- "ld1w { z21.s }, p5/Z, [x12, #2, MUL VL]\n"
"fmla z11.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x12, #2, MUL VL]\n"
"fmla z15.s, z20.s, z1.s[1]\n"
"fmla z19.s, z20.s, z0.s[1]\n"
"ld1w { z20.s }, p5/Z, [x11, #2, MUL VL]\n"
"fmla z8.s, z21.s, z2.s[2]\n"
"fmla z12.s, z21.s, z1.s[2]\n"
"fmla z16.s, z21.s, z0.s[2]\n"
- "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.s, z20.s, z1.s[2]\n"
"fmla z17.s, z20.s, z0.s[2]\n"
"ld1w { z20.s }, p5/Z, [x9, #2, MUL VL]\n"
"fmla z10.s, z21.s, z2.s[2]\n"
"fmla z14.s, z21.s, z1.s[2]\n"
"fmla z18.s, z21.s, z0.s[2]\n"
+ "fmla z11.s, z20.s, z2.s[2]\n"
"ld1w { z21.s }, p5/Z, [x12, #3, MUL VL]\n"
"addvl x12, x12, #4\n"
- "fmla z11.s, z20.s, z2.s[2]\n"
"fmla z15.s, z20.s, z1.s[2]\n"
"fmla z19.s, z20.s, z0.s[2]\n"
"ld1w { z20.s }, p5/Z, [x11, #3, MUL VL]\n"
@@ -747,9 +745,9 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"fmla z8.s, z21.s, z2.s[3]\n"
"fmla z12.s, z21.s, z1.s[3]\n"
"fmla z16.s, z21.s, z0.s[3]\n"
+ "fmla z9.s, z20.s, z2.s[3]\n"
"ld1w { z21.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z9.s, z20.s, z2.s[3]\n"
"fmla z13.s, z20.s, z1.s[3]\n"
"fmla z17.s, z20.s, z0.s[3]\n"
"ld1w { z20.s }, p5/Z, [x9, #3, MUL VL]\n"
@@ -763,91 +761,91 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"bgt 38b\n"
"39:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z21.s }, p5/Z, [x12]\n"
- "ld1w { z20.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
"fmla z8.s, z21.s, z0.s[0]\n"
"fmla z12.s, z21.s, z1.s[0]\n"
- "fmla z9.s, z20.s, z0.s[0]\n"
- "fmla z13.s, z20.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
"fmla z16.s, z21.s, z2.s[0]\n"
+ "fmla z9.s, z20.s, z0.s[0]\n"
"ld1w { z21.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
"fmla z17.s, z20.s, z2.s[0]\n"
"ld1w { z20.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.s, z21.s, z0.s[0]\n"
"fmla z14.s, z21.s, z1.s[0]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.s, z21.s, z2.s[0]\n"
"fmla z11.s, z20.s, z0.s[0]\n"
+ "addvl x9, x9, #1\n"
"fmla z15.s, z20.s, z1.s[0]\n"
"fmla z19.s, z20.s, z2.s[0]\n"
"ble 40f\n"
"ld1w { z21.s }, p5/Z, [x12]\n"
"ld1w { z20.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z21.s, z0.s[1]\n"
"fmla z12.s, z21.s, z1.s[1]\n"
"fmla z16.s, z21.s, z2.s[1]\n"
- "ld1w { z21.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.s, z20.s, z0.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.s, z20.s, z1.s[1]\n"
"fmla z17.s, z20.s, z2.s[1]\n"
"ld1w { z20.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.s, z21.s, z0.s[1]\n"
"fmla z14.s, z21.s, z1.s[1]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.s, z21.s, z2.s[1]\n"
"fmla z11.s, z20.s, z0.s[1]\n"
+ "addvl x9, x9, #1\n"
"fmla z15.s, z20.s, z1.s[1]\n"
"fmla z19.s, z20.s, z2.s[1]\n"
"ble 40f\n"
"ld1w { z21.s }, p5/Z, [x12]\n"
"ld1w { z20.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z21.s, z0.s[2]\n"
"fmla z12.s, z21.s, z1.s[2]\n"
"fmla z16.s, z21.s, z2.s[2]\n"
- "ld1w { z21.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.s, z20.s, z0.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.s, z20.s, z1.s[2]\n"
"fmla z17.s, z20.s, z2.s[2]\n"
"ld1w { z20.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z10.s, z21.s, z0.s[2]\n"
"fmla z14.s, z21.s, z1.s[2]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z18.s, z21.s, z2.s[2]\n"
"fmla z11.s, z20.s, z0.s[2]\n"
+ "addvl x9, x9, #1\n"
"fmla z15.s, z20.s, z1.s[2]\n"
"fmla z19.s, z20.s, z2.s[2]\n"
"ble 40f\n"
"ld1w { z21.s }, p5/Z, [x12]\n"
"ld1w { z20.s }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z21.s, z0.s[3]\n"
"fmla z12.s, z21.s, z1.s[3]\n"
"fmla z16.s, z21.s, z2.s[3]\n"
- "ld1w { z21.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
"fmla z9.s, z20.s, z0.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
"fmla z13.s, z20.s, z1.s[3]\n"
"fmla z17.s, z20.s, z2.s[3]\n"
"ld1w { z20.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z10.s, z21.s, z0.s[3]\n"
"fmla z14.s, z21.s, z1.s[3]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
"fmla z18.s, z21.s, z2.s[3]\n"
"fmla z11.s, z20.s, z0.s[3]\n"
"fmla z15.s, z20.s, z1.s[3]\n"
@@ -858,12 +856,12 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"cmp x28, x20\n"
"bne 35b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 41f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z21.s }, p5/Z, [x21]\n"
"ld1rw { z20.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z21.s\n"
"fmin z9.s, p5/M, z9.s, z21.s\n"
@@ -895,14 +893,14 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
"42:" // Height 3: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -910,15 +908,15 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"b 86f\n"
"43:" // Height 4
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"44:" // Height 4: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -926,12 +924,12 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 45f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 45f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 45f\n"
"mov x11, x12\n"
"45:" // Height 4: B setup done
@@ -946,18 +944,18 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"cbz x15, 46f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -965,13 +963,13 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"46:" // Height 4: no bias
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x13, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x13]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "add x22, x13, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x22]\n"
"ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
@@ -1006,8 +1004,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"49:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1032,25 +1030,25 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 53f\n"
"52:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z25.s }, p5/Z, [x12]\n"
- "ld1w { z24.s }, p5/Z, [x11]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z3.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x4\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqw { z0.s }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
"fmla z8.s, z25.s, z3.s[0]\n"
"fmla z12.s, z25.s, z2.s[0]\n"
- "fmla z9.s, z24.s, z3.s[0]\n"
- "fmla z13.s, z24.s, z2.s[0]\n"
"fmla z16.s, z25.s, z1.s[0]\n"
"fmla z20.s, z25.s, z0.s[0]\n"
"ld1w { z25.s }, p5/Z, [x10]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z9.s, z24.s, z3.s[0]\n"
+ "fmla z13.s, z24.s, z2.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z17.s, z24.s, z1.s[0]\n"
"fmla z21.s, z24.s, z0.s[0]\n"
"ld1w { z24.s }, p5/Z, [x9]\n"
@@ -1129,22 +1127,22 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"bgt 52b\n"
"53:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z25.s }, p5/Z, [x12]\n"
- "ld1w { z24.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
"fmla z8.s, z25.s, z0.s[0]\n"
"fmla z12.s, z25.s, z1.s[0]\n"
- "fmla z9.s, z24.s, z0.s[0]\n"
- "fmla z13.s, z24.s, z1.s[0]\n"
"fmla z16.s, z25.s, z2.s[0]\n"
"fmla z20.s, z25.s, z3.s[0]\n"
"ld1w { z25.s }, p5/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z9.s, z24.s, z0.s[0]\n"
+ "fmla z13.s, z24.s, z1.s[0]\n"
+ "addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"fmla z17.s, z24.s, z2.s[0]\n"
"fmla z21.s, z24.s, z3.s[0]\n"
@@ -1161,23 +1159,23 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 54f\n"
"ld1w { z25.s }, p5/Z, [x12]\n"
"ld1w { z24.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z25.s, z0.s[1]\n"
"fmla z12.s, z25.s, z1.s[1]\n"
"fmla z16.s, z25.s, z2.s[1]\n"
"fmla z20.s, z25.s, z3.s[1]\n"
"ld1w { z25.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.s, z24.s, z0.s[1]\n"
"fmla z13.s, z24.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z17.s, z24.s, z2.s[1]\n"
"fmla z21.s, z24.s, z3.s[1]\n"
"ld1w { z24.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z10.s, z25.s, z0.s[1]\n"
"fmla z14.s, z25.s, z1.s[1]\n"
+ "addvl x9, x9, #1\n"
"fmla z18.s, z25.s, z2.s[1]\n"
"fmla z22.s, z25.s, z3.s[1]\n"
"fmla z11.s, z24.s, z0.s[1]\n"
@@ -1187,23 +1185,23 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 54f\n"
"ld1w { z25.s }, p5/Z, [x12]\n"
"ld1w { z24.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z25.s, z0.s[2]\n"
"fmla z12.s, z25.s, z1.s[2]\n"
"fmla z16.s, z25.s, z2.s[2]\n"
"fmla z20.s, z25.s, z3.s[2]\n"
"ld1w { z25.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.s, z24.s, z0.s[2]\n"
"fmla z13.s, z24.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z17.s, z24.s, z2.s[2]\n"
"fmla z21.s, z24.s, z3.s[2]\n"
"ld1w { z24.s }, p5/Z, [x9]\n"
- "addvl x9, x9, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z10.s, z25.s, z0.s[2]\n"
"fmla z14.s, z25.s, z1.s[2]\n"
+ "addvl x9, x9, #1\n"
"fmla z18.s, z25.s, z2.s[2]\n"
"fmla z22.s, z25.s, z3.s[2]\n"
"fmla z11.s, z24.s, z0.s[2]\n"
@@ -1213,16 +1211,16 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 54f\n"
"ld1w { z25.s }, p5/Z, [x12]\n"
"ld1w { z24.s }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z25.s, z0.s[3]\n"
"fmla z12.s, z25.s, z1.s[3]\n"
"fmla z16.s, z25.s, z2.s[3]\n"
"fmla z20.s, z25.s, z3.s[3]\n"
"ld1w { z25.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x12, x12, #1\n"
"fmla z9.s, z24.s, z0.s[3]\n"
"fmla z13.s, z24.s, z1.s[3]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
"fmla z17.s, z24.s, z2.s[3]\n"
"fmla z21.s, z24.s, z3.s[3]\n"
"ld1w { z24.s }, p5/Z, [x9]\n"
@@ -1241,13 +1239,13 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 55f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z25.s }, p5/Z, [x21]\n"
"ld1rw { z24.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z25.s\n"
"fmin z9.s, p5/M, z9.s, z25.s\n"
@@ -1287,18 +1285,18 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x24]\n"
- "st1w { z21.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
"56:" // Height 4: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -1306,15 +1304,15 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"b 86f\n"
"57:" // Height 5
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"58:" // Height 5: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -1322,12 +1320,12 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 59f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 59f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 59f\n"
"mov x11, x12\n"
"59:" // Height 5: B setup done
@@ -1342,18 +1340,18 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"cbz x15, 60f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1365,16 +1363,16 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"60:" // Height 5: no bias
"tbz %x[flags], #0, 61f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x13]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "add x23, x13, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x22]\n"
@@ -1415,8 +1413,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"63:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 64f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1444,29 +1442,29 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 67f\n"
"66:" // Height 5: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z29.s }, p5/Z, [x12]\n"
- "ld1w { z28.s }, p5/Z, [x11]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x4\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqw { z1.s }, p0/Z, [x23]\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
"ld1rqw { z0.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
"fmla z8.s, z29.s, z4.s[0]\n"
"fmla z12.s, z29.s, z3.s[0]\n"
- "fmla z9.s, z28.s, z4.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
"fmla z16.s, z29.s, z2.s[0]\n"
"fmla z20.s, z29.s, z1.s[0]\n"
+ "add x25, x25, #0x10\n"
"fmla z24.s, z29.s, z0.s[0]\n"
- "fmla z13.s, z28.s, z3.s[0]\n"
+ "fmla z9.s, z28.s, z4.s[0]\n"
"ld1w { z29.s }, p5/Z, [x10]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z28.s, z3.s[0]\n"
"fmla z17.s, z28.s, z2.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z21.s, z28.s, z1.s[0]\n"
"fmla z25.s, z28.s, z0.s[0]\n"
"ld1w { z28.s }, p5/Z, [x9]\n"
@@ -1475,8 +1473,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"fmla z18.s, z29.s, z2.s[0]\n"
"fmla z22.s, z29.s, z1.s[0]\n"
"fmla z26.s, z29.s, z0.s[0]\n"
- "ld1w { z29.s }, p5/Z, [x12, #1, MUL VL]\n"
"fmla z11.s, z28.s, z4.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x12, #1, MUL VL]\n"
"fmla z15.s, z28.s, z3.s[0]\n"
"fmla z19.s, z28.s, z2.s[0]\n"
"fmla z23.s, z28.s, z1.s[0]\n"
@@ -1487,8 +1485,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"fmla z16.s, z29.s, z2.s[1]\n"
"fmla z20.s, z29.s, z1.s[1]\n"
"fmla z24.s, z29.s, z0.s[1]\n"
- "ld1w { z29.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z9.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z13.s, z28.s, z3.s[1]\n"
"fmla z17.s, z28.s, z2.s[1]\n"
"fmla z21.s, z28.s, z1.s[1]\n"
@@ -1499,8 +1497,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"fmla z18.s, z29.s, z2.s[1]\n"
"fmla z22.s, z29.s, z1.s[1]\n"
"fmla z26.s, z29.s, z0.s[1]\n"
- "ld1w { z29.s }, p5/Z, [x12, #2, MUL VL]\n"
"fmla z11.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x12, #2, MUL VL]\n"
"fmla z15.s, z28.s, z3.s[1]\n"
"fmla z19.s, z28.s, z2.s[1]\n"
"fmla z23.s, z28.s, z1.s[1]\n"
@@ -1511,8 +1509,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"fmla z16.s, z29.s, z2.s[2]\n"
"fmla z20.s, z29.s, z1.s[2]\n"
"fmla z24.s, z29.s, z0.s[2]\n"
- "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.s, z28.s, z3.s[2]\n"
"fmla z17.s, z28.s, z2.s[2]\n"
"fmla z21.s, z28.s, z1.s[2]\n"
@@ -1523,30 +1521,30 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"fmla z18.s, z29.s, z2.s[2]\n"
"fmla z22.s, z29.s, z1.s[2]\n"
"fmla z26.s, z29.s, z0.s[2]\n"
+ "fmla z11.s, z28.s, z4.s[2]\n"
"ld1w { z29.s }, p5/Z, [x12, #3, MUL VL]\n"
"addvl x12, x12, #4\n"
- "fmla z11.s, z28.s, z4.s[2]\n"
"fmla z15.s, z28.s, z3.s[2]\n"
"fmla z19.s, z28.s, z2.s[2]\n"
"fmla z23.s, z28.s, z1.s[2]\n"
"fmla z27.s, z28.s, z0.s[2]\n"
"ld1w { z28.s }, p5/Z, [x11, #3, MUL VL]\n"
- "fmla z8.s, z29.s, z4.s[3]\n"
"addvl x11, x11, #4\n"
+ "fmla z8.s, z29.s, z4.s[3]\n"
"fmla z12.s, z29.s, z3.s[3]\n"
"fmla z16.s, z29.s, z2.s[3]\n"
"fmla z20.s, z29.s, z1.s[3]\n"
"fmla z24.s, z29.s, z0.s[3]\n"
+ "fmla z9.s, z28.s, z4.s[3]\n"
"ld1w { z29.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z9.s, z28.s, z4.s[3]\n"
"fmla z13.s, z28.s, z3.s[3]\n"
"fmla z17.s, z28.s, z2.s[3]\n"
"fmla z21.s, z28.s, z1.s[3]\n"
"fmla z25.s, z28.s, z0.s[3]\n"
"ld1w { z28.s }, p5/Z, [x9, #3, MUL VL]\n"
- "fmla z10.s, z29.s, z4.s[3]\n"
"addvl x9, x9, #4\n"
+ "fmla z10.s, z29.s, z4.s[3]\n"
"fmla z14.s, z29.s, z3.s[3]\n"
"fmla z18.s, z29.s, z2.s[3]\n"
"fmla z22.s, z29.s, z1.s[3]\n"
@@ -1559,25 +1557,25 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"bgt 66b\n"
"67:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z29.s }, p5/Z, [x12]\n"
- "ld1w { z28.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
"ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
"fmla z8.s, z29.s, z0.s[0]\n"
"fmla z12.s, z29.s, z1.s[0]\n"
- "fmla z9.s, z28.s, z0.s[0]\n"
- "fmla z13.s, z28.s, z1.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
"fmla z16.s, z29.s, z2.s[0]\n"
"fmla z20.s, z29.s, z3.s[0]\n"
+ "addvl x12, x12, #1\n"
"fmla z24.s, z29.s, z4.s[0]\n"
- "fmla z17.s, z28.s, z2.s[0]\n"
+ "fmla z9.s, z28.s, z0.s[0]\n"
"ld1w { z29.s }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.s, z28.s, z1.s[0]\n"
+ "fmla z17.s, z28.s, z2.s[0]\n"
"addvl x10, x10, #1\n"
"fmla z21.s, z28.s, z3.s[0]\n"
"fmla z25.s, z28.s, z4.s[0]\n"
@@ -1596,19 +1594,19 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 68f\n"
"ld1w { z29.s }, p5/Z, [x12]\n"
"ld1w { z28.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z29.s, z0.s[1]\n"
"fmla z12.s, z29.s, z1.s[1]\n"
"fmla z16.s, z29.s, z2.s[1]\n"
"fmla z20.s, z29.s, z3.s[1]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.s, z29.s, z4.s[1]\n"
"fmla z9.s, z28.s, z0.s[1]\n"
"ld1w { z29.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z13.s, z28.s, z1.s[1]\n"
"fmla z17.s, z28.s, z2.s[1]\n"
+ "addvl x10, x10, #1\n"
"fmla z21.s, z28.s, z3.s[1]\n"
"fmla z25.s, z28.s, z4.s[1]\n"
"ld1w { z28.s }, p5/Z, [x9]\n"
@@ -1626,19 +1624,19 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 68f\n"
"ld1w { z29.s }, p5/Z, [x12]\n"
"ld1w { z28.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z29.s, z0.s[2]\n"
"fmla z12.s, z29.s, z1.s[2]\n"
"fmla z16.s, z29.s, z2.s[2]\n"
"fmla z20.s, z29.s, z3.s[2]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.s, z29.s, z4.s[2]\n"
"fmla z9.s, z28.s, z0.s[2]\n"
"ld1w { z29.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z13.s, z28.s, z1.s[2]\n"
"fmla z17.s, z28.s, z2.s[2]\n"
+ "addvl x10, x10, #1\n"
"fmla z21.s, z28.s, z3.s[2]\n"
"fmla z25.s, z28.s, z4.s[2]\n"
"ld1w { z28.s }, p5/Z, [x9]\n"
@@ -1656,12 +1654,12 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 68f\n"
"ld1w { z29.s }, p5/Z, [x12]\n"
"ld1w { z28.s }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z29.s, z0.s[3]\n"
"fmla z12.s, z29.s, z1.s[3]\n"
"fmla z16.s, z29.s, z2.s[3]\n"
"fmla z20.s, z29.s, z3.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z24.s, z29.s, z4.s[3]\n"
"fmla z9.s, z28.s, z0.s[3]\n"
"ld1w { z29.s }, p5/Z, [x10]\n"
@@ -1688,14 +1686,14 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"cmp x28, x20\n"
"bne 63b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 69f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z29.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z29.s }, p5/Z, [x21]\n"
"ld1rw { z28.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z29.s\n"
"fmin z9.s, p5/M, z9.s, z29.s\n"
@@ -1743,22 +1741,22 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x24]\n"
- "st1w { z21.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
"70:" // Height 5: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -1766,19 +1764,18 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"b 86f\n"
"71:" // Height 6
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
- "mov x21, #0x18\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x18\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "madd x21, x20, x21, x13\n"
- "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
"72:" // Height 6: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #3\n"
"add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
"add x10, x11, x20, LSL #2\n"
"add x9, x10, x20, LSL #2\n"
"add x20, x9, x20, LSL #2\n"
@@ -1786,12 +1783,12 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 73f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 73f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 73f\n"
"mov x11, x12\n"
"73:" // Height 6: B setup done
@@ -1806,18 +1803,18 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"cbz x15, 74f\n"
"ld1w { z8.s }, p5/Z, [x15]\n"
"ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1833,17 +1830,17 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"74:" // Height 6: no bias
"tbz %x[flags], #0, 75f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x13, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x13]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "add x24, x13, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x24]\n"
"ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x23]\n"
@@ -1892,8 +1889,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"77:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 78f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1924,29 +1921,29 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 81f\n"
"80:" // Height 6: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z1.s }, p5/Z, [x12]\n"
- "ld1w { z0.s }, p5/Z, [x11]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z7.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqw { z6.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x4\n"
"ld1rqw { z5.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
"ld1rqw { z3.s }, p0/Z, [x22]\n"
"ld1rqw { z2.s }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1w { z1.s }, p5/Z, [x12]\n"
+ "ld1w { z0.s }, p5/Z, [x11]\n"
"fmla z8.s, z1.s, z7.s[0]\n"
"fmla z12.s, z1.s, z6.s[0]\n"
- "add x21, x21, #0x10\n"
"fmla z16.s, z1.s, z5.s[0]\n"
"fmla z20.s, z1.s, z4.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z24.s, z1.s, z3.s[0]\n"
"fmla z28.s, z1.s, z2.s[0]\n"
"ld1w { z1.s }, p5/Z, [x10]\n"
+ "add x21, x21, #0x10\n"
"fmla z9.s, z0.s, z7.s[0]\n"
"fmla z13.s, z0.s, z6.s[0]\n"
"fmla z17.s, z0.s, z5.s[0]\n"
@@ -2057,27 +2054,27 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"bgt 80b\n"
"81:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z7.s }, p5/Z, [x12]\n"
- "ld1w { z6.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
"ld1rqw { z4.s }, p0/Z, [x22]\n"
"ld1rqw { z5.s }, p0/Z, [x21]\n"
+ "ld1w { z7.s }, p5/Z, [x12]\n"
+ "ld1w { z6.s }, p5/Z, [x11]\n"
"fmla z8.s, z7.s, z0.s[0]\n"
"fmla z12.s, z7.s, z1.s[0]\n"
- "fmla z9.s, z6.s, z0.s[0]\n"
- "fmla z13.s, z6.s, z1.s[0]\n"
"fmla z16.s, z7.s, z2.s[0]\n"
"fmla z20.s, z7.s, z3.s[0]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z24.s, z7.s, z4.s[0]\n"
"fmla z28.s, z7.s, z5.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10]\n"
"addvl x10, x10, #1\n"
+ "fmla z9.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z1.s[0]\n"
"fmla z17.s, z6.s, z2.s[0]\n"
"fmla z21.s, z6.s, z3.s[0]\n"
"fmla z25.s, z6.s, z4.s[0]\n"
@@ -2099,19 +2096,19 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 82f\n"
"ld1w { z7.s }, p5/Z, [x12]\n"
"ld1w { z6.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z7.s, z0.s[1]\n"
"fmla z12.s, z7.s, z1.s[1]\n"
"fmla z16.s, z7.s, z2.s[1]\n"
"fmla z20.s, z7.s, z3.s[1]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.s, z7.s, z4.s[1]\n"
"fmla z28.s, z7.s, z5.s[1]\n"
"ld1w { z7.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z9.s, z6.s, z0.s[1]\n"
"fmla z13.s, z6.s, z1.s[1]\n"
+ "addvl x10, x10, #1\n"
"fmla z17.s, z6.s, z2.s[1]\n"
"fmla z21.s, z6.s, z3.s[1]\n"
"fmla z25.s, z6.s, z4.s[1]\n"
@@ -2133,19 +2130,19 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 82f\n"
"ld1w { z7.s }, p5/Z, [x12]\n"
"ld1w { z6.s }, p5/Z, [x11]\n"
- "subs x27, x27, #0x1\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z7.s, z0.s[2]\n"
"fmla z12.s, z7.s, z1.s[2]\n"
"fmla z16.s, z7.s, z2.s[2]\n"
"fmla z20.s, z7.s, z3.s[2]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
"fmla z24.s, z7.s, z4.s[2]\n"
"fmla z28.s, z7.s, z5.s[2]\n"
"ld1w { z7.s }, p5/Z, [x10]\n"
- "addvl x10, x10, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z9.s, z6.s, z0.s[2]\n"
"fmla z13.s, z6.s, z1.s[2]\n"
+ "addvl x10, x10, #1\n"
"fmla z17.s, z6.s, z2.s[2]\n"
"fmla z21.s, z6.s, z3.s[2]\n"
"fmla z25.s, z6.s, z4.s[2]\n"
@@ -2167,12 +2164,12 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ble 82f\n"
"ld1w { z7.s }, p5/Z, [x12]\n"
"ld1w { z6.s }, p5/Z, [x11]\n"
- "addvl x12, x12, #1\n"
- "addvl x11, x11, #1\n"
"fmla z8.s, z7.s, z0.s[3]\n"
"fmla z12.s, z7.s, z1.s[3]\n"
"fmla z16.s, z7.s, z2.s[3]\n"
"fmla z20.s, z7.s, z3.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
"fmla z24.s, z7.s, z4.s[3]\n"
"fmla z28.s, z7.s, z5.s[3]\n"
"ld1w { z7.s }, p5/Z, [x10]\n"
@@ -2203,15 +2200,15 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"cmp x28, x20\n"
"bne 77b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x13, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 83f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x21]\n"
"ld1rw { z0.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z1.s\n"
"fmin z9.s, p5/M, z9.s, z1.s\n"
@@ -2267,26 +2264,26 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
"addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x24]\n"
- "st1w { z21.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z28.s }, p4, [x22]\n"
- "st1w { z29.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
"84:" // Height 6: Writeback done
"decw x14, ALL, MUL #4\n"
"cmp x14, XZR\n"
@@ -2303,8 +2300,8 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"86:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
index 90112a823b..23f686a902 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -82,14 +82,16 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 4, 12, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 12, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
if (std::is_same<T, float>::value) {
switch (ci->get_cpu_model()) {
+ case CPUModel::V1:
+ return { 28.74 };
default:
- return { 32.35 };
+ return { 15.27 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
index 0e98cc6def..c2b6dd1030 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
@@ -50,19 +50,18 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -83,7 +82,6 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
ka.B_stride = B_stride;
switch(act.type) {
default:
@@ -106,17 +104,17 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"bgt 29f\n"
"beq 15f\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #5\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #5\n"
"add x28, x9, x20, LSL #1\n"
"add x27, x28, x20, LSL #1\n"
"add x20, x27, x20, LSL #1\n"
@@ -124,20 +122,20 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 3f\n"
"decw x21\n"
- "mov x27, x12\n"
"cmp x14, x21\n"
+ "mov x27, x12\n"
"bgt 3f\n"
"decw x21\n"
- "mov x28, x12\n"
"cmp x14, x21\n"
+ "mov x28, x12\n"
"bgt 3f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 3f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 3f\n"
"mov x11, x12\n"
"3:" // Height 1: B setup done
@@ -156,19 +154,19 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"cbz x15, 4f\n"
"ld1w { z8.s }, p7/Z, [x15]\n"
"ld1w { z9.s }, p7/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
- "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
- "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
- "addvl x15, x15, #6\n"
"zip2 z14.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
"zip2 z15.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
"zip2 z16.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
"zip2 z17.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
+ "addvl x15, x15, #6\n"
"zip2 z18.d, z12.d, z12.d\n"
"zip1 z12.d, z12.d, z12.d\n"
"zip2 z19.d, z13.d, z13.d\n"
@@ -176,16 +174,16 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"b 6f\n"
"4:" // Height 1: no bias
"tbz %x[flags], #0, 5f\n"
- "ld1w { z25.s }, p6/Z, [x13]\n"
- "ld1w { z24.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x13]\n"
+ "ld1w { z20.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "zip1 z8.d, z21.d, z14.d\n"
+ "zip2 z14.d, z21.d, z14.d\n"
"ld1w { z23.s }, p4/Z, [x13, #2, MUL VL]\n"
"ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "zip1 z9.d, z20.d, z15.d\n"
+ "zip2 z15.d, z20.d, z15.d\n"
"ld1w { z21.s }, p2/Z, [x13, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
- "zip1 z8.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "zip1 z9.d, z24.d, z15.d\n"
- "zip2 z15.d, z24.d, z15.d\n"
"zip1 z10.d, z23.d, z16.d\n"
"zip2 z16.d, z23.d, z16.d\n"
"zip1 z11.d, z22.d, z17.d\n"
@@ -212,8 +210,8 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"mov x26, #0x0\n"
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -229,78 +227,78 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"ble 11f\n"
"10:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1h { z23.h }, p7/Z, [x12]\n"
- "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
- "sub x25, x25, #0x4\n"
- "ld1h { z21.h }, p7/Z, [x11]\n"
- "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
- "cmp x25, #0x4\n"
- "addvl x12, x12, #2\n"
- "addvl x11, x11, #2\n"
"ld1rqw { z24.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
"uzp1 z24.h, z24.h, z24.h\n"
- ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x10]\n"
- ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
- "ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
+ "ld1h { z21.h }, p7/Z, [x12]\n"
+ "ld1h { z20.h }, p7/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6475e708 // bfmmla z8.s, z24.h, z21.h\n"
+ ".inst 0x6474e70e // bfmmla z14.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
- "ld1h { z21.h }, p7/Z, [x9]\n"
".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x10]\n"
+ "ld1h { z20.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6475e70a // bfmmla z10.s, z24.h, z21.h\n"
+ ".inst 0x6474e710 // bfmmla z16.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x9]\n"
"ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
- "addvl x9, x9, #2\n"
- ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
"ld1h { z23.h }, p7/Z, [x28]\n"
- ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
"ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
- ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
"ld1h { z21.h }, p7/Z, [x27]\n"
- ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
"ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
- "addvl x27, x27, #2\n"
".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
+ "add x24, x24, #0x10\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "addvl x28, x28, #2\n"
+ "addvl x27, x27, #2\n"
"bgt 10b\n"
"11:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1h { z23.h }, p7/Z, [x12]\n"
- "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
+ "ld1rqw { z22.s }, p0/Z, [x24]\n"
+ ".inst 0x658abed6 // bfcvt z22.h, p7/M, z22.s\n"
+ "uzp1 z22.h, z22.h, z22.h\n"
+ "ld1h { z21.h }, p7/Z, [x12]\n"
+ "ld1h { z20.h }, p7/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6475e6c8 // bfmmla z8.s, z22.h, z21.h\n"
+ ".inst 0x6474e6ce // bfmmla z14.s, z22.h, z20.h\n"
"ld1h { z21.h }, p7/Z, [x11]\n"
"ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
- "ld1rqw { z24.s }, p0/Z, [x24]\n"
- ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
- "uzp1 z24.h, z24.h, z24.h\n"
- ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x10]\n"
- ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
- "ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
- ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6475e6c9 // bfmmla z9.s, z22.h, z21.h\n"
+ ".inst 0x6474e6cf // bfmmla z15.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x10]\n"
+ "ld1h { z20.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6475e6ca // bfmmla z10.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d0 // bfmmla z16.s, z22.h, z20.h\n"
"ld1h { z21.h }, p7/Z, [x9]\n"
- ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
"ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
- "addvl x9, x9, #2\n"
- ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x28]\n"
- ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
- "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
- ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6475e6cb // bfmmla z11.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d1 // bfmmla z17.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e6cc // bfmmla z12.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d2 // bfmmla z18.s, z22.h, z20.h\n"
"ld1h { z21.h }, p7/Z, [x27]\n"
- ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
"ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6475e6cd // bfmmla z13.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d3 // bfmmla z19.s, z22.h, z20.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "addvl x28, x28, #2\n"
"addvl x27, x27, #2\n"
- ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
- ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
- ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
- ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
"12:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -313,9 +311,9 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"uzp1 z12.d, z12.d, z18.d\n"
"uzp1 z13.d, z13.d, z19.d\n"
"tbz %x[flags], #1, 13f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z21.s }, p7/Z, [x21]\n"
"ld1rw { z20.s }, p7/Z, [x20]\n"
"fmin z8.s, p7/M, z8.s, z21.s\n"
"fmin z9.s, p7/M, z9.s, z21.s\n"
@@ -344,17 +342,17 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"b 58f\n"
"15:" // Height 2
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"16:" // Height 2: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #5\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #5\n"
"add x28, x9, x20, LSL #1\n"
"add x27, x28, x20, LSL #1\n"
"add x20, x27, x20, LSL #1\n"
@@ -362,20 +360,20 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 17f\n"
"decw x21\n"
- "mov x27, x12\n"
"cmp x14, x21\n"
+ "mov x27, x12\n"
"bgt 17f\n"
"decw x21\n"
- "mov x28, x12\n"
"cmp x14, x21\n"
+ "mov x28, x12\n"
"bgt 17f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 17f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 17f\n"
"mov x11, x12\n"
"17:" // Height 2: B setup done
@@ -394,19 +392,19 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"cbz x15, 18f\n"
"ld1w { z8.s }, p7/Z, [x15]\n"
"ld1w { z9.s }, p7/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
- "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
- "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
- "addvl x15, x15, #6\n"
"zip2 z14.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
"zip2 z15.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
"zip2 z16.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
"zip2 z17.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
+ "addvl x15, x15, #6\n"
"zip2 z18.d, z12.d, z12.d\n"
"zip1 z12.d, z12.d, z12.d\n"
"zip2 z19.d, z13.d, z13.d\n"
@@ -415,25 +413,25 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"18:" // Height 2: no bias
"tbz %x[flags], #0, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z25.s }, p6/Z, [x13]\n"
- "ld1w { z24.s }, p5/Z, [x13, #1, MUL VL]\n"
- "ld1w { z23.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "add x20, x13, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x13]\n"
+ "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
"ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
"ld1w { z21.s }, p2/Z, [x13, #4, MUL VL]\n"
- "add x20, x13, x20, LSL #2\n"
"ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
"ld1w { z14.s }, p6/Z, [x20]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
"ld1w { z15.s }, p5/Z, [x20, #1, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
"ld1w { z17.s }, p3/Z, [x20, #3, MUL VL]\n"
"ld1w { z18.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
"ld1w { z19.s }, p1/Z, [x20, #5, MUL VL]\n"
- "zip1 z8.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "zip1 z9.d, z24.d, z15.d\n"
- "zip2 z15.d, z24.d, z15.d\n"
- "zip1 z10.d, z23.d, z16.d\n"
- "zip2 z16.d, z23.d, z16.d\n"
"zip1 z11.d, z22.d, z17.d\n"
"zip2 z17.d, z22.d, z17.d\n"
"zip1 z12.d, z21.d, z18.d\n"
@@ -458,8 +456,8 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"mov x26, #0x0\n"
"21:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 22f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -478,87 +476,87 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"ble 25f\n"
"24:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1h { z23.h }, p7/Z, [x12]\n"
- "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
- "sub x25, x25, #0x4\n"
- "ld1h { z21.h }, p7/Z, [x11]\n"
- "ld1h { z25.h }, p7/Z, [x11, #1, MUL VL]\n"
- "cmp x25, #0x4\n"
- "addvl x12, x12, #2\n"
- "addvl x11, x11, #2\n"
"ld1rqw { z24.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqw { z20.s }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
"uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x12]\n"
+ "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
"uzp1 z20.h, z20.h, z20.h\n"
"trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x10]\n"
".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x10]\n"
"ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
"ld1h { z21.h }, p7/Z, [x9]\n"
- ".inst 0x6479e70f // bfmmla z15.s, z24.h, z25.h\n"
"ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
- "addvl x9, x9, #2\n"
".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x28]\n"
".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
"ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
- "ld1h { z21.h }, p7/Z, [x27]\n"
".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x27]\n"
"ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
- "addvl x27, x27, #2\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "addvl x28, x28, #2\n"
+ "addvl x27, x27, #2\n"
"bgt 24b\n"
"25:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1h { z23.h }, p7/Z, [x12]\n"
- "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
- "ld1h { z21.h }, p7/Z, [x11]\n"
- "ld1h { z25.h }, p7/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
"ld1rqw { z24.s }, p0/Z, [x24]\n"
"ld1rqw { z20.s }, p0/Z, [x23]\n"
".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
"uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x12]\n"
+ "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
"uzp1 z20.h, z20.h, z20.h\n"
"trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x10]\n"
".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x10]\n"
"ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
"ld1h { z21.h }, p7/Z, [x9]\n"
- ".inst 0x6479e70f // bfmmla z15.s, z24.h, z25.h\n"
"ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
- "addvl x9, x9, #2\n"
".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x28]\n"
".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
"ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
- "ld1h { z21.h }, p7/Z, [x27]\n"
".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x27]\n"
"ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
- "addvl x27, x27, #2\n"
".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "addvl x28, x28, #2\n"
+ "addvl x27, x27, #2\n"
"26:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -567,21 +565,21 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z4.d, z8.d, z14.d\n"
"uzp2 z8.d, z8.d, z14.d\n"
+ "add x23, x13, x20, LSL #2\n"
"uzp1 z14.d, z9.d, z15.d\n"
"uzp2 z9.d, z9.d, z15.d\n"
"uzp1 z15.d, z10.d, z16.d\n"
"uzp2 z10.d, z10.d, z16.d\n"
"uzp1 z16.d, z11.d, z17.d\n"
"uzp2 z11.d, z11.d, z17.d\n"
- "add x24, x13, x20, LSL #2\n"
"uzp1 z17.d, z12.d, z18.d\n"
"uzp2 z12.d, z12.d, z18.d\n"
"uzp1 z18.d, z13.d, z19.d\n"
"uzp2 z13.d, z13.d, z19.d\n"
"tbz %x[flags], #1, 27f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z20.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z20.s }, p7/Z, [x21]\n"
"ld1rw { z19.s }, p7/Z, [x20]\n"
"fmin z4.s, p7/M, z4.s, z20.s\n"
"fmin z14.s, p7/M, z14.s, z20.s\n"
@@ -615,12 +613,12 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"st1w { z17.s }, p2, [x13, #4, MUL VL]\n"
"st1w { z18.s }, p1, [x13, #5, MUL VL]\n"
"addvl x13, x13, #6\n"
- "st1w { z8.s }, p6, [x24]\n"
- "st1w { z9.s }, p5, [x24, #1, MUL VL]\n"
- "st1w { z10.s }, p4, [x24, #2, MUL VL]\n"
- "st1w { z11.s }, p3, [x24, #3, MUL VL]\n"
- "st1w { z12.s }, p2, [x24, #4, MUL VL]\n"
- "st1w { z13.s }, p1, [x24, #5, MUL VL]\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
"28:" // Height 2: Writeback done
"decw x14, ALL, MUL #6\n"
"cmp x14, XZR\n"
@@ -628,17 +626,17 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"b 58f\n"
"29:" // Height 3
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x15, %x[bias]\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"30:" // Height 3: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #5\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #5\n"
"add x28, x9, x20, LSL #1\n"
"add x27, x28, x20, LSL #1\n"
"add x20, x27, x20, LSL #1\n"
@@ -646,20 +644,20 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 31f\n"
"decw x21\n"
- "mov x27, x12\n"
"cmp x14, x21\n"
+ "mov x27, x12\n"
"bgt 31f\n"
"decw x21\n"
- "mov x28, x12\n"
"cmp x14, x21\n"
+ "mov x28, x12\n"
"bgt 31f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 31f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 31f\n"
"mov x11, x12\n"
"31:" // Height 3: B setup done
@@ -678,19 +676,19 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"cbz x15, 32f\n"
"ld1w { z8.s }, p7/Z, [x15]\n"
"ld1w { z9.s }, p7/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
- "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
- "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
- "addvl x15, x15, #6\n"
"zip2 z14.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
"zip2 z15.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
"zip2 z16.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
"zip2 z17.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
+ "addvl x15, x15, #6\n"
"zip2 z18.d, z12.d, z12.d\n"
"zip1 z12.d, z12.d, z12.d\n"
"zip2 z19.d, z13.d, z13.d\n"
@@ -711,38 +709,38 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"32:" // Height 3: no bias
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z22.s }, p6/Z, [x13]\n"
- "ld1w { z24.s }, p5/Z, [x13, #1, MUL VL]\n"
- "ld1w { z0.s }, p4/Z, [x13, #2, MUL VL]\n"
- "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x13, #4, MUL VL]\n"
"add x21, x13, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x13]\n"
+ "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x13, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
"ld1w { z14.s }, p6/Z, [x21]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
"ld1w { z15.s }, p5/Z, [x21, #1, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
"ld1w { z17.s }, p3/Z, [x21, #3, MUL VL]\n"
"ld1w { z18.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
"ld1w { z19.s }, p1/Z, [x21, #5, MUL VL]\n"
"ld1w { z21.s }, p6/Z, [x20]\n"
- "zip1 z8.d, z22.d, z14.d\n"
- "zip2 z14.d, z22.d, z14.d\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
"ld1w { z22.s }, p5/Z, [x20, #1, MUL VL]\n"
"ld1w { z23.s }, p4/Z, [x20, #2, MUL VL]\n"
- "zip1 z9.d, z24.d, z15.d\n"
- "zip2 z15.d, z24.d, z15.d\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
"ld1w { z24.s }, p3/Z, [x20, #3, MUL VL]\n"
"ld1w { z25.s }, p2/Z, [x20, #4, MUL VL]\n"
- "zip1 z10.d, z0.d, z16.d\n"
- "zip2 z16.d, z0.d, z16.d\n"
- "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n"
- "zip1 z11.d, z2.d, z17.d\n"
- "zip2 z17.d, z2.d, z17.d\n"
- "zip1 z12.d, z1.d, z18.d\n"
- "zip2 z18.d, z1.d, z18.d\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n"
"zip1 z20.d, z21.d, z26.d\n"
"zip2 z26.d, z21.d, z26.d\n"
"zip1 z21.d, z22.d, z27.d\n"
@@ -785,8 +783,8 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"mov x26, #0x0\n"
"35:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -808,111 +806,111 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"ble 39f\n"
"38:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1h { z3.h }, p7/Z, [x12]\n"
- "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
- "sub x25, x25, #0x4\n"
- "ld1h { z1.h }, p7/Z, [x11]\n"
- "ld1h { z6.h }, p7/Z, [x11, #1, MUL VL]\n"
- "cmp x25, #0x4\n"
- "addvl x12, x12, #2\n"
- "addvl x11, x11, #2\n"
"ld1rqw { z5.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqw { z0.s }, p0/Z, [x23]\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
"uzp1 z5.h, z5.h, z5.h\n"
- ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
"uzp1 z0.h, z0.h, z0.h\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
"trn1 z5.d, z5.d, z0.d\n"
"uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
- "ld1h { z3.h }, p7/Z, [x10]\n"
".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x10]\n"
+ "sub x25, x25, #0x4\n"
".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
- "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
+ "cmp x25, #0x4\n"
".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
- ".inst 0x6466e4af // bfmmla z15.s, z5.h, z6.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x9]\n"
- ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
- "addvl x9, x9, #2\n"
+ "add x23, x23, #0x10\n"
".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x28]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
+ "addvl x12, x12, #2\n"
".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x27]\n"
+ "addvl x11, x11, #2\n"
".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
- "addvl x27, x27, #2\n"
+ "addvl x10, x10, #2\n"
".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
+ "addvl x9, x9, #2\n"
+ "addvl x28, x28, #2\n"
".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
+ "addvl x27, x27, #2\n"
".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"bgt 38b\n"
"39:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1h { z3.h }, p7/Z, [x12]\n"
- "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
- "ld1h { z6.h }, p7/Z, [x11]\n"
- "ld1h { z1.h }, p7/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
"ld1rqw { z5.s }, p0/Z, [x24]\n"
"ld1rqw { z0.s }, p0/Z, [x23]\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
"uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
"uzp1 z0.h, z0.h, z0.h\n"
".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
"trn1 z5.d, z5.d, z0.d\n"
"uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
- ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
- ".inst 0x6466e4a9 // bfmmla z9.s, z5.h, z6.h\n"
- ".inst 0x6461e4af // bfmmla z15.s, z5.h, z1.h\n"
".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x10]\n"
+ "addvl x12, x12, #2\n"
".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6466e495 // bfmmla z21.s, z4.h, z6.h\n"
- ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x9]\n"
- "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
"addvl x10, x10, #2\n"
- "addvl x9, x9, #2\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
+ "addvl x9, x9, #2\n"
".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
- ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x28]\n"
- "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
- ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
- "ld1h { z1.h }, p7/Z, [x27]\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
"addvl x28, x28, #2\n"
+ ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x27]\n"
".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
- "addvl x27, x27, #2\n"
".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
+ "addvl x27, x27, #2\n"
".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
@@ -926,16 +924,16 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"cmp x26, x20\n"
"bne 35b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
"uzp1 z4.d, z8.d, z14.d\n"
"uzp2 z8.d, z8.d, z14.d\n"
"uzp1 z14.d, z9.d, z15.d\n"
"uzp2 z9.d, z9.d, z15.d\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp1 z15.d, z10.d, z16.d\n"
"uzp2 z10.d, z10.d, z16.d\n"
- "add x24, x13, x20, LSL #2\n"
"uzp1 z16.d, z11.d, z17.d\n"
"uzp2 z11.d, z11.d, z17.d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 z17.d, z12.d, z18.d\n"
"uzp2 z12.d, z12.d, z18.d\n"
"uzp1 z18.d, z13.d, z19.d\n"
@@ -947,9 +945,9 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"uzp1 z24.d, z24.d, z30.d\n"
"uzp1 z25.d, z25.d, z31.d\n"
"tbz %x[flags], #1, 41f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p7/Z, [x21]\n"
"ld1rw { z19.s }, p7/Z, [x20]\n"
"fmin z4.s, p7/M, z4.s, z0.s\n"
"fmin z14.s, p7/M, z14.s, z0.s\n"
@@ -995,18 +993,18 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"st1w { z17.s }, p2, [x13, #4, MUL VL]\n"
"st1w { z18.s }, p1, [x13, #5, MUL VL]\n"
"addvl x13, x13, #6\n"
- "st1w { z8.s }, p6, [x24]\n"
- "st1w { z9.s }, p5, [x24, #1, MUL VL]\n"
- "st1w { z10.s }, p4, [x24, #2, MUL VL]\n"
- "st1w { z11.s }, p3, [x24, #3, MUL VL]\n"
- "st1w { z12.s }, p2, [x24, #4, MUL VL]\n"
- "st1w { z13.s }, p1, [x24, #5, MUL VL]\n"
- "st1w { z20.s }, p6, [x23]\n"
- "st1w { z21.s }, p5, [x23, #1, MUL VL]\n"
- "st1w { z22.s }, p4, [x23, #2, MUL VL]\n"
- "st1w { z23.s }, p3, [x23, #3, MUL VL]\n"
- "st1w { z24.s }, p2, [x23, #4, MUL VL]\n"
- "st1w { z25.s }, p1, [x23, #5, MUL VL]\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
+ "st1w { z20.s }, p6, [x22]\n"
+ "st1w { z21.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z25.s }, p1, [x22, #5, MUL VL]\n"
"42:" // Height 3: Writeback done
"decw x14, ALL, MUL #6\n"
"cmp x14, XZR\n"
@@ -1014,21 +1012,20 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"b 58f\n"
"43:" // Height 4
"ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
- "mov x21, #0x10\n"
"ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x10\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "madd x21, x20, x21, x13\n"
- "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
"44:" // Height 4: Column loop
"ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x21, ALL, MUL #5\n"
"add x11, x12, x20, LSL #1\n"
"add x10, x11, x20, LSL #1\n"
"add x9, x10, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #5\n"
"add x28, x9, x20, LSL #1\n"
"add x27, x28, x20, LSL #1\n"
"add x20, x27, x20, LSL #1\n"
@@ -1036,20 +1033,20 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"bgt 45f\n"
"decw x21\n"
- "mov x27, x12\n"
"cmp x14, x21\n"
+ "mov x27, x12\n"
"bgt 45f\n"
"decw x21\n"
- "mov x28, x12\n"
"cmp x14, x21\n"
+ "mov x28, x12\n"
"bgt 45f\n"
"decw x21\n"
- "mov x9, x12\n"
"cmp x14, x21\n"
+ "mov x9, x12\n"
"bgt 45f\n"
"decw x21\n"
- "mov x10, x12\n"
"cmp x14, x21\n"
+ "mov x10, x12\n"
"bgt 45f\n"
"mov x11, x12\n"
"45:" // Height 4: B setup done
@@ -1068,19 +1065,19 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"cbz x15, 46f\n"
"ld1w { z8.s }, p7/Z, [x15]\n"
"ld1w { z9.s }, p7/Z, [x15, #1, MUL VL]\n"
- "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
- "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
- "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
- "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
- "addvl x15, x15, #6\n"
"zip2 z14.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
"zip2 z15.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
"zip2 z16.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
"zip2 z17.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
+ "addvl x15, x15, #6\n"
"zip2 z18.d, z12.d, z12.d\n"
"zip1 z12.d, z12.d, z12.d\n"
"zip2 z19.d, z13.d, z13.d\n"
@@ -1101,51 +1098,51 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"46:" // Height 4: no bias
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z22.s }, p6/Z, [x13]\n"
- "ld1w { z24.s }, p5/Z, [x13, #1, MUL VL]\n"
- "ld1w { z26.s }, p4/Z, [x13, #2, MUL VL]\n"
- "ld1w { z27.s }, p3/Z, [x13, #3, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x13, #4, MUL VL]\n"
"add x22, x13, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x13]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x13, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
"ld1w { z14.s }, p6/Z, [x22]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
"ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
"ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
"ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
"ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
"ld1w { z21.s }, p6/Z, [x21]\n"
- "zip1 z8.d, z22.d, z14.d\n"
- "zip2 z14.d, z22.d, z14.d\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
"ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n"
"ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n"
- "zip1 z9.d, z24.d, z15.d\n"
- "zip2 z15.d, z24.d, z15.d\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
"ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n"
"ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n"
- "zip1 z10.d, z26.d, z16.d\n"
- "zip2 z16.d, z26.d, z16.d\n"
- "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n"
- "ld1w { z26.s }, p6/Z, [x20]\n"
- "zip1 z11.d, z27.d, z17.d\n"
- "zip2 z17.d, z27.d, z17.d\n"
- "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
- "zip1 z12.d, z29.d, z18.d\n"
- "zip2 z18.d, z29.d, z18.d\n"
- "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
- "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z26.s }, p6/Z, [x20]\n"
"zip1 z20.d, z21.d, z26.d\n"
"zip2 z26.d, z21.d, z26.d\n"
+ "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
"zip1 z21.d, z22.d, z27.d\n"
"zip2 z27.d, z22.d, z27.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
"zip1 z22.d, z23.d, z28.d\n"
"zip2 z28.d, z23.d, z28.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
"zip1 z23.d, z24.d, z29.d\n"
"zip2 z29.d, z24.d, z29.d\n"
"zip1 z24.d, z25.d, z30.d\n"
@@ -1182,8 +1179,8 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"mov x26, #0x0\n"
"49:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1208,144 +1205,144 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"ble 53f\n"
"52:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1h { z3.h }, p7/Z, [x12]\n"
- "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
- "sub x25, x25, #0x4\n"
- "ld1h { z7.h }, p7/Z, [x11]\n"
- "ld1h { z6.h }, p7/Z, [x11, #1, MUL VL]\n"
- "cmp x25, #0x4\n"
- "addvl x12, x12, #2\n"
- "addvl x11, x11, #2\n"
- "ld1rqw { z5.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1rqw { z0.s }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
"uzp1 z5.h, z5.h, z5.h\n"
- "uzp1 z1.h, z1.h, z1.h\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
"uzp1 z4.h, z4.h, z4.h\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "trn1 z5.d, z5.d, z1.d\n"
- "trn1 z4.d, z4.d, z0.d\n"
- ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
- ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
+ "sub x25, x25, #0x4\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x10]\n"
- ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
- ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
- ".inst 0x6467e4a9 // bfmmla z9.s, z5.h, z7.h\n"
- ".inst 0x6467e495 // bfmmla z21.s, z4.h, z7.h\n"
- ".inst 0x6466e4af // bfmmla z15.s, z5.h, z6.h\n"
+ "cmp x25, #0x4\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x9]\n"
- ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
- "addvl x9, x9, #2\n"
- ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
- ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x28]\n"
- ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
- ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
- ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
- ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x27]\n"
- ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
- ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ "addvl x28, x28, #2\n"
"addvl x27, x27, #2\n"
- ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
- ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
- ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
- ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
- ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
- ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
- ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"bgt 52b\n"
"53:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1h { z3.h }, p7/Z, [x12]\n"
- "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
- "ld1h { z7.h }, p7/Z, [x11]\n"
- "ld1h { z6.h }, p7/Z, [x11, #1, MUL VL]\n"
- "addvl x11, x11, #2\n"
- "ld1rqw { z5.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1rqw { z0.s }, p0/Z, [x21]\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
"uzp1 z5.h, z5.h, z5.h\n"
- "uzp1 z1.h, z1.h, z1.h\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
"uzp1 z4.h, z4.h, z4.h\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "trn1 z5.d, z5.d, z1.d\n"
- "trn1 z4.d, z4.d, z0.d\n"
- ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
- ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
+ "addvl x12, x12, #2\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x10]\n"
- ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
- ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
- ".inst 0x6467e4a9 // bfmmla z9.s, z5.h, z7.h\n"
- ".inst 0x6467e495 // bfmmla z21.s, z4.h, z7.h\n"
- ".inst 0x6466e4af // bfmmla z15.s, z5.h, z6.h\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x9]\n"
- ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
"addvl x9, x9, #2\n"
- ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
- ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x28]\n"
- ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
- ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
"addvl x28, x28, #2\n"
- ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
- ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x27]\n"
- ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
- ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
"addvl x27, x27, #2\n"
- ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
- ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
- ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
- ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
- ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
- ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
- ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"54:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp1 z4.d, z8.d, z14.d\n"
"uzp2 z8.d, z8.d, z14.d\n"
"uzp1 z14.d, z9.d, z15.d\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z15.d\n"
"uzp1 z15.d, z10.d, z16.d\n"
"uzp2 z10.d, z10.d, z16.d\n"
- "add x24, x13, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 z16.d, z11.d, z17.d\n"
"uzp2 z11.d, z11.d, z17.d\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 z17.d, z12.d, z18.d\n"
"uzp2 z12.d, z12.d, z18.d\n"
"uzp1 z18.d, z13.d, z19.d\n"
@@ -1363,9 +1360,9 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"uzp1 z30.d, z25.d, z31.d\n"
"uzp2 z25.d, z25.d, z31.d\n"
"tbz %x[flags], #1, 55f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p7/Z, [x21]\n"
"ld1rw { z0.s }, p7/Z, [x20]\n"
"fmin z4.s, p7/M, z4.s, z1.s\n"
"fmin z14.s, p7/M, z14.s, z1.s\n"
@@ -1423,24 +1420,24 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"st1w { z17.s }, p2, [x13, #4, MUL VL]\n"
"st1w { z18.s }, p1, [x13, #5, MUL VL]\n"
"addvl x13, x13, #6\n"
- "st1w { z8.s }, p6, [x24]\n"
- "st1w { z9.s }, p5, [x24, #1, MUL VL]\n"
- "st1w { z10.s }, p4, [x24, #2, MUL VL]\n"
- "st1w { z11.s }, p3, [x24, #3, MUL VL]\n"
- "st1w { z12.s }, p2, [x24, #4, MUL VL]\n"
- "st1w { z13.s }, p1, [x24, #5, MUL VL]\n"
- "st1w { z19.s }, p6, [x23]\n"
- "st1w { z26.s }, p5, [x23, #1, MUL VL]\n"
- "st1w { z27.s }, p4, [x23, #2, MUL VL]\n"
- "st1w { z28.s }, p3, [x23, #3, MUL VL]\n"
- "st1w { z29.s }, p2, [x23, #4, MUL VL]\n"
- "st1w { z30.s }, p1, [x23, #5, MUL VL]\n"
- "st1w { z20.s }, p6, [x22]\n"
- "st1w { z21.s }, p5, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p4, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p3, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p2, [x22, #4, MUL VL]\n"
- "st1w { z25.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
+ "st1w { z19.s }, p6, [x22]\n"
+ "st1w { z26.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z27.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z28.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z29.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z30.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z20.s }, p6, [x21]\n"
+ "st1w { z21.s }, p5, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p4, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p3, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p2, [x21, #4, MUL VL]\n"
+ "st1w { z25.s }, p1, [x21, #5, MUL VL]\n"
"56:" // Height 4: Writeback done
"decw x14, ALL, MUL #6\n"
"cmp x14, XZR\n"
@@ -1457,8 +1454,8 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_dot_8x3VL.hpp
deleted file mode 100644
index 70ed2ee4c7..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_dot_8x3VL.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../std_transforms_sve.hpp"
-#include "../bfloat.hpp"
-#include "../kernel_weight_format.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- const bfloat16 *, const bfloat16 *, size_t, \
- float *, int, size_t, int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void sve_ffinterleaved_bf16fp32_dot_8x3VL( ARGLIST );
-
-class cls_sve_ffinterleaved_bf16fp32_dot_8x3VL
-{
-public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<float>() * 3;
- }
- static unsigned int stripe_width()
- {
- return get_vector_length<float>();
- }
-
- static KernelWeightFormat kernel_weight_format()
- {
- return KernelWeightFormat::VL1VL_BL32;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 2;
- }
-
-
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 2, 1> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 2, 1, true> transforms_quantized = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
-
- if (std::is_same<T, bfloat16>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 20.92, 7.74, 4.14 };
- }
- }
-
-
- if (std::is_same<T, float>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 20.92, 5.18, 4.37 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=sve_ffinterleaved_bf16fp32_dot_8x3VL;
- cls_sve_ffinterleaved_bf16fp32_dot_8x3VL(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_dot_8x3VL/generic.cpp
deleted file mode 100644
index 26192718b5..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_dot_8x3VL/generic.cpp
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include <cstddef>
-#include "../../bfloat.hpp"
-
-namespace arm_gemm {
-
-void sve_ffinterleaved_bf16fp32_dot_8x3VL(
- const bfloat16 *Apanel,
- const bfloat16 *Bpanel,
- size_t B_stride,
- float *Cpanel,
- int ablocks,
- size_t N,
- int K) {
-
- struct KernelArgs {
- size_t K = {};
- const bfloat16 *Bpanel = {};
- size_t N = {};
- size_t B_stride = {};
- const bfloat16 *cur_B_ptr = {};
- } ka;
-
- ka.K = (K/2) - 1;
- ka.Bpanel = Bpanel;
- ka.N = N;
- ka.B_stride = B_stride;
-
- __asm__ __volatile__(
- "ptrue p0.b\n"
- "1:" // Height loop
- "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x25, %x[Apanel]\n"
- "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "2:" // Width loop
- "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "cntw x23, ALL, MUL #2\n"
- "mov %x[Apanel], x25\n"
- "add x22, x24, x20, LSL #1\n"
- "cmp x26, x23\n"
- "add x21, x22, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
- "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "bgt 3f\n"
- "decw x23\n"
- "mov x21, x24\n"
- "cmp x26, x23\n"
- "bgt 3f\n"
- "mov x22, x24\n"
- "3:" // B setup done
- "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
- "mov z8.b, #0x0\n"
- "mov z9.b, #0x0\n"
- "mov z10.b, #0x0\n"
- "mov z11.b, #0x0\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- "mov z12.b, #0x0\n"
- "mov z13.b, #0x0\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
- "cmp x20, #0x2\n"
- "mov z14.b, #0x0\n"
- "mov z15.b, #0x0\n"
- "mov z16.b, #0x0\n"
- "mov z17.b, #0x0\n"
- "ld1h { z4.h }, p0/Z, [x24]\n"
- "mov z18.b, #0x0\n"
- "mov z19.b, #0x0\n"
- "ld1h { z5.h }, p0/Z, [x22]\n"
- "mov z20.b, #0x0\n"
- "mov z21.b, #0x0\n"
- "ld1h { z6.h }, p0/Z, [x21]\n"
- "mov z22.b, #0x0\n"
- "mov z23.b, #0x0\n"
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "mov z28.b, #0x0\n"
- "mov z29.b, #0x0\n"
- "mov z30.b, #0x0\n"
- "mov z31.b, #0x0\n"
- "blt 5f\n"
- "4:" // main loop head
- ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- ".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n"
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #32]\n"
- ".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n"
- ".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n"
- "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #48]\n"
- ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
- ".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n"
- "sub x20, x20, #0x2\n"
- ".inst 0x6471409a // bfdot z26.s, z4.h, z1.h[2]\n"
- ".inst 0x6479409d // bfdot z29.s, z4.h, z1.h[3]\n"
- "ld1h { z4.h }, p0/Z, [x24, #1, MUL VL]\n"
- ".inst 0x646040a9 // bfdot z9.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ac // bfdot z12.s, z5.h, z0.h[1]\n"
- "addvl x24, x24, #2\n"
- ".inst 0x647040af // bfdot z15.s, z5.h, z0.h[2]\n"
- ".inst 0x647840b2 // bfdot z18.s, z5.h, z0.h[3]\n"
- "cmp x20, #0x2\n"
- ".inst 0x646140b5 // bfdot z21.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b8 // bfdot z24.s, z5.h, z1.h[1]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x647140bb // bfdot z27.s, z5.h, z1.h[2]\n"
- ".inst 0x647940be // bfdot z30.s, z5.h, z1.h[3]\n"
- "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646840cd // bfdot z13.s, z6.h, z0.h[1]\n"
- "addvl x22, x22, #2\n"
- ".inst 0x647040d0 // bfdot z16.s, z6.h, z0.h[2]\n"
- ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- ".inst 0x646140d6 // bfdot z22.s, z6.h, z1.h[0]\n"
- ".inst 0x646940d9 // bfdot z25.s, z6.h, z1.h[1]\n"
- ".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n"
- ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "ld1h { z2.h }, p0/Z, [x21, #1, MUL VL]\n"
- "addvl x21, x21, #2\n"
- ".inst 0x64634088 // bfdot z8.s, z4.h, z3.h[0]\n"
- ".inst 0x646b408b // bfdot z11.s, z4.h, z3.h[1]\n"
- ".inst 0x6473408e // bfdot z14.s, z4.h, z3.h[2]\n"
- ".inst 0x647b4091 // bfdot z17.s, z4.h, z3.h[3]\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x64674094 // bfdot z20.s, z4.h, z7.h[0]\n"
- ".inst 0x646f4097 // bfdot z23.s, z4.h, z7.h[1]\n"
- ".inst 0x6477409a // bfdot z26.s, z4.h, z7.h[2]\n"
- ".inst 0x647f409d // bfdot z29.s, z4.h, z7.h[3]\n"
- "ld1h { z4.h }, p0/Z, [x24]\n"
- ".inst 0x646340a9 // bfdot z9.s, z5.h, z3.h[0]\n"
- ".inst 0x646b40ac // bfdot z12.s, z5.h, z3.h[1]\n"
- ".inst 0x647340af // bfdot z15.s, z5.h, z3.h[2]\n"
- ".inst 0x647b40b2 // bfdot z18.s, z5.h, z3.h[3]\n"
- ".inst 0x646740b5 // bfdot z21.s, z5.h, z7.h[0]\n"
- ".inst 0x646f40b8 // bfdot z24.s, z5.h, z7.h[1]\n"
- ".inst 0x647740bb // bfdot z27.s, z5.h, z7.h[2]\n"
- ".inst 0x647f40be // bfdot z30.s, z5.h, z7.h[3]\n"
- "ld1h { z5.h }, p0/Z, [x22]\n"
- ".inst 0x6463404a // bfdot z10.s, z2.h, z3.h[0]\n"
- ".inst 0x646b404d // bfdot z13.s, z2.h, z3.h[1]\n"
- ".inst 0x64734050 // bfdot z16.s, z2.h, z3.h[2]\n"
- ".inst 0x647b4053 // bfdot z19.s, z2.h, z3.h[3]\n"
- ".inst 0x64674056 // bfdot z22.s, z2.h, z7.h[0]\n"
- ".inst 0x646f4059 // bfdot z25.s, z2.h, z7.h[1]\n"
- ".inst 0x6477405c // bfdot z28.s, z2.h, z7.h[2]\n"
- ".inst 0x647f405f // bfdot z31.s, z2.h, z7.h[3]\n"
- "ld1h { z6.h }, p0/Z, [x21]\n"
- "bge 4b\n"
- "5:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "addvl x24, x24, #1\n"
- ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- "addvl x22, x22, #1\n"
- "addvl x21, x21, #1\n"
- ".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n"
- ".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n"
- ".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n"
- ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
- ".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n"
- ".inst 0x6471409a // bfdot z26.s, z4.h, z1.h[2]\n"
- ".inst 0x6479409d // bfdot z29.s, z4.h, z1.h[3]\n"
- ".inst 0x646040a9 // bfdot z9.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ac // bfdot z12.s, z5.h, z0.h[1]\n"
- ".inst 0x647040af // bfdot z15.s, z5.h, z0.h[2]\n"
- ".inst 0x647840b2 // bfdot z18.s, z5.h, z0.h[3]\n"
- ".inst 0x646140b5 // bfdot z21.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b8 // bfdot z24.s, z5.h, z1.h[1]\n"
- ".inst 0x647140bb // bfdot z27.s, z5.h, z1.h[2]\n"
- ".inst 0x647940be // bfdot z30.s, z5.h, z1.h[3]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646840cd // bfdot z13.s, z6.h, z0.h[1]\n"
- ".inst 0x647040d0 // bfdot z16.s, z6.h, z0.h[2]\n"
- ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
- ".inst 0x646140d6 // bfdot z22.s, z6.h, z1.h[0]\n"
- ".inst 0x646940d9 // bfdot z25.s, z6.h, z1.h[1]\n"
- ".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n"
- ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "cbz x20, 6f\n"
- "ld1rqh { z4.h }, p0/Z, [%x[Apanel]]\n"
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "ld1h { z2.h }, p0/Z, [x24]\n"
- "ld1h { z1.h }, p0/Z, [x22]\n"
- "ld1h { z0.h }, p0/Z, [x21]\n"
- ".inst 0x64644048 // bfdot z8.s, z2.h, z4.h[0]\n"
- ".inst 0x646c404b // bfdot z11.s, z2.h, z4.h[1]\n"
- ".inst 0x6474404e // bfdot z14.s, z2.h, z4.h[2]\n"
- ".inst 0x647c4051 // bfdot z17.s, z2.h, z4.h[3]\n"
- ".inst 0x64634054 // bfdot z20.s, z2.h, z3.h[0]\n"
- ".inst 0x646b4057 // bfdot z23.s, z2.h, z3.h[1]\n"
- ".inst 0x6473405a // bfdot z26.s, z2.h, z3.h[2]\n"
- ".inst 0x647b405d // bfdot z29.s, z2.h, z3.h[3]\n"
- ".inst 0x64644029 // bfdot z9.s, z1.h, z4.h[0]\n"
- ".inst 0x646c402c // bfdot z12.s, z1.h, z4.h[1]\n"
- ".inst 0x6474402f // bfdot z15.s, z1.h, z4.h[2]\n"
- ".inst 0x647c4032 // bfdot z18.s, z1.h, z4.h[3]\n"
- ".inst 0x64634035 // bfdot z21.s, z1.h, z3.h[0]\n"
- ".inst 0x646b4038 // bfdot z24.s, z1.h, z3.h[1]\n"
- ".inst 0x6473403b // bfdot z27.s, z1.h, z3.h[2]\n"
- ".inst 0x647b403e // bfdot z30.s, z1.h, z3.h[3]\n"
- ".inst 0x6464400a // bfdot z10.s, z0.h, z4.h[0]\n"
- ".inst 0x646c400d // bfdot z13.s, z0.h, z4.h[1]\n"
- ".inst 0x64744010 // bfdot z16.s, z0.h, z4.h[2]\n"
- ".inst 0x647c4013 // bfdot z19.s, z0.h, z4.h[3]\n"
- ".inst 0x64634016 // bfdot z22.s, z0.h, z3.h[0]\n"
- ".inst 0x646b4019 // bfdot z25.s, z0.h, z3.h[1]\n"
- ".inst 0x6473401c // bfdot z28.s, z0.h, z3.h[2]\n"
- ".inst 0x647b401f // bfdot z31.s, z0.h, z3.h[3]\n"
- "6:" // multiply loop done
- "decw x26, ALL, MUL #3\n"
- "st1w { z8.s }, p0, [%x[Cpanel]]\n"
- "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "cmp x26, XZR\n"
- "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
- "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
- "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
- "addvl %x[Cpanel], %x[Cpanel], #16\n"
- "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
- "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
- "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
- "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
- "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
- "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
- "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
- "st1w { z24.s }, p0, [%x[Cpanel]]\n"
- "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
- "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
- "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
- "addvl %x[Cpanel], %x[Cpanel], #8\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
- : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
index 8695a9b53c..1fe5f48da6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
@@ -41,8 +41,7 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL( ARGLIST );
class cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL
{
public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
+ typedef bfloat16 operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -73,8 +72,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 4, 2> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 4, 2, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
@@ -89,8 +88,10 @@ public:
if (std::is_same<T, float>::value) {
switch (ci->get_cpu_model()) {
- default:
- return { 39.66, 5.18, 4.37 };
+ case CPUModel::V1:
+ return { 53.48, 4.23, 6.53 };
+ default:
+ return { 29.07, 2.76, 5.39 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
index 69adb67a6a..09bc24051a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -55,52 +55,52 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL(
"1:" // Height loop
"ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
"ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x25, %x[Apanel]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
"2:" // Width loop
"ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
"cntw x23, ALL, MUL #2\n"
- "mov %x[Apanel], x25\n"
"add x22, x24, x20, LSL #1\n"
- "cmp x26, x23\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
+ "cmp x26, x23\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov %x[Apanel], x25\n"
"bgt 3f\n"
"decw x23\n"
- "mov x21, x24\n"
"cmp x26, x23\n"
+ "mov x21, x24\n"
"bgt 3f\n"
"mov x22, x24\n"
"3:" // B setup done
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
- "mov z11.b, #0x0\n"
"ld1h { z4.h }, p0/Z, [x24]\n"
+ "mov z11.b, #0x0\n"
"mov z12.b, #0x0\n"
- "mov z13.b, #0x0\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- "cmp x20, #0x2\n"
+ "mov z13.b, #0x0\n"
"mov z14.b, #0x0\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
"mov z15.b, #0x0\n"
"mov z16.b, #0x0\n"
+ "ld1h { z5.h }, p0/Z, [x24, #1, MUL VL]\n"
"mov z17.b, #0x0\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
"mov z18.b, #0x0\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
"mov z19.b, #0x0\n"
- "ld1h { z5.h }, p0/Z, [x24, #1, MUL VL]\n"
"mov z20.b, #0x0\n"
+ "addvl x24, x24, #2\n"
"mov z21.b, #0x0\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
"mov z22.b, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
"mov z23.b, #0x0\n"
- "addvl x24, x24, #2\n"
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
- "add %x[Apanel], %x[Apanel], #0x30\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
"mov z28.b, #0x0\n"
@@ -114,78 +114,78 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL(
".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
- "ld1h { z3.h }, p0/Z, [x22]\n"
+ "ld1h { z7.h }, p0/Z, [x22]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
".inst 0x6464e4da // bfmmla z26.s, z6.h, z4.h\n"
".inst 0x6465e4dd // bfmmla z29.s, z6.h, z5.h\n"
- "ld1h { z4.h }, p0/Z, [x21]\n"
- "ld1h { z5.h }, p0/Z, [x21, #1, MUL VL]\n"
- ".inst 0x6463e409 // bfmmla z9.s, z0.h, z3.h\n"
+ "ld1h { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z4.h }, p0/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
+ ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
"sub x20, x20, #0x2\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- ".inst 0x6463e42f // bfmmla z15.s, z1.h, z3.h\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
"cmp x20, #0x2\n"
- ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n"
- ".inst 0x6463e455 // bfmmla z21.s, z2.h, z3.h\n"
- ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
- ".inst 0x6463e4db // bfmmla z27.s, z6.h, z3.h\n"
+ ".inst 0x6467e4db // bfmmla z27.s, z6.h, z7.h\n"
+ ".inst 0x6463e4de // bfmmla z30.s, z6.h, z3.h\n"
"ld1h { z3.h }, p0/Z, [x24]\n"
- ".inst 0x6467e4de // bfmmla z30.s, z6.h, z7.h\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- "ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
+ ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
+ ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
+ ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
+ ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- ".inst 0x6464e4dc // bfmmla z28.s, z6.h, z4.h\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #48]\n"
- ".inst 0x6465e4df // bfmmla z31.s, z6.h, z5.h\n"
+ ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
+ ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
+ "ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x6465e4dc // bfmmla z28.s, z6.h, z5.h\n"
+ ".inst 0x6464e4df // bfmmla z31.s, z6.h, z4.h\n"
+ "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #48]\n"
"ld1rqh { z6.h }, p0/Z, [%x[Apanel], #64]\n"
".inst 0x6463e408 // bfmmla z8.s, z0.h, z3.h\n"
".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
".inst 0x6463e42e // bfmmla z14.s, z1.h, z3.h\n"
- "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6463e454 // bfmmla z20.s, z2.h, z3.h\n"
- "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b7 // bfmmla z23.s, z5.h, z7.h\n"
+ "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
".inst 0x6463e4da // bfmmla z26.s, z6.h, z3.h\n"
- "ld1h { z3.h }, p0/Z, [x21, #2, MUL VL]\n"
".inst 0x6467e4dd // bfmmla z29.s, z6.h, z7.h\n"
+ "ld1h { z3.h }, p0/Z, [x21, #2, MUL VL]\n"
"ld1h { z7.h }, p0/Z, [x21, #3, MUL VL]\n"
- ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n"
- ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n"
- ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n"
+ ".inst 0x6462e409 // bfmmla z9.s, z0.h, z2.h\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6462e42f // bfmmla z15.s, z1.h, z2.h\n"
+ ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
"addvl x22, x22, #4\n"
- ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n"
- ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n"
+ ".inst 0x6462e4b5 // bfmmla z21.s, z5.h, z2.h\n"
+ ".inst 0x6464e4b8 // bfmmla z24.s, z5.h, z4.h\n"
"addvl x21, x21, #4\n"
- ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n"
- ".inst 0x6464e4db // bfmmla z27.s, z6.h, z4.h\n"
+ ".inst 0x6462e4db // bfmmla z27.s, z6.h, z2.h\n"
+ ".inst 0x6464e4de // bfmmla z30.s, z6.h, z4.h\n"
"ld1h { z4.h }, p0/Z, [x24, #2, MUL VL]\n"
- ".inst 0x6465e4de // bfmmla z30.s, z6.h, z5.h\n"
".inst 0x6463e40a // bfmmla z10.s, z0.h, z3.h\n"
- "ld1h { z5.h }, p0/Z, [x24, #3, MUL VL]\n"
".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- ".inst 0x6463e430 // bfmmla z16.s, z1.h, z3.h\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x6463e430 // bfmmla z16.s, z1.h, z3.h\n"
".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
- ".inst 0x6463e456 // bfmmla z22.s, z2.h, z3.h\n"
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
+ "ld1h { z5.h }, p0/Z, [x24, #3, MUL VL]\n"
".inst 0x6463e4dc // bfmmla z28.s, z6.h, z3.h\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
".inst 0x6467e4df // bfmmla z31.s, z6.h, z7.h\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
"addvl x24, x24, #4\n"
"bge 4b\n"
"5:" // main loop skip
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
@@ -193,52 +193,52 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL(
"ld1h { z6.h }, p0/Z, [x22]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6464e4fa // bfmmla z26.s, z7.h, z4.h\n"
+ ".inst 0x6465e4fd // bfmmla z29.s, z7.h, z5.h\n"
"ld1h { z5.h }, p0/Z, [x21]\n"
"ld1h { z4.h }, p0/Z, [x21, #1, MUL VL]\n"
".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- "addvl x22, x22, #2\n"
- ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
+ "addvl x22, x22, #2\n"
+ ".inst 0x6466e4fb // bfmmla z27.s, z7.h, z6.h\n"
+ ".inst 0x6463e4fe // bfmmla z30.s, z7.h, z3.h\n"
"addvl x21, x21, #2\n"
- ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
- ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n"
- ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n"
".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
- ".inst 0x6465e47c // bfmmla z28.s, z3.h, z5.h\n"
- ".inst 0x6464e47f // bfmmla z31.s, z3.h, z4.h\n"
+ ".inst 0x6465e4fc // bfmmla z28.s, z7.h, z5.h\n"
+ ".inst 0x6464e4ff // bfmmla z31.s, z7.h, z4.h\n"
"cbz x20, 6f\n"
"ld1h { z1.h }, p0/Z, [x24]\n"
"ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
"ld1rqh { z6.h }, p0/Z, [%x[Apanel], #16]\n"
"ld1h { z0.h }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x6460e4eb // bfmmla z11.s, z7.h, z0.h\n"
"ld1rqh { z5.h }, p0/Z, [%x[Apanel], #32]\n"
"ld1rqh { z4.h }, p0/Z, [%x[Apanel], #48]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
- "ld1h { z3.h }, p0/Z, [x22]\n"
- "ld1h { z2.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x6460e4eb // bfmmla z11.s, z7.h, z0.h\n"
".inst 0x6461e4ce // bfmmla z14.s, z6.h, z1.h\n"
".inst 0x6460e4d1 // bfmmla z17.s, z6.h, z0.h\n"
".inst 0x6461e4b4 // bfmmla z20.s, z5.h, z1.h\n"
+ "ld1h { z3.h }, p0/Z, [x22]\n"
".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
- "ld1h { z1.h }, p0/Z, [x21]\n"
+ "ld1h { z2.h }, p0/Z, [x22, #1, MUL VL]\n"
".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z1.h }, p0/Z, [x21]\n"
"ld1h { z0.h }, p0/Z, [x21, #1, MUL VL]\n"
".inst 0x6463e4e9 // bfmmla z9.s, z7.h, z3.h\n"
".inst 0x6462e4ec // bfmmla z12.s, z7.h, z2.h\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
".inst 0x6463e4cf // bfmmla z15.s, z6.h, z3.h\n"
".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
".inst 0x6463e4b5 // bfmmla z21.s, z5.h, z3.h\n"
@@ -255,53 +255,53 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL(
".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"6:" // multiply loop done
"decw x26, ALL, MUL #3\n"
- "uzp1 z2.d, z8.d, z11.d\n"
+ "uzp1 z0.d, z8.d, z11.d\n"
"uzp2 z8.d, z8.d, z11.d\n"
"uzp1 z1.d, z9.d, z12.d\n"
"uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
"uzp1 z0.d, z10.d, z13.d\n"
"uzp2 z10.d, z10.d, z13.d\n"
- "st1w { z2.s }, p0, [%x[Cpanel]]\n"
- "uzp1 z3.d, z14.d, z17.d\n"
- "uzp2 z14.d, z14.d, z17.d\n"
"st1w { z1.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "uzp1 z17.d, z15.d, z18.d\n"
- "cmp x26, XZR\n"
"st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "uzp2 z15.d, z15.d, z18.d\n"
- "uzp1 z2.d, z16.d, z19.d\n"
+ "uzp1 z2.d, z14.d, z17.d\n"
+ "uzp2 z14.d, z14.d, z17.d\n"
"st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "uzp2 z16.d, z16.d, z19.d\n"
- "uzp1 z1.d, z20.d, z23.d\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
+ "cmp x26, XZR\n"
"st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "uzp2 z20.d, z20.d, z23.d\n"
- "uzp1 z0.d, z21.d, z24.d\n"
+ "uzp2 z15.d, z15.d, z18.d\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
"st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z19.d\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z2.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp2 z20.d, z20.d, z23.d\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
"uzp2 z21.d, z21.d, z24.d\n"
- "uzp1 z23.d, z22.d, z25.d\n"
- "st1w { z3.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
"uzp2 z22.d, z22.d, z25.d\n"
- "uzp1 z19.d, z26.d, z29.d\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
- "addvl %x[Cpanel], %x[Cpanel], #16\n"
- "uzp2 z26.d, z26.d, z29.d\n"
- "uzp1 z18.d, z27.d, z30.d\n"
- "uzp2 z27.d, z27.d, z30.d\n"
- "uzp1 z17.d, z28.d, z31.d\n"
- "uzp2 z28.d, z28.d, z31.d\n"
- "st1w { z2.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
"st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
+ "uzp2 z26.d, z26.d, z29.d\n"
"st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
+ "uzp2 z27.d, z27.d, z30.d\n"
"st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
- "st1w { z1.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
- "st1w { z0.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
- "st1w { z23.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
+ "uzp2 z28.d, z28.d, z31.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
"st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
"st1w { z21.s }, p0, [%x[Cpanel]]\n"
"st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z19.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z18.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
index 22cd8be2b0..13ad2404e3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
@@ -41,8 +41,7 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx( ARGLIST );
class cls_sve_ffinterleaved_fp16_mla_8x3VL
{
public:
- typedef __fp16 lhs_operand_type;
- typedef __fp16 rhs_operand_type;
+ typedef __fp16 operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)( ARGLIST );
@@ -73,8 +72,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 1, 1> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
index 6749fc720a..0389fb043a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
@@ -54,52 +54,52 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"1:" // Height loop
"ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
"ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x25, %x[Apanel]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
"2:" // Width loop
"ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
"cnth x23, ALL, MUL #2\n"
- "mov %x[Apanel], x25\n"
"add x22, x24, x20, LSL #1\n"
- "cmp x26, x23\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
+ "cmp x26, x23\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov %x[Apanel], x25\n"
"bgt 3f\n"
"dech x23\n"
- "mov x21, x24\n"
"cmp x26, x23\n"
+ "mov x21, x24\n"
"bgt 3f\n"
"mov x22, x24\n"
"3:" // B setup done
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
- "mov z11.b, #0x0\n"
"ld1h { z0.h }, p0/Z, [x24]\n"
+ "mov z11.b, #0x0\n"
"mov z12.b, #0x0\n"
- "mov z13.b, #0x0\n"
"ld1h { z1.h }, p0/Z, [x22]\n"
- "cmp x20, #0x2\n"
+ "mov z13.b, #0x0\n"
"mov z14.b, #0x0\n"
+ "ld1h { z2.h }, p0/Z, [x21]\n"
"mov z15.b, #0x0\n"
"mov z16.b, #0x0\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
"mov z17.b, #0x0\n"
- "ld1h { z2.h }, p0/Z, [x21]\n"
"mov z18.b, #0x0\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
"mov z19.b, #0x0\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
"mov z20.b, #0x0\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
"mov z21.b, #0x0\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
"mov z22.b, #0x0\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
"mov z23.b, #0x0\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
"mov z28.b, #0x0\n"
@@ -116,12 +116,12 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"fmla z11.h, p0/M, z0.h, z4.h\n"
"fmla z12.h, p0/M, z1.h, z4.h\n"
"fmla z13.h, p0/M, z2.h, z4.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #10]\n"
"fmla z14.h, p0/M, z0.h, z5.h\n"
"fmla z15.h, p0/M, z1.h, z5.h\n"
"cmp x20, #0x2\n"
"fmla z16.h, p0/M, z2.h, z5.h\n"
- "ld1rh { z7.h }, p0/Z, [%x[Apanel], #12]\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #12]\n"
"fmla z17.h, p0/M, z0.h, z6.h\n"
"fmla z18.h, p0/M, z1.h, z6.h\n"
"fmla z19.h, p0/M, z2.h, z6.h\n"
@@ -129,60 +129,60 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"fmla z20.h, p0/M, z0.h, z3.h\n"
"fmla z21.h, p0/M, z1.h, z3.h\n"
"fmla z22.h, p0/M, z2.h, z3.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #16]\n"
- "fmla z23.h, p0/M, z0.h, z4.h\n"
- "fmla z24.h, p0/M, z1.h, z4.h\n"
- "fmla z25.h, p0/M, z2.h, z4.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n"
- "fmla z26.h, p0/M, z0.h, z7.h\n"
- "fmla z27.h, p0/M, z1.h, z7.h\n"
- "fmla z28.h, p0/M, z2.h, z7.h\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel], #20]\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z23.h, p0/M, z0.h, z7.h\n"
+ "fmla z24.h, p0/M, z1.h, z7.h\n"
+ "fmla z25.h, p0/M, z2.h, z7.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #18]\n"
+ "fmla z26.h, p0/M, z0.h, z4.h\n"
+ "fmla z27.h, p0/M, z1.h, z4.h\n"
+ "fmla z28.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #20]\n"
"fmla z29.h, p0/M, z0.h, z6.h\n"
"ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n"
"fmla z30.h, p0/M, z1.h, z6.h\n"
"fmla z31.h, p0/M, z2.h, z6.h\n"
"ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n"
"ld1h { z2.h }, p0/Z, [x21, #1, MUL VL]\n"
- "addvl x24, x24, #2\n"
+ "fmla z8.h, p0/M, z7.h, z3.h\n"
"ld1rh { z1.h }, p0/Z, [%x[Apanel], #22]\n"
- "addvl x22, x22, #2\n"
- "addvl x21, x21, #2\n"
- "fmla z8.h, p0/M, z7.h, z5.h\n"
- "fmla z11.h, p0/M, z7.h, z4.h\n"
- "fmla z9.h, p0/M, z6.h, z5.h\n"
- "fmla z12.h, p0/M, z6.h, z4.h\n"
- "fmla z10.h, p0/M, z2.h, z5.h\n"
- "fmla z13.h, p0/M, z2.h, z4.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #24]\n"
- "fmla z14.h, p0/M, z7.h, z3.h\n"
- "fmla z15.h, p0/M, z6.h, z3.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n"
- "fmla z16.h, p0/M, z2.h, z3.h\n"
- "fmla z17.h, p0/M, z7.h, z1.h\n"
+ "fmla z9.h, p0/M, z6.h, z3.h\n"
+ "fmla z10.h, p0/M, z2.h, z3.h\n"
+ "fmla z11.h, p0/M, z7.h, z5.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z12.h, p0/M, z6.h, z5.h\n"
+ "fmla z13.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #26]\n"
+ "fmla z14.h, p0/M, z7.h, z4.h\n"
+ "fmla z15.h, p0/M, z6.h, z4.h\n"
+ "addvl x24, x24, #2\n"
+ "fmla z16.h, p0/M, z2.h, z4.h\n"
"ld1rh { z0.h }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z17.h, p0/M, z7.h, z1.h\n"
"fmla z18.h, p0/M, z6.h, z1.h\n"
"fmla z19.h, p0/M, z2.h, z1.h\n"
"ld1rh { z1.h }, p0/Z, [%x[Apanel], #30]\n"
+ "addvl x22, x22, #2\n"
+ "addvl x21, x21, #2\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla z20.h, p0/M, z7.h, z5.h\n"
- "fmla z21.h, p0/M, z6.h, z5.h\n"
- "fmla z22.h, p0/M, z2.h, z5.h\n"
- "fmla z23.h, p0/M, z7.h, z4.h\n"
+ "fmla z20.h, p0/M, z7.h, z3.h\n"
+ "fmla z21.h, p0/M, z6.h, z3.h\n"
+ "fmla z22.h, p0/M, z2.h, z3.h\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
- "fmla z24.h, p0/M, z6.h, z4.h\n"
- "fmla z25.h, p0/M, z2.h, z4.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
+ "fmla z23.h, p0/M, z7.h, z5.h\n"
+ "fmla z24.h, p0/M, z6.h, z5.h\n"
+ "fmla z25.h, p0/M, z2.h, z5.h\n"
"fmla z26.h, p0/M, z7.h, z0.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
"fmla z27.h, p0/M, z6.h, z0.h\n"
"fmla z28.h, p0/M, z2.h, z0.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
"fmla z29.h, p0/M, z7.h, z1.h\n"
"ld1h { z0.h }, p0/Z, [x24]\n"
"fmla z30.h, p0/M, z6.h, z1.h\n"
"fmla z31.h, p0/M, z2.h, z1.h\n"
"ld1h { z1.h }, p0/Z, [x22]\n"
"ld1h { z2.h }, p0/Z, [x21]\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
"ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
"bge 4b\n"
"5:" // main loop skip
@@ -204,12 +204,12 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"fmla z18.h, p0/M, z1.h, z6.h\n"
"fmla z19.h, p0/M, z2.h, z6.h\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel], #14]\n"
- "addvl x21, x21, #1\n"
"fmla z20.h, p0/M, z0.h, z7.h\n"
"fmla z21.h, p0/M, z1.h, z7.h\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
+ "addvl x21, x21, #1\n"
"fmla z22.h, p0/M, z2.h, z7.h\n"
"fmla z23.h, p0/M, z0.h, z4.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
"fmla z24.h, p0/M, z1.h, z4.h\n"
"fmla z25.h, p0/M, z2.h, z4.h\n"
"fmla z26.h, p0/M, z0.h, z5.h\n"
@@ -223,19 +223,19 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"ld1h { z5.h }, p0/Z, [x22]\n"
"ld1h { z4.h }, p0/Z, [x21]\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.h, p0/M, z6.h, z3.h\n"
"ld1rh { z2.h }, p0/Z, [%x[Apanel], #2]\n"
"ld1rh { z1.h }, p0/Z, [%x[Apanel], #4]\n"
- "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n"
- "fmla z8.h, p0/M, z6.h, z3.h\n"
"fmla z9.h, p0/M, z5.h, z3.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n"
"fmla z10.h, p0/M, z4.h, z3.h\n"
"fmla z11.h, p0/M, z6.h, z2.h\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
"fmla z12.h, p0/M, z5.h, z2.h\n"
"fmla z13.h, p0/M, z4.h, z2.h\n"
- "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
"fmla z14.h, p0/M, z6.h, z1.h\n"
"fmla z15.h, p0/M, z5.h, z1.h\n"
+ "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n"
"fmla z16.h, p0/M, z4.h, z1.h\n"
"fmla z17.h, p0/M, z6.h, z0.h\n"
"ld1rh { z1.h }, p0/Z, [%x[Apanel], #12]\n"
@@ -258,10 +258,10 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"6:" // multiply loop done
"dech x26, ALL, MUL #3\n"
"st1h { z8.h }, p0, [%x[Cpanel]]\n"
+ "cmp x26, XZR\n"
"st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
"st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "cmp x26, XZR\n"
"st1h { z12.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1h { z13.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1h { z14.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
index 204bfdd658..bc23dc28b0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
@@ -54,43 +54,43 @@ void sve_ffinterleaved_fp16_mla_8x3VL(
"1:" // Height loop
"ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
"ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x25, %x[Apanel]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
"2:" // Width loop
"ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
"cnth x23, ALL, MUL #2\n"
- "mov %x[Apanel], x25\n"
"add x22, x24, x20, LSL #1\n"
- "cmp x26, x23\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
+ "cmp x26, x23\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov %x[Apanel], x25\n"
"bgt 3f\n"
"dech x23\n"
- "mov x21, x24\n"
"cmp x26, x23\n"
+ "mov x21, x24\n"
"bgt 3f\n"
"mov x22, x24\n"
"3:" // B setup done
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
- "mov z11.b, #0x0\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z11.b, #0x0\n"
"mov z12.b, #0x0\n"
- "mov z13.b, #0x0\n"
"ld1h { z2.h }, p0/Z, [x24]\n"
- "cmp x20, #0x2\n"
+ "mov z13.b, #0x0\n"
"mov z14.b, #0x0\n"
+ "ld1h { z3.h }, p0/Z, [x22]\n"
"mov z15.b, #0x0\n"
"mov z16.b, #0x0\n"
+ "ld1h { z4.h }, p0/Z, [x21]\n"
"mov z17.b, #0x0\n"
- "ld1h { z3.h }, p0/Z, [x22]\n"
"mov z18.b, #0x0\n"
"mov z19.b, #0x0\n"
- "ld1h { z4.h }, p0/Z, [x21]\n"
"mov z20.b, #0x0\n"
"mov z21.b, #0x0\n"
"mov z22.b, #0x0\n"
@@ -169,18 +169,18 @@ void sve_ffinterleaved_fp16_mla_8x3VL(
"fmla z31.h, z1.h, z7.h[7]\n"
"bge 4b\n"
"5:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x10\n"
- "addvl x24, x24, #1\n"
"fmla z8.h, z2.h, z0.h[0]\n"
- "addvl x22, x22, #1\n"
- "addvl x21, x21, #1\n"
"fmla z11.h, z2.h, z0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
"fmla z14.h, z2.h, z0.h[2]\n"
"fmla z17.h, z2.h, z0.h[3]\n"
+ "addvl x24, x24, #1\n"
"fmla z20.h, z2.h, z0.h[4]\n"
"fmla z23.h, z2.h, z0.h[5]\n"
+ "addvl x22, x22, #1\n"
"fmla z26.h, z2.h, z0.h[6]\n"
"fmla z29.h, z2.h, z0.h[7]\n"
+ "addvl x21, x21, #1\n"
"fmla z9.h, z3.h, z0.h[0]\n"
"fmla z12.h, z3.h, z0.h[1]\n"
"fmla z15.h, z3.h, z0.h[2]\n"
@@ -200,13 +200,13 @@ void sve_ffinterleaved_fp16_mla_8x3VL(
"cbz x20, 6f\n"
"ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
"ld1h { z2.h }, p0/Z, [x24]\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z8.h, z2.h, z3.h[0]\n"
"ld1h { z1.h }, p0/Z, [x22]\n"
"ld1h { z0.h }, p0/Z, [x21]\n"
- "fmla z8.h, z2.h, z3.h[0]\n"
"fmla z11.h, z2.h, z3.h[1]\n"
"fmla z14.h, z2.h, z3.h[2]\n"
"fmla z17.h, z2.h, z3.h[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
"fmla z20.h, z2.h, z3.h[4]\n"
"fmla z23.h, z2.h, z3.h[5]\n"
"fmla z26.h, z2.h, z3.h[6]\n"
@@ -230,10 +230,10 @@ void sve_ffinterleaved_fp16_mla_8x3VL(
"6:" // multiply loop done
"dech x26, ALL, MUL #3\n"
"st1h { z8.h }, p0, [%x[Cpanel]]\n"
+ "cmp x26, XZR\n"
"st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
"st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "cmp x26, XZR\n"
"st1h { z12.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1h { z13.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1h { z14.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
index ad52e2a9b3..21e811497a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
@@ -41,8 +41,7 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx( ARGLIST );
class cls_sve_ffinterleaved_fp32_mla_8x3VL
{
public:
- typedef float lhs_operand_type;
- typedef float rhs_operand_type;
+ typedef float operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -73,8 +72,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 1, 1> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
index 6135cd4bae..d67c01a574 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
@@ -54,52 +54,52 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"1:" // Height loop
"ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
"ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x25, %x[Apanel]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
"2:" // Width loop
"ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
"cntw x23, ALL, MUL #2\n"
- "mov %x[Apanel], x25\n"
"add x22, x24, x20, LSL #2\n"
- "cmp x26, x23\n"
"add x21, x22, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
+ "cmp x26, x23\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov %x[Apanel], x25\n"
"bgt 3f\n"
"decw x23\n"
- "mov x21, x24\n"
"cmp x26, x23\n"
+ "mov x21, x24\n"
"bgt 3f\n"
"mov x22, x24\n"
"3:" // B setup done
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
- "mov z11.b, #0x0\n"
"ld1w { z0.s }, p0/Z, [x24]\n"
+ "mov z11.b, #0x0\n"
"mov z12.b, #0x0\n"
- "mov z13.b, #0x0\n"
"ld1w { z1.s }, p0/Z, [x22]\n"
- "cmp x20, #0x2\n"
+ "mov z13.b, #0x0\n"
"mov z14.b, #0x0\n"
+ "ld1w { z2.s }, p0/Z, [x21]\n"
"mov z15.b, #0x0\n"
"mov z16.b, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
"mov z17.b, #0x0\n"
- "ld1w { z2.s }, p0/Z, [x21]\n"
"mov z18.b, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
"mov z19.b, #0x0\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
"mov z20.b, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
"mov z21.b, #0x0\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
"mov z22.b, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
"mov z23.b, #0x0\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
"mov z28.b, #0x0\n"
@@ -116,12 +116,12 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"fmla z11.s, p0/M, z0.s, z4.s\n"
"fmla z12.s, p0/M, z1.s, z4.s\n"
"fmla z13.s, p0/M, z2.s, z4.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #20]\n"
"fmla z14.s, p0/M, z0.s, z5.s\n"
"fmla z15.s, p0/M, z1.s, z5.s\n"
"cmp x20, #0x2\n"
"fmla z16.s, p0/M, z2.s, z5.s\n"
- "ld1rw { z7.s }, p0/Z, [%x[Apanel], #24]\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #24]\n"
"fmla z17.s, p0/M, z0.s, z6.s\n"
"fmla z18.s, p0/M, z1.s, z6.s\n"
"fmla z19.s, p0/M, z2.s, z6.s\n"
@@ -129,60 +129,60 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"fmla z20.s, p0/M, z0.s, z3.s\n"
"fmla z21.s, p0/M, z1.s, z3.s\n"
"fmla z22.s, p0/M, z2.s, z3.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #32]\n"
- "fmla z23.s, p0/M, z0.s, z4.s\n"
- "fmla z24.s, p0/M, z1.s, z4.s\n"
- "fmla z25.s, p0/M, z2.s, z4.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
- "fmla z26.s, p0/M, z0.s, z7.s\n"
- "fmla z27.s, p0/M, z1.s, z7.s\n"
- "fmla z28.s, p0/M, z2.s, z7.s\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+ "fmla z23.s, p0/M, z0.s, z7.s\n"
+ "fmla z24.s, p0/M, z1.s, z7.s\n"
+ "fmla z25.s, p0/M, z2.s, z7.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #36]\n"
+ "fmla z26.s, p0/M, z0.s, z4.s\n"
+ "fmla z27.s, p0/M, z1.s, z4.s\n"
+ "fmla z28.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #40]\n"
"fmla z29.s, p0/M, z0.s, z6.s\n"
"ld1w { z7.s }, p0/Z, [x24, #1, MUL VL]\n"
"fmla z30.s, p0/M, z1.s, z6.s\n"
"fmla z31.s, p0/M, z2.s, z6.s\n"
"ld1w { z6.s }, p0/Z, [x22, #1, MUL VL]\n"
"ld1w { z2.s }, p0/Z, [x21, #1, MUL VL]\n"
- "addvl x24, x24, #2\n"
+ "fmla z8.s, p0/M, z7.s, z3.s\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
- "addvl x22, x22, #2\n"
- "addvl x21, x21, #2\n"
- "fmla z8.s, p0/M, z7.s, z5.s\n"
- "fmla z11.s, p0/M, z7.s, z4.s\n"
- "fmla z9.s, p0/M, z6.s, z5.s\n"
- "fmla z12.s, p0/M, z6.s, z4.s\n"
- "fmla z10.s, p0/M, z2.s, z5.s\n"
- "fmla z13.s, p0/M, z2.s, z4.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #48]\n"
- "fmla z14.s, p0/M, z7.s, z3.s\n"
- "fmla z15.s, p0/M, z6.s, z3.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
- "fmla z16.s, p0/M, z2.s, z3.s\n"
- "fmla z17.s, p0/M, z7.s, z1.s\n"
+ "fmla z9.s, p0/M, z6.s, z3.s\n"
+ "fmla z10.s, p0/M, z2.s, z3.s\n"
+ "fmla z11.s, p0/M, z7.s, z5.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
+ "fmla z12.s, p0/M, z6.s, z5.s\n"
+ "fmla z13.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #52]\n"
+ "fmla z14.s, p0/M, z7.s, z4.s\n"
+ "fmla z15.s, p0/M, z6.s, z4.s\n"
+ "addvl x24, x24, #2\n"
+ "fmla z16.s, p0/M, z2.s, z4.s\n"
"ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+ "fmla z17.s, p0/M, z7.s, z1.s\n"
"fmla z18.s, p0/M, z6.s, z1.s\n"
"fmla z19.s, p0/M, z2.s, z1.s\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
+ "addvl x22, x22, #2\n"
+ "addvl x21, x21, #2\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- "fmla z20.s, p0/M, z7.s, z5.s\n"
- "fmla z21.s, p0/M, z6.s, z5.s\n"
- "fmla z22.s, p0/M, z2.s, z5.s\n"
- "fmla z23.s, p0/M, z7.s, z4.s\n"
+ "fmla z20.s, p0/M, z7.s, z3.s\n"
+ "fmla z21.s, p0/M, z6.s, z3.s\n"
+ "fmla z22.s, p0/M, z2.s, z3.s\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "fmla z24.s, p0/M, z6.s, z4.s\n"
- "fmla z25.s, p0/M, z2.s, z4.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "fmla z23.s, p0/M, z7.s, z5.s\n"
+ "fmla z24.s, p0/M, z6.s, z5.s\n"
+ "fmla z25.s, p0/M, z2.s, z5.s\n"
"fmla z26.s, p0/M, z7.s, z0.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
"fmla z27.s, p0/M, z6.s, z0.s\n"
"fmla z28.s, p0/M, z2.s, z0.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
"fmla z29.s, p0/M, z7.s, z1.s\n"
"ld1w { z0.s }, p0/Z, [x24]\n"
"fmla z30.s, p0/M, z6.s, z1.s\n"
"fmla z31.s, p0/M, z2.s, z1.s\n"
"ld1w { z1.s }, p0/Z, [x22]\n"
"ld1w { z2.s }, p0/Z, [x21]\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
"ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
"bge 4b\n"
"5:" // main loop skip
@@ -204,12 +204,12 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"fmla z18.s, p0/M, z1.s, z6.s\n"
"fmla z19.s, p0/M, z2.s, z6.s\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
- "addvl x21, x21, #1\n"
"fmla z20.s, p0/M, z0.s, z7.s\n"
"fmla z21.s, p0/M, z1.s, z7.s\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
+ "addvl x21, x21, #1\n"
"fmla z22.s, p0/M, z2.s, z7.s\n"
"fmla z23.s, p0/M, z0.s, z4.s\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
"fmla z24.s, p0/M, z1.s, z4.s\n"
"fmla z25.s, p0/M, z2.s, z4.s\n"
"fmla z26.s, p0/M, z0.s, z5.s\n"
@@ -223,19 +223,19 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"ld1w { z5.s }, p0/Z, [x22]\n"
"ld1w { z4.s }, p0/Z, [x21]\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.s, p0/M, z6.s, z3.s\n"
"ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
- "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
- "fmla z8.s, p0/M, z6.s, z3.s\n"
"fmla z9.s, p0/M, z5.s, z3.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
"fmla z10.s, p0/M, z4.s, z3.s\n"
"fmla z11.s, p0/M, z6.s, z2.s\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
"fmla z12.s, p0/M, z5.s, z2.s\n"
"fmla z13.s, p0/M, z4.s, z2.s\n"
- "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
"fmla z14.s, p0/M, z6.s, z1.s\n"
"fmla z15.s, p0/M, z5.s, z1.s\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
"fmla z16.s, p0/M, z4.s, z1.s\n"
"fmla z17.s, p0/M, z6.s, z0.s\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
@@ -258,10 +258,10 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"6:" // multiply loop done
"decw x26, ALL, MUL #3\n"
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "cmp x26, XZR\n"
"st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
"st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "cmp x26, XZR\n"
"st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
index 05262b50cb..5f29a6ce3c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
@@ -54,46 +54,46 @@ void sve_ffinterleaved_fp32_mla_8x3VL(
"1:" // Height loop
"ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
"ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x25, %x[Apanel]\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
"2:" // Width loop
"ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
"cntw x23, ALL, MUL #2\n"
- "mov %x[Apanel], x25\n"
"add x22, x24, x20, LSL #2\n"
- "cmp x26, x23\n"
"add x21, x22, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
+ "cmp x26, x23\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov %x[Apanel], x25\n"
"bgt 3f\n"
"decw x23\n"
- "mov x21, x24\n"
"cmp x26, x23\n"
+ "mov x21, x24\n"
"bgt 3f\n"
"mov x22, x24\n"
"3:" // B setup done
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
- "mov z11.b, #0x0\n"
"ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z11.b, #0x0\n"
"mov z12.b, #0x0\n"
- "mov z13.b, #0x0\n"
"ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
- "cmp x20, #0x2\n"
+ "mov z13.b, #0x0\n"
"mov z14.b, #0x0\n"
+ "ld1w { z4.s }, p0/Z, [x24]\n"
"mov z15.b, #0x0\n"
"mov z16.b, #0x0\n"
+ "ld1w { z5.s }, p0/Z, [x22]\n"
"mov z17.b, #0x0\n"
- "ld1w { z4.s }, p0/Z, [x24]\n"
"mov z18.b, #0x0\n"
+ "ld1w { z6.s }, p0/Z, [x21]\n"
"mov z19.b, #0x0\n"
- "ld1w { z5.s }, p0/Z, [x22]\n"
"mov z20.b, #0x0\n"
"mov z21.b, #0x0\n"
- "ld1w { z6.s }, p0/Z, [x21]\n"
"mov z22.b, #0x0\n"
"mov z23.b, #0x0\n"
"mov z24.b, #0x0\n"
@@ -172,18 +172,18 @@ void sve_ffinterleaved_fp32_mla_8x3VL(
"ld1w { z6.s }, p0/Z, [x21]\n"
"bge 4b\n"
"5:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "addvl x24, x24, #1\n"
"fmla z8.s, z4.s, z0.s[0]\n"
- "addvl x22, x22, #1\n"
- "addvl x21, x21, #1\n"
"fmla z11.s, z4.s, z0.s[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
"fmla z14.s, z4.s, z0.s[2]\n"
"fmla z17.s, z4.s, z0.s[3]\n"
+ "addvl x24, x24, #1\n"
"fmla z20.s, z4.s, z1.s[0]\n"
"fmla z23.s, z4.s, z1.s[1]\n"
+ "addvl x22, x22, #1\n"
"fmla z26.s, z4.s, z1.s[2]\n"
"fmla z29.s, z4.s, z1.s[3]\n"
+ "addvl x21, x21, #1\n"
"fmla z9.s, z5.s, z0.s[0]\n"
"fmla z12.s, z5.s, z0.s[1]\n"
"fmla z15.s, z5.s, z0.s[2]\n"
@@ -206,8 +206,8 @@ void sve_ffinterleaved_fp32_mla_8x3VL(
"add %x[Apanel], %x[Apanel], #0x20\n"
"ld1w { z2.s }, p0/Z, [x24]\n"
"ld1w { z1.s }, p0/Z, [x22]\n"
- "ld1w { z0.s }, p0/Z, [x21]\n"
"fmla z8.s, z2.s, z4.s[0]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
"fmla z11.s, z2.s, z4.s[1]\n"
"fmla z14.s, z2.s, z4.s[2]\n"
"fmla z17.s, z2.s, z4.s[3]\n"
@@ -234,10 +234,10 @@ void sve_ffinterleaved_fp32_mla_8x3VL(
"6:" // multiply loop done
"decw x26, ALL, MUL #3\n"
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "cmp x26, XZR\n"
"st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
"st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "cmp x26, XZR\n"
"st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
index d1f3c31de3..4de4101148 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 2> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 2> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
index 739ee24050..688e7377b9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
@@ -48,19 +48,18 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -81,7 +80,6 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -105,10 +103,10 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -141,8 +139,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov x28, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -158,98 +156,98 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x64604228 // bfdot z8.s, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ ".inst 0x64604208 // bfdot z8.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460420a // bfdot z10.s, z16.h, z0.h[0]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6460422a // bfdot z10.s, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x64684208 // bfdot z8.s, z16.h, z0.h[1]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x64684228 // bfdot z8.s, z17.h, z0.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6468420a // bfdot z10.s, z16.h, z0.h[1]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x6468422a // bfdot z10.s, z17.h, z0.h[1]\n"
".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n"
"ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n"
".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n"
+ "add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- ".inst 0x64604228 // bfdot z8.s, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ ".inst 0x64604208 // bfdot z8.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
".inst 0x6460422a // bfdot z10.s, z17.h, z0.h[0]\n"
".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x64684228 // bfdot z8.s, z17.h, z0.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x2\n"
".inst 0x6468422a // bfdot z10.s, z17.h, z0.h[1]\n"
".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x2\n"
".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n"
".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n"
".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n"
+ "addvl x10, x10, #4\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z17.s\n"
"fmin z9.s, p5/M, z9.s, z17.s\n"
@@ -271,10 +269,10 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"15:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -287,22 +285,22 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cbz x12, 16f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x12, x12, #4\n"
"b 18f\n"
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x20]\n"
"ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
@@ -321,8 +319,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov x28, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -341,38 +339,38 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z0.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x64614228 // bfdot z8.s, z17.h, z1.h[0]\n"
".inst 0x6460422c // bfdot z12.s, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64614209 // bfdot z9.s, z16.h, z1.h[0]\n"
".inst 0x6460420d // bfdot z13.s, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x6461422a // bfdot z10.s, z17.h, z1.h[0]\n"
".inst 0x6460422e // bfdot z14.s, z17.h, z0.h[0]\n"
"ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "cmp x27, #0x8\n"
".inst 0x6461420b // bfdot z11.s, z16.h, z1.h[0]\n"
".inst 0x6460420f // bfdot z15.s, z16.h, z0.h[0]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
".inst 0x64694228 // bfdot z8.s, z17.h, z1.h[1]\n"
".inst 0x6468422c // bfdot z12.s, z17.h, z0.h[1]\n"
"ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "add x25, x25, #0x10\n"
".inst 0x64694209 // bfdot z9.s, z16.h, z1.h[1]\n"
".inst 0x6468420d // bfdot z13.s, z16.h, z0.h[1]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
".inst 0x6469422a // bfdot z10.s, z17.h, z1.h[1]\n"
".inst 0x6468422e // bfdot z14.s, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
".inst 0x6469420b // bfdot z11.s, z16.h, z1.h[1]\n"
".inst 0x6468420f // bfdot z15.s, z16.h, z0.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x64714228 // bfdot z8.s, z17.h, z1.h[2]\n"
".inst 0x6470422c // bfdot z12.s, z17.h, z0.h[2]\n"
@@ -399,50 +397,50 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x2\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x64604228 // bfdot z8.s, z17.h, z0.h[0]\n"
".inst 0x6461422c // bfdot z12.s, z17.h, z1.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n"
".inst 0x6461420d // bfdot z13.s, z16.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
".inst 0x6460422a // bfdot z10.s, z17.h, z0.h[0]\n"
".inst 0x6461422e // bfdot z14.s, z17.h, z1.h[0]\n"
+ "addvl x10, x10, #4\n"
".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n"
".inst 0x6461420f // bfdot z15.s, z16.h, z1.h[0]\n"
"ble 24f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x64684228 // bfdot z8.s, z17.h, z0.h[1]\n"
".inst 0x6469422c // bfdot z12.s, z17.h, z1.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n"
".inst 0x6469420d // bfdot z13.s, z16.h, z1.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x2\n"
".inst 0x6468422a // bfdot z10.s, z17.h, z0.h[1]\n"
".inst 0x6469422e // bfdot z14.s, z17.h, z1.h[1]\n"
+ "addvl x10, x10, #4\n"
".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n"
".inst 0x6469420f // bfdot z15.s, z16.h, z1.h[1]\n"
"ble 24f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n"
".inst 0x6471422c // bfdot z12.s, z17.h, z1.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n"
".inst 0x6471420d // bfdot z13.s, z16.h, z1.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x2\n"
".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n"
".inst 0x6471422e // bfdot z14.s, z17.h, z1.h[2]\n"
+ "addvl x10, x10, #4\n"
".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n"
".inst 0x6471420f // bfdot z15.s, z16.h, z1.h[2]\n"
"ble 24f\n"
@@ -450,13 +448,13 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n"
".inst 0x6479422c // bfdot z12.s, z17.h, z1.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n"
".inst 0x6479420d // bfdot z13.s, z16.h, z1.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n"
".inst 0x6479422e // bfdot z14.s, z17.h, z1.h[3]\n"
+ "addvl x10, x10, #4\n"
".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n"
".inst 0x6479420f // bfdot z15.s, z16.h, z1.h[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
@@ -465,11 +463,11 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cmp x28, x20\n"
"bne 19b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z17.s\n"
"fmin z9.s, p5/M, z9.s, z17.s\n"
@@ -493,20 +491,20 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
"26:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"28:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -519,27 +517,27 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cbz x12, 29f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 31f\n"
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x21]\n"
"ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
@@ -566,8 +564,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov x28, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -589,37 +587,37 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z0.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
".inst 0x646242a8 // bfdot z8.s, z21.h, z2.h[0]\n"
".inst 0x646142ac // bfdot z12.s, z21.h, z1.h[0]\n"
- ".inst 0x64624289 // bfdot z9.s, z20.h, z2.h[0]\n"
- ".inst 0x6461428d // bfdot z13.s, z20.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x646042b0 // bfdot z16.s, z21.h, z0.h[0]\n"
+ ".inst 0x64624289 // bfdot z9.s, z20.h, z2.h[0]\n"
"ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6461428d // bfdot z13.s, z20.h, z1.h[0]\n"
".inst 0x64604291 // bfdot z17.s, z20.h, z0.h[0]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x27, #0x8\n"
".inst 0x646242aa // bfdot z10.s, z21.h, z2.h[0]\n"
".inst 0x646142ae // bfdot z14.s, z21.h, z1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x646042b2 // bfdot z18.s, z21.h, z0.h[0]\n"
- "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6462428b // bfdot z11.s, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6461428f // bfdot z15.s, z20.h, z1.h[0]\n"
".inst 0x64604293 // bfdot z19.s, z20.h, z0.h[0]\n"
"ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x646a42a8 // bfdot z8.s, z21.h, z2.h[1]\n"
".inst 0x646942ac // bfdot z12.s, z21.h, z1.h[1]\n"
".inst 0x646842b0 // bfdot z16.s, z21.h, z0.h[1]\n"
- "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x646a4289 // bfdot z9.s, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x6469428d // bfdot z13.s, z20.h, z1.h[1]\n"
".inst 0x64684291 // bfdot z17.s, z20.h, z0.h[1]\n"
"ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
@@ -628,31 +626,31 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646942ae // bfdot z14.s, z21.h, z1.h[1]\n"
".inst 0x646842b2 // bfdot z18.s, z21.h, z0.h[1]\n"
".inst 0x646a428b // bfdot z11.s, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
".inst 0x6469428f // bfdot z15.s, z20.h, z1.h[1]\n"
".inst 0x64684293 // bfdot z19.s, z20.h, z0.h[1]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x647242a8 // bfdot z8.s, z21.h, z2.h[2]\n"
".inst 0x647142ac // bfdot z12.s, z21.h, z1.h[2]\n"
".inst 0x647042b0 // bfdot z16.s, z21.h, z0.h[2]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x64724289 // bfdot z9.s, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x6471428d // bfdot z13.s, z20.h, z1.h[2]\n"
".inst 0x64704291 // bfdot z17.s, z20.h, z0.h[2]\n"
"ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x647242aa // bfdot z10.s, z21.h, z2.h[2]\n"
".inst 0x647142ae // bfdot z14.s, z21.h, z1.h[2]\n"
".inst 0x647042b2 // bfdot z18.s, z21.h, z0.h[2]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x6472428b // bfdot z11.s, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x6471428f // bfdot z15.s, z20.h, z1.h[2]\n"
".inst 0x64704293 // bfdot z19.s, z20.h, z0.h[2]\n"
"ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x647a42a8 // bfdot z8.s, z21.h, z2.h[3]\n"
".inst 0x647942ac // bfdot z12.s, z21.h, z1.h[3]\n"
".inst 0x647842b0 // bfdot z16.s, z21.h, z0.h[3]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x647a4289 // bfdot z9.s, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x6479428d // bfdot z13.s, z20.h, z1.h[3]\n"
".inst 0x64784291 // bfdot z17.s, z20.h, z0.h[3]\n"
"ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
@@ -665,18 +663,18 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x2\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
".inst 0x646042a8 // bfdot z8.s, z21.h, z0.h[0]\n"
".inst 0x646142ac // bfdot z12.s, z21.h, z1.h[0]\n"
- ".inst 0x64604289 // bfdot z9.s, z20.h, z0.h[0]\n"
- ".inst 0x6461428d // bfdot z13.s, z20.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x646242b0 // bfdot z16.s, z21.h, z2.h[0]\n"
+ ".inst 0x64604289 // bfdot z9.s, z20.h, z0.h[0]\n"
"ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6461428d // bfdot z13.s, z20.h, z1.h[0]\n"
".inst 0x64624291 // bfdot z17.s, z20.h, z2.h[0]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -689,12 +687,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 37f\n"
"ld1h { z21.h }, p5/Z, [x10]\n"
"ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x646842a8 // bfdot z8.s, z21.h, z0.h[1]\n"
".inst 0x646942ac // bfdot z12.s, z21.h, z1.h[1]\n"
".inst 0x646a42b0 // bfdot z16.s, z21.h, z2.h[1]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64684289 // bfdot z9.s, z20.h, z0.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
".inst 0x6469428d // bfdot z13.s, z20.h, z1.h[1]\n"
".inst 0x646a4291 // bfdot z17.s, z20.h, z2.h[1]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -708,12 +706,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 37f\n"
"ld1h { z21.h }, p5/Z, [x10]\n"
"ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x647042a8 // bfdot z8.s, z21.h, z0.h[2]\n"
".inst 0x647142ac // bfdot z12.s, z21.h, z1.h[2]\n"
".inst 0x647242b0 // bfdot z16.s, z21.h, z2.h[2]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64704289 // bfdot z9.s, z20.h, z0.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
".inst 0x6471428d // bfdot z13.s, z20.h, z1.h[2]\n"
".inst 0x64724291 // bfdot z17.s, z20.h, z2.h[2]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -730,8 +728,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647842a8 // bfdot z8.s, z21.h, z0.h[3]\n"
".inst 0x647942ac // bfdot z12.s, z21.h, z1.h[3]\n"
".inst 0x647a42b0 // bfdot z16.s, z21.h, z2.h[3]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64784289 // bfdot z9.s, z20.h, z0.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6479428d // bfdot z13.s, z20.h, z1.h[3]\n"
".inst 0x647a4291 // bfdot z17.s, z20.h, z2.h[3]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -748,12 +746,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cmp x28, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z21.s }, p5/Z, [x21]\n"
"ld1rw { z20.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z21.s\n"
"fmin z9.s, p5/M, z9.s, z21.s\n"
@@ -785,24 +783,24 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
"39:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"41:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -815,18 +813,18 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cbz x12, 42f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -834,13 +832,13 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x22]\n"
"ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
@@ -875,8 +873,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov x28, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -901,25 +899,25 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x10]\n"
- "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z3.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z2.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqh { z0.h }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x64634328 // bfdot z8.s, z25.h, z3.h[0]\n"
".inst 0x6462432c // bfdot z12.s, z25.h, z2.h[0]\n"
- ".inst 0x64634309 // bfdot z9.s, z24.h, z3.h[0]\n"
- ".inst 0x6462430d // bfdot z13.s, z24.h, z2.h[0]\n"
".inst 0x64614330 // bfdot z16.s, z25.h, z1.h[0]\n"
".inst 0x64604334 // bfdot z20.s, z25.h, z0.h[0]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x64634309 // bfdot z9.s, z24.h, z3.h[0]\n"
+ ".inst 0x6462430d // bfdot z13.s, z24.h, z2.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x64614311 // bfdot z17.s, z24.h, z1.h[0]\n"
".inst 0x64604315 // bfdot z21.s, z24.h, z0.h[0]\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -948,9 +946,9 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646a432e // bfdot z14.s, z25.h, z2.h[1]\n"
".inst 0x64694332 // bfdot z18.s, z25.h, z1.h[1]\n"
".inst 0x64684336 // bfdot z22.s, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
".inst 0x646b430b // bfdot z11.s, z24.h, z3.h[1]\n"
".inst 0x646a430f // bfdot z15.s, z24.h, z2.h[1]\n"
- "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
".inst 0x64694313 // bfdot z19.s, z24.h, z1.h[1]\n"
".inst 0x64684317 // bfdot z23.s, z24.h, z0.h[1]\n"
"ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
@@ -995,20 +993,20 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x10]\n"
- "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x2\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x64604328 // bfdot z8.s, z25.h, z0.h[0]\n"
".inst 0x6461432c // bfdot z12.s, z25.h, z1.h[0]\n"
- ".inst 0x64604309 // bfdot z9.s, z24.h, z0.h[0]\n"
- ".inst 0x6461430d // bfdot z13.s, z24.h, z1.h[0]\n"
".inst 0x64624330 // bfdot z16.s, z25.h, z2.h[0]\n"
".inst 0x64634334 // bfdot z20.s, z25.h, z3.h[0]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x64604309 // bfdot z9.s, z24.h, z0.h[0]\n"
+ ".inst 0x6461430d // bfdot z13.s, z24.h, z1.h[0]\n"
".inst 0x64624311 // bfdot z17.s, z24.h, z2.h[0]\n"
".inst 0x64634315 // bfdot z21.s, z24.h, z3.h[0]\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1024,12 +1022,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 50f\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x64684328 // bfdot z8.s, z25.h, z0.h[1]\n"
".inst 0x6469432c // bfdot z12.s, z25.h, z1.h[1]\n"
".inst 0x646a4330 // bfdot z16.s, z25.h, z2.h[1]\n"
".inst 0x646b4334 // bfdot z20.s, z25.h, z3.h[1]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
".inst 0x64684309 // bfdot z9.s, z24.h, z0.h[1]\n"
".inst 0x6469430d // bfdot z13.s, z24.h, z1.h[1]\n"
".inst 0x646a4311 // bfdot z17.s, z24.h, z2.h[1]\n"
@@ -1047,12 +1045,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 50f\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x64704328 // bfdot z8.s, z25.h, z0.h[2]\n"
".inst 0x6471432c // bfdot z12.s, z25.h, z1.h[2]\n"
".inst 0x64724330 // bfdot z16.s, z25.h, z2.h[2]\n"
".inst 0x64734334 // bfdot z20.s, z25.h, z3.h[2]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
".inst 0x64704309 // bfdot z9.s, z24.h, z0.h[2]\n"
".inst 0x6471430d // bfdot z13.s, z24.h, z1.h[2]\n"
".inst 0x64724311 // bfdot z17.s, z24.h, z2.h[2]\n"
@@ -1095,13 +1093,13 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cmp x28, x20\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 51f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z25.s }, p5/Z, [x21]\n"
"ld1rw { z24.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z25.s\n"
"fmin z9.s, p5/M, z9.s, z25.s\n"
@@ -1141,28 +1139,28 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x24]\n"
- "st1w { z21.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
"52:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"54:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1175,18 +1173,18 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cbz x12, 55f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1198,16 +1196,16 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x23, x9, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x22]\n"
@@ -1248,8 +1246,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov x28, #0x0\n"
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1277,29 +1275,29 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z29.h }, p5/Z, [x10]\n"
- "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z4.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z3.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"ld1rqh { z0.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
".inst 0x646443a8 // bfdot z8.s, z29.h, z4.h[0]\n"
".inst 0x646343ac // bfdot z12.s, z29.h, z3.h[0]\n"
- ".inst 0x64644389 // bfdot z9.s, z28.h, z4.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x646243b0 // bfdot z16.s, z29.h, z2.h[0]\n"
".inst 0x646143b4 // bfdot z20.s, z29.h, z1.h[0]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646043b8 // bfdot z24.s, z29.h, z0.h[0]\n"
- ".inst 0x6463438d // bfdot z13.s, z28.h, z3.h[0]\n"
+ ".inst 0x64644389 // bfdot z9.s, z28.h, z4.h[0]\n"
"ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6463438d // bfdot z13.s, z28.h, z3.h[0]\n"
".inst 0x64624391 // bfdot z17.s, z28.h, z2.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x64614395 // bfdot z21.s, z28.h, z1.h[0]\n"
".inst 0x64604399 // bfdot z25.s, z28.h, z0.h[0]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1308,8 +1306,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646243b2 // bfdot z18.s, z29.h, z2.h[0]\n"
".inst 0x646143b6 // bfdot z22.s, z29.h, z1.h[0]\n"
".inst 0x646043ba // bfdot z26.s, z29.h, z0.h[0]\n"
- "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6464438b // bfdot z11.s, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6463438f // bfdot z15.s, z28.h, z3.h[0]\n"
".inst 0x64624393 // bfdot z19.s, z28.h, z2.h[0]\n"
".inst 0x64614397 // bfdot z23.s, z28.h, z1.h[0]\n"
@@ -1320,8 +1318,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646a43b0 // bfdot z16.s, z29.h, z2.h[1]\n"
".inst 0x646943b4 // bfdot z20.s, z29.h, z1.h[1]\n"
".inst 0x646843b8 // bfdot z24.s, z29.h, z0.h[1]\n"
- "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x646c4389 // bfdot z9.s, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x646b438d // bfdot z13.s, z28.h, z3.h[1]\n"
".inst 0x646a4391 // bfdot z17.s, z28.h, z2.h[1]\n"
".inst 0x64694395 // bfdot z21.s, z28.h, z1.h[1]\n"
@@ -1334,8 +1332,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646943b6 // bfdot z22.s, z29.h, z1.h[1]\n"
".inst 0x646843ba // bfdot z26.s, z29.h, z0.h[1]\n"
".inst 0x646c438b // bfdot z11.s, z28.h, z4.h[1]\n"
- ".inst 0x646b438f // bfdot z15.s, z28.h, z3.h[1]\n"
"ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x646b438f // bfdot z15.s, z28.h, z3.h[1]\n"
".inst 0x646a4393 // bfdot z19.s, z28.h, z2.h[1]\n"
".inst 0x64694397 // bfdot z23.s, z28.h, z1.h[1]\n"
".inst 0x6468439b // bfdot z27.s, z28.h, z0.h[1]\n"
@@ -1345,8 +1343,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647243b0 // bfdot z16.s, z29.h, z2.h[2]\n"
".inst 0x647143b4 // bfdot z20.s, z29.h, z1.h[2]\n"
".inst 0x647043b8 // bfdot z24.s, z29.h, z0.h[2]\n"
- "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x64744389 // bfdot z9.s, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x6473438d // bfdot z13.s, z28.h, z3.h[2]\n"
".inst 0x64724391 // bfdot z17.s, z28.h, z2.h[2]\n"
".inst 0x64714395 // bfdot z21.s, z28.h, z1.h[2]\n"
@@ -1357,8 +1355,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647243b2 // bfdot z18.s, z29.h, z2.h[2]\n"
".inst 0x647143b6 // bfdot z22.s, z29.h, z1.h[2]\n"
".inst 0x647043ba // bfdot z26.s, z29.h, z0.h[2]\n"
- "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x6474438b // bfdot z11.s, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x6473438f // bfdot z15.s, z28.h, z3.h[2]\n"
".inst 0x64724393 // bfdot z19.s, z28.h, z2.h[2]\n"
".inst 0x64714397 // bfdot z23.s, z28.h, z1.h[2]\n"
@@ -1369,8 +1367,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647a43b0 // bfdot z16.s, z29.h, z2.h[3]\n"
".inst 0x647943b4 // bfdot z20.s, z29.h, z1.h[3]\n"
".inst 0x647843b8 // bfdot z24.s, z29.h, z0.h[3]\n"
- "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x647c4389 // bfdot z9.s, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x647b438d // bfdot z13.s, z28.h, z3.h[3]\n"
".inst 0x647a4391 // bfdot z17.s, z28.h, z2.h[3]\n"
".inst 0x64794395 // bfdot z21.s, z28.h, z1.h[3]\n"
@@ -1389,23 +1387,23 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z29.h }, p5/Z, [x10]\n"
- "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x2\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
".inst 0x646043a8 // bfdot z8.s, z29.h, z0.h[0]\n"
".inst 0x646143ac // bfdot z12.s, z29.h, z1.h[0]\n"
- ".inst 0x64604389 // bfdot z9.s, z28.h, z0.h[0]\n"
- ".inst 0x6461438d // bfdot z13.s, z28.h, z1.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x646243b0 // bfdot z16.s, z29.h, z2.h[0]\n"
".inst 0x646343b4 // bfdot z20.s, z29.h, z3.h[0]\n"
".inst 0x646443b8 // bfdot z24.s, z29.h, z4.h[0]\n"
- ".inst 0x64624391 // bfdot z17.s, z28.h, z2.h[0]\n"
+ ".inst 0x64604389 // bfdot z9.s, z28.h, z0.h[0]\n"
"ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6461438d // bfdot z13.s, z28.h, z1.h[0]\n"
+ ".inst 0x64624391 // bfdot z17.s, z28.h, z2.h[0]\n"
".inst 0x64634395 // bfdot z21.s, z28.h, z3.h[0]\n"
".inst 0x64644399 // bfdot z25.s, z28.h, z4.h[0]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1423,21 +1421,21 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 63f\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
"ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x646843a8 // bfdot z8.s, z29.h, z0.h[1]\n"
".inst 0x646943ac // bfdot z12.s, z29.h, z1.h[1]\n"
".inst 0x646a43b0 // bfdot z16.s, z29.h, z2.h[1]\n"
".inst 0x646b43b4 // bfdot z20.s, z29.h, z3.h[1]\n"
+ "subs x27, x27, #0x2\n"
".inst 0x646c43b8 // bfdot z24.s, z29.h, z4.h[1]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64684389 // bfdot z9.s, z28.h, z0.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6469438d // bfdot z13.s, z28.h, z1.h[1]\n"
".inst 0x646a4391 // bfdot z17.s, z28.h, z2.h[1]\n"
".inst 0x646b4395 // bfdot z21.s, z28.h, z3.h[1]\n"
".inst 0x646c4399 // bfdot z25.s, z28.h, z4.h[1]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646843aa // bfdot z10.s, z29.h, z0.h[1]\n"
"addvl x10, x10, #4\n"
+ ".inst 0x646843aa // bfdot z10.s, z29.h, z0.h[1]\n"
".inst 0x646943ae // bfdot z14.s, z29.h, z1.h[1]\n"
".inst 0x646a43b2 // bfdot z18.s, z29.h, z2.h[1]\n"
".inst 0x646b43b6 // bfdot z22.s, z29.h, z3.h[1]\n"
@@ -1450,21 +1448,21 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 63f\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
"ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x647043a8 // bfdot z8.s, z29.h, z0.h[2]\n"
".inst 0x647143ac // bfdot z12.s, z29.h, z1.h[2]\n"
".inst 0x647243b0 // bfdot z16.s, z29.h, z2.h[2]\n"
".inst 0x647343b4 // bfdot z20.s, z29.h, z3.h[2]\n"
+ "subs x27, x27, #0x2\n"
".inst 0x647443b8 // bfdot z24.s, z29.h, z4.h[2]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64704389 // bfdot z9.s, z28.h, z0.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6471438d // bfdot z13.s, z28.h, z1.h[2]\n"
".inst 0x64724391 // bfdot z17.s, z28.h, z2.h[2]\n"
".inst 0x64734395 // bfdot z21.s, z28.h, z3.h[2]\n"
".inst 0x64744399 // bfdot z25.s, z28.h, z4.h[2]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x647043aa // bfdot z10.s, z29.h, z0.h[2]\n"
"addvl x10, x10, #4\n"
+ ".inst 0x647043aa // bfdot z10.s, z29.h, z0.h[2]\n"
".inst 0x647143ae // bfdot z14.s, z29.h, z1.h[2]\n"
".inst 0x647243b2 // bfdot z18.s, z29.h, z2.h[2]\n"
".inst 0x647343b6 // bfdot z22.s, z29.h, z3.h[2]\n"
@@ -1482,8 +1480,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647a43b0 // bfdot z16.s, z29.h, z2.h[3]\n"
".inst 0x647b43b4 // bfdot z20.s, z29.h, z3.h[3]\n"
".inst 0x647c43b8 // bfdot z24.s, z29.h, z4.h[3]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x64784389 // bfdot z9.s, z28.h, z0.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6479438d // bfdot z13.s, z28.h, z1.h[3]\n"
".inst 0x647a4391 // bfdot z17.s, z28.h, z2.h[3]\n"
".inst 0x647b4395 // bfdot z21.s, z28.h, z3.h[3]\n"
@@ -1506,14 +1504,14 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cmp x28, x20\n"
"bne 58b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 64f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z29.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z29.s }, p5/Z, [x21]\n"
"ld1rw { z28.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z29.s\n"
"fmin z9.s, p5/M, z9.s, z29.s\n"
@@ -1561,22 +1559,22 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x24]\n"
- "st1w { z21.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
"65:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1584,13 +1582,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"b 80f\n"
"66:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"67:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1603,18 +1600,18 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cbz x12, 68f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1630,17 +1627,17 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x24]\n"
"ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x23]\n"
@@ -1689,8 +1686,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov x28, #0x0\n"
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1721,29 +1718,29 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z1.h }, p5/Z, [x10]\n"
- "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z7.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z6.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z5.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqh { z4.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
"ld1rqh { z2.h }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x64674028 // bfdot z8.s, z1.h, z7.h[0]\n"
".inst 0x6466402c // bfdot z12.s, z1.h, z6.h[0]\n"
- "add x21, x21, #0x10\n"
".inst 0x64654030 // bfdot z16.s, z1.h, z5.h[0]\n"
".inst 0x64644034 // bfdot z20.s, z1.h, z4.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x64634038 // bfdot z24.s, z1.h, z3.h[0]\n"
".inst 0x6462403c // bfdot z28.s, z1.h, z2.h[0]\n"
"ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
".inst 0x64674009 // bfdot z9.s, z0.h, z7.h[0]\n"
".inst 0x6466400d // bfdot z13.s, z0.h, z6.h[0]\n"
".inst 0x64654011 // bfdot z17.s, z0.h, z5.h[0]\n"
@@ -1851,24 +1848,24 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x2\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
"ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x646040e8 // bfdot z8.s, z7.h, z0.h[0]\n"
".inst 0x646140ec // bfdot z12.s, z7.h, z1.h[0]\n"
- ".inst 0x646040c9 // bfdot z9.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cd // bfdot z13.s, z6.h, z1.h[0]\n"
".inst 0x646240f0 // bfdot z16.s, z7.h, z2.h[0]\n"
".inst 0x646340f4 // bfdot z20.s, z7.h, z3.h[0]\n"
".inst 0x646440f8 // bfdot z24.s, z7.h, z4.h[0]\n"
".inst 0x646540fc // bfdot z28.s, z7.h, z5.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646040c9 // bfdot z9.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140cd // bfdot z13.s, z6.h, z1.h[0]\n"
".inst 0x646240d1 // bfdot z17.s, z6.h, z2.h[0]\n"
".inst 0x646340d5 // bfdot z21.s, z6.h, z3.h[0]\n"
".inst 0x646440d9 // bfdot z25.s, z6.h, z4.h[0]\n"
@@ -1890,23 +1887,23 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 76f\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
"ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x646840e8 // bfdot z8.s, z7.h, z0.h[1]\n"
".inst 0x646940ec // bfdot z12.s, z7.h, z1.h[1]\n"
".inst 0x646a40f0 // bfdot z16.s, z7.h, z2.h[1]\n"
".inst 0x646b40f4 // bfdot z20.s, z7.h, z3.h[1]\n"
+ "subs x27, x27, #0x2\n"
".inst 0x646c40f8 // bfdot z24.s, z7.h, z4.h[1]\n"
".inst 0x646d40fc // bfdot z28.s, z7.h, z5.h[1]\n"
- ".inst 0x646840c9 // bfdot z9.s, z6.h, z0.h[1]\n"
"ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646840c9 // bfdot z9.s, z6.h, z0.h[1]\n"
".inst 0x646940cd // bfdot z13.s, z6.h, z1.h[1]\n"
".inst 0x646a40d1 // bfdot z17.s, z6.h, z2.h[1]\n"
".inst 0x646b40d5 // bfdot z21.s, z6.h, z3.h[1]\n"
".inst 0x646c40d9 // bfdot z25.s, z6.h, z4.h[1]\n"
".inst 0x646d40dd // bfdot z29.s, z6.h, z5.h[1]\n"
"ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646840ea // bfdot z10.s, z7.h, z0.h[1]\n"
"addvl x10, x10, #4\n"
+ ".inst 0x646840ea // bfdot z10.s, z7.h, z0.h[1]\n"
".inst 0x646940ee // bfdot z14.s, z7.h, z1.h[1]\n"
".inst 0x646a40f2 // bfdot z18.s, z7.h, z2.h[1]\n"
".inst 0x646b40f6 // bfdot z22.s, z7.h, z3.h[1]\n"
@@ -1921,23 +1918,23 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ble 76f\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
"ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x2\n"
".inst 0x647040e8 // bfdot z8.s, z7.h, z0.h[2]\n"
".inst 0x647140ec // bfdot z12.s, z7.h, z1.h[2]\n"
".inst 0x647240f0 // bfdot z16.s, z7.h, z2.h[2]\n"
".inst 0x647340f4 // bfdot z20.s, z7.h, z3.h[2]\n"
+ "subs x27, x27, #0x2\n"
".inst 0x647440f8 // bfdot z24.s, z7.h, z4.h[2]\n"
".inst 0x647540fc // bfdot z28.s, z7.h, z5.h[2]\n"
- ".inst 0x647040c9 // bfdot z9.s, z6.h, z0.h[2]\n"
"ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x647040c9 // bfdot z9.s, z6.h, z0.h[2]\n"
".inst 0x647140cd // bfdot z13.s, z6.h, z1.h[2]\n"
".inst 0x647240d1 // bfdot z17.s, z6.h, z2.h[2]\n"
".inst 0x647340d5 // bfdot z21.s, z6.h, z3.h[2]\n"
".inst 0x647440d9 // bfdot z25.s, z6.h, z4.h[2]\n"
".inst 0x647540dd // bfdot z29.s, z6.h, z5.h[2]\n"
"ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x647040ea // bfdot z10.s, z7.h, z0.h[2]\n"
"addvl x10, x10, #4\n"
+ ".inst 0x647040ea // bfdot z10.s, z7.h, z0.h[2]\n"
".inst 0x647140ee // bfdot z14.s, z7.h, z1.h[2]\n"
".inst 0x647240f2 // bfdot z18.s, z7.h, z2.h[2]\n"
".inst 0x647340f6 // bfdot z22.s, z7.h, z3.h[2]\n"
@@ -1985,15 +1982,15 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cmp x28, x20\n"
"bne 71b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 77f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x21]\n"
"ld1rw { z0.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z1.s\n"
"fmin z9.s, p5/M, z9.s, z1.s\n"
@@ -2049,26 +2046,26 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x24]\n"
- "st1w { z21.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z28.s }, p4, [x22]\n"
- "st1w { z29.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
"78:" // Height 6: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -2085,8 +2082,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
index 325499b7a3..d7ca55c295 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 8, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
index 64788ab092..ad997b0034 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
@@ -48,19 +48,18 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -81,7 +80,6 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -105,10 +103,10 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -121,26 +119,26 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cbz x12, 3f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"b 5f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 4f\n"
- "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
"ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z16.d, z12.d\n"
+ "zip2 z12.d, z16.d, z12.d\n"
"ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "zip1 z8.d, z19.d, z12.d\n"
- "zip2 z12.d, z19.d, z12.d\n"
"zip1 z9.d, z18.d, z13.d\n"
"zip2 z13.d, z18.d, z13.d\n"
"zip1 z10.d, z17.d, z14.d\n"
@@ -161,8 +159,8 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -178,87 +176,87 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
- "ld1rqh { z20.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "trn1 z19.d, z20.d, z18.d\n"
- "trn2 z20.d, z20.d, z18.d\n"
- ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n"
- "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6461e669 // bfmmla z9.s, z19.h, z1.h\n"
- "ld1h { z18.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6472e66a // bfmmla z10.s, z19.h, z18.h\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n"
- ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6471e68a // bfmmla z10.s, z20.h, z17.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6470e68a // bfmmla z10.s, z20.h, z16.h\n"
+ ".inst 0x6471e68e // bfmmla z14.s, z20.h, z17.h\n"
"ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x6470e68e // bfmmla z14.s, z20.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
+ "add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
"trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z19.d\n"
".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "subs x27, x27, #0x4\n"
+ "trn2 z1.d, z1.d, z19.d\n"
".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "addvl x10, x10, #8\n"
"ble 11f\n"
- "ld1h { z16.h }, p5/Z, [x10]\n"
- "ld1h { z17.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6470e428 // bfmmla z8.s, z1.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6471e42c // bfmmla z12.s, z1.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6470e429 // bfmmla z9.s, z1.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6471e42d // bfmmla z13.s, z1.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6470e42a // bfmmla z10.s, z1.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6471e42e // bfmmla z14.s, z1.h, z17.h\n"
- "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6471e42b // bfmmla z11.s, z1.h, z17.h\n"
+ ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6470e42b // bfmmla z11.s, z1.h, z16.h\n"
- ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -269,14 +267,14 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"uzp1 z10.d, z10.d, z14.d\n"
"uzp1 z11.d, z11.d, z15.d\n"
"tbz %x[flags], #1, 12f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z17.s\n"
- "fmin z9.s, p5/M, z9.s, z17.s\n"
- "fmin z10.s, p5/M, z10.s, z17.s\n"
- "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
"fmax z8.s, p5/M, z8.s, z16.s\n"
"fmax z9.s, p5/M, z9.s, z16.s\n"
"fmax z10.s, p5/M, z10.s, z16.s\n"
@@ -293,10 +291,10 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"15:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -309,38 +307,38 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cbz x12, 16f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"b 18f\n"
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z18.s }, p4/Z, [x9]\n"
- "ld1w { z16.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z17.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x20, x9, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z8.d, z18.d, z12.d\n"
- "zip2 z12.d, z18.d, z12.d\n"
- "zip1 z9.d, z16.d, z13.d\n"
- "zip2 z13.d, z16.d, z13.d\n"
- "zip1 z10.d, z5.d, z14.d\n"
- "zip2 z14.d, z5.d, z14.d\n"
- "zip1 z11.d, z17.d, z15.d\n"
- "zip2 z15.d, z17.d, z15.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
"b 18f\n"
"17:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -355,8 +353,8 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -375,90 +373,90 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "ld1rqh { z19.h }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
- "ld1rqh { z19.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqh { z16.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "trn1 z18.d, z19.d, z16.d\n"
- "trn2 z19.d, z19.d, z16.d\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6474e64c // bfmmla z12.s, z18.h, z20.h\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n"
+ ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
+ ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x6471e669 // bfmmla z9.s, z19.h, z17.h\n"
+ ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
+ ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6471e66a // bfmmla z10.s, z19.h, z17.h\n"
+ ".inst 0x6471e68a // bfmmla z10.s, z20.h, z17.h\n"
+ ".inst 0x6470e68e // bfmmla z14.s, z20.h, z16.h\n"
"ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n"
- ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
+ ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
"ld1rqh { z19.h }, p0/Z, [x25]\n"
"trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z19.d\n"
".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "subs x27, x27, #0x4\n"
+ "trn2 z1.d, z1.d, z19.d\n"
".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "addvl x10, x10, #8\n"
"ble 24f\n"
- "ld1h { z16.h }, p5/Z, [x10]\n"
- "ld1h { z17.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6470e428 // bfmmla z8.s, z1.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6471e42c // bfmmla z12.s, z1.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6470e429 // bfmmla z9.s, z1.h, z16.h\n"
- "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6471e42d // bfmmla z13.s, z1.h, z17.h\n"
- "ld1h { z26.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6470e42a // bfmmla z10.s, z1.h, z16.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x647ae42e // bfmmla z14.s, z1.h, z26.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z22.h }, p5/Z, [x10, #6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x6471e42b // bfmmla z11.s, z1.h, z17.h\n"
+ ".inst 0x6476e42b // bfmmla z11.s, z1.h, z22.h\n"
".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
+ "addvl x10, x10, #8\n"
"24:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -467,17 +465,17 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z7.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
+ "add x25, x9, x20, LSL #2\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x26, x9, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
"fmin z7.s, p5/M, z7.s, z17.s\n"
"fmin z12.s, p5/M, z12.s, z17.s\n"
@@ -501,20 +499,20 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
"26:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"28:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -527,15 +525,15 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cbz x12, 29f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -550,36 +548,36 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z26.s }, p4/Z, [x9]\n"
- "ld1w { z25.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z24.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x21, x9, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
- "zip1 z8.d, z26.d, z12.d\n"
- "zip2 z12.d, z26.d, z12.d\n"
- "ld1w { z2.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z9.d, z25.d, z13.d\n"
- "zip2 z13.d, z25.d, z13.d\n"
- "zip1 z10.d, z24.d, z14.d\n"
- "zip2 z14.d, z24.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
- "zip1 z19.d, z2.d, z23.d\n"
- "zip2 z23.d, z2.d, z23.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
"b 31f\n"
"30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -602,8 +600,8 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -625,92 +623,92 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x10]\n"
- "ld1h { z30.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
- "ld1rqh { z29.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
"ld1rqh { z24.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
"ld1rqh { z28.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "trn1 z27.d, z29.d, z24.d\n"
- "trn2 z29.d, z29.d, z24.d\n"
- "trn1 z26.d, z28.d, z31.d\n"
- "trn2 z28.d, z28.d, z31.d\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
- ".inst 0x647ee76c // bfmmla z12.s, z27.h, z30.h\n"
".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647ee754 // bfmmla z20.s, z26.h, z30.h\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "cmp x27, #0x8\n"
".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
- "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
- ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x10]\n"
- "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
"ld1rqh { z24.h }, p0/Z, [x25]\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
"trn1 z27.d, z1.d, z24.d\n"
"trn2 z1.d, z1.d, z24.d\n"
- "trn1 z26.d, z3.d, z29.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
- ".inst 0x647ce76c // bfmmla z12.s, z27.h, z28.h\n"
- "trn2 z3.d, z3.d, z29.d\n"
".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647ce754 // bfmmla z20.s, z26.h, z28.h\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
@@ -727,9 +725,9 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
- "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
@@ -754,24 +752,24 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"uzp1 z7.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x9, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x25, x26, x20, LSL #2\n"
"uzp1 z16.d, z16.d, z20.d\n"
"uzp1 z17.d, z17.d, z21.d\n"
"uzp1 z18.d, z18.d, z22.d\n"
"uzp1 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z25.s }, p5/Z, [x21]\n"
"ld1rw { z24.s }, p5/Z, [x20]\n"
"fmin z7.s, p5/M, z7.s, z25.s\n"
"fmin z12.s, p5/M, z12.s, z25.s\n"
@@ -803,24 +801,24 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
"39:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"41:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -833,15 +831,15 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cbz x12, 42f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -856,37 +854,37 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x22, x9, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x20]\n"
- "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
@@ -913,8 +911,8 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -939,114 +937,114 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z31.h }, p5/Z, [x10]\n"
- "ld1h { z30.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
- "ld1rqh { z29.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqh { z25.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
"ld1rqh { z28.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqh { z24.h }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- "trn1 z27.d, z29.d, z25.d\n"
- "trn2 z29.d, z29.d, z25.d\n"
- "trn1 z26.d, z28.d, z24.d\n"
- "trn2 z28.d, z28.d, z24.d\n"
- ".inst 0x647fe768 // bfmmla z8.s, z27.h, z31.h\n"
- ".inst 0x647ee76c // bfmmla z12.s, z27.h, z30.h\n"
- ".inst 0x647fe750 // bfmmla z16.s, z26.h, z31.h\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647ee754 // bfmmla z20.s, z26.h, z30.h\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
- ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
- ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
- ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z29.h }, p5/Z, [x10]\n"
- "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z25.h }, p0/Z, [x25]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z24.h }, p0/Z, [x23]\n"
- "trn1 z27.d, z1.d, z25.d\n"
- "trn2 z1.d, z1.d, z25.d\n"
- "trn1 z26.d, z3.d, z24.d\n"
- ".inst 0x647de768 // bfmmla z8.s, z27.h, z29.h\n"
- ".inst 0x647ce76c // bfmmla z12.s, z27.h, z28.h\n"
- "trn2 z3.d, z3.d, z24.d\n"
- ".inst 0x647de750 // bfmmla z16.s, z26.h, z29.h\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647ce754 // bfmmla z20.s, z26.h, z28.h\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ "subs x27, x27, #0x4\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
"ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
- ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ble 50f\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
- "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
@@ -1071,17 +1069,17 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z7.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
@@ -1091,9 +1089,9 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 51f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z24.s }, p5/Z, [x21]\n"
"ld1rw { z23.s }, p5/Z, [x20]\n"
"fmin z7.s, p5/M, z7.s, z24.s\n"
"fmin z12.s, p5/M, z12.s, z24.s\n"
@@ -1133,28 +1131,28 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x25]\n"
- "st1w { z20.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x24]\n"
- "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
"52:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"54:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1167,15 +1165,15 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cbz x12, 55f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -1198,46 +1196,46 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x21]\n"
- "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x20]\n"
- "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z19.d, z24.d, z23.d\n"
"zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z24.d, z25.d, z28.d\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
@@ -1276,8 +1274,8 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1305,103 +1303,102 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z1.h }, p5/Z, [x10]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z6.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqh { z3.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
"ld1rqh { z7.h }, p0/Z, [x24]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "ld1rqh { z5.h }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
"trn1 z3.d, z7.d, z2.d\n"
"trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
"ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6461e488 // bfmmla z8.s, z4.h, z1.h\n"
+ ".inst 0x6461e4a8 // bfmmla z8.s, z5.h, z1.h\n"
".inst 0x6461e470 // bfmmla z16.s, z3.h, z1.h\n"
".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6460e48c // bfmmla z12.s, z4.h, z0.h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6460e4ac // bfmmla z12.s, z5.h, z0.h\n"
".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6461e489 // bfmmla z9.s, z4.h, z1.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6460e48d // bfmmla z13.s, z4.h, z0.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4ad // bfmmla z13.s, z5.h, z0.h\n"
".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6461e48a // bfmmla z10.s, z4.h, z1.h\n"
+ ".inst 0x6461e4aa // bfmmla z10.s, z5.h, z1.h\n"
".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6460e48e // bfmmla z14.s, z4.h, z0.h\n"
+ ".inst 0x6460e4ae // bfmmla z14.s, z5.h, z0.h\n"
".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x6461e48b // bfmmla z11.s, z4.h, z1.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
- ".inst 0x6460e48f // bfmmla z15.s, z4.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
- "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
".inst 0x6461e4f0 // bfmmla z16.s, z7.h, z1.h\n"
- ".inst 0x6461e4b8 // bfmmla z24.s, z5.h, z1.h\n"
+ ".inst 0x6461e498 // bfmmla z24.s, z4.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
".inst 0x6460e4f4 // bfmmla z20.s, z7.h, z0.h\n"
- ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
".inst 0x6461e4f1 // bfmmla z17.s, z7.h, z1.h\n"
- ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
".inst 0x6461e4f2 // bfmmla z18.s, z7.h, z1.h\n"
- ".inst 0x6461e4ba // bfmmla z26.s, z5.h, z1.h\n"
+ ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
".inst 0x6460e4f6 // bfmmla z22.s, z7.h, z0.h\n"
- ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
".inst 0x6461e4f3 // bfmmla z19.s, z7.h, z1.h\n"
- ".inst 0x6461e4bb // bfmmla z27.s, z5.h, z1.h\n"
+ ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n"
".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
".inst 0x6460e4f7 // bfmmla z23.s, z7.h, z0.h\n"
- ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z2.h }, p5/Z, [x10]\n"
- "subs x27, x27, #0x4\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z6.h }, p0/Z, [x25]\n"
+ "ld1rqh { z4.h }, p0/Z, [x25]\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
"ld1rqh { z5.h }, p0/Z, [x22]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
"trn1 z4.d, z5.d, z0.d\n"
"trn2 z5.d, z5.d, z0.d\n"
"ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
@@ -1409,6 +1406,7 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
"ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
@@ -1429,8 +1427,8 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
+ "addvl x10, x10, #8\n"
".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
@@ -1442,24 +1440,24 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
@@ -1476,20 +1474,20 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 58b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z7.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
@@ -1501,9 +1499,9 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"uzp1 z26.d, z26.d, z30.d\n"
"uzp1 z27.d, z27.d, z31.d\n"
"tbz %x[flags], #1, 64f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x21]\n"
"ld1rw { z23.s }, p5/Z, [x20]\n"
"fmin z7.s, p5/M, z7.s, z0.s\n"
"fmin z12.s, p5/M, z12.s, z0.s\n"
@@ -1551,22 +1549,22 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x25]\n"
- "st1w { z20.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x24]\n"
- "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
"65:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1574,13 +1572,12 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"b 80f\n"
"66:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"67:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1593,15 +1590,15 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cbz x12, 68f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -1624,54 +1621,54 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
"ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
"ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x21]\n"
- "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z28.s }, p4/Z, [x20]\n"
- "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
"zip2 z23.d, z24.d, z23.d\n"
"zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
@@ -1707,8 +1704,8 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1739,113 +1736,113 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z1.h }, p5/Z, [x10]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
- "ld1rqh { z6.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqh { z3.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqh { z7.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
- "ld1rqh { z5.h }, p0/Z, [x22]\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
"ld1rqh { z0.h }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
- "add x21, x21, #0x10\n"
- "trn1 z3.d, z7.d, z2.d\n"
- "trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
"ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6461e488 // bfmmla z8.s, z4.h, z1.h\n"
- ".inst 0x6461e470 // bfmmla z16.s, z3.h, z1.h\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6460e48c // bfmmla z12.s, z4.h, z0.h\n"
- ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6461e489 // bfmmla z9.s, z4.h, z1.h\n"
- ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6460e48d // bfmmla z13.s, z4.h, z0.h\n"
- ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6461e48a // bfmmla z10.s, z4.h, z1.h\n"
- ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6460e48e // bfmmla z14.s, z4.h, z0.h\n"
- ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x6461e48b // bfmmla z11.s, z4.h, z1.h\n"
- ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
- ".inst 0x6460e48f // bfmmla z15.s, z4.h, z0.h\n"
- ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
- ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
"ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
- ".inst 0x6461e4f0 // bfmmla z16.s, z7.h, z1.h\n"
- ".inst 0x6461e4b8 // bfmmla z24.s, z5.h, z1.h\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
- ".inst 0x6460e4f4 // bfmmla z20.s, z7.h, z0.h\n"
- ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
- ".inst 0x6461e4f1 // bfmmla z17.s, z7.h, z1.h\n"
- ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
- ".inst 0x6461e4f2 // bfmmla z18.s, z7.h, z1.h\n"
- ".inst 0x6461e4ba // bfmmla z26.s, z5.h, z1.h\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
- ".inst 0x6460e4f6 // bfmmla z22.s, z7.h, z0.h\n"
- ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
- ".inst 0x6461e4f3 // bfmmla z19.s, z7.h, z1.h\n"
- ".inst 0x6461e4bb // bfmmla z27.s, z5.h, z1.h\n"
- ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
- ".inst 0x6460e4f7 // bfmmla z23.s, z7.h, z0.h\n"
- ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z2.h }, p5/Z, [x10]\n"
- "subs x27, x27, #0x4\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z6.h }, p0/Z, [x25]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
"ld1rqh { z5.h }, p0/Z, [x22]\n"
"ld1rqh { z0.h }, p0/Z, [x21]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
"trn1 z4.d, z5.d, z0.d\n"
"trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
"ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
"ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
@@ -1866,8 +1863,8 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
+ "addvl x10, x10, #8\n"
".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
@@ -1879,24 +1876,24 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
- "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
@@ -1913,21 +1910,21 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 71b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z7.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
@@ -1943,9 +1940,9 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"uzp1 z30.d, z27.d, z31.d\n"
"uzp2 z27.d, z27.d, z31.d\n"
"tbz %x[flags], #1, 77f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x21]\n"
"ld1rw { z0.s }, p5/Z, [x20]\n"
"fmin z7.s, p5/M, z7.s, z1.s\n"
"fmin z12.s, p5/M, z12.s, z1.s\n"
@@ -2001,26 +1998,26 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x25]\n"
- "st1w { z20.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x24]\n"
- "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z23.s }, p4, [x23]\n"
- "st1w { z28.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z29.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z30.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x22]\n"
- "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z23.s }, p4, [x22]\n"
+ "st1w { z28.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z29.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
"78:" // Height 6: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -2037,8 +2034,8 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
index ec754b6435..86d6ecab54 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 1> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
index 93462461ad..8578733628 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
@@ -47,19 +47,18 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const __fp16 *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -104,10 +102,10 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"cmp %x[M], #0x2\n"
"bgt 25f\n"
"beq 13f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p3.h, x20, x11\n"
@@ -140,8 +138,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -160,14 +158,14 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop
"fmla z8.h, p4/M, z6.h, z0.h\n"
- "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x26, x26, #0x2\n"
- "subs x27, x27, #0x1\n"
"fmla z10.h, p4/M, z17.h, z0.h\n"
"fmla z11.h, p4/M, z16.h, z0.h\n"
+ "subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1h { z6.h }, p4/Z, [x10]\n"
"ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
@@ -175,19 +173,19 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"10:" // Height 1: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.h, p4/M, z6.h, z0.h\n"
- "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
- "addvl x10, x10, #4\n"
"cmp x28, x20\n"
"fmla z10.h, p4/M, z17.h, z0.h\n"
"fmla z11.h, p4/M, z16.h, z0.h\n"
+ "addvl x10, x10, #4\n"
"bne 6b\n"
"tbz %x[flags], #1, 11f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z17.h }, p4/Z, [x21]\n"
"ld1rh { z16.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z17.h\n"
"fmin z9.h, p4/M, z9.h, z17.h\n"
@@ -209,10 +207,10 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"bgt 2b\n"
"b 74f\n"
"13:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"14:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p3.h, x20, x11\n"
@@ -225,22 +223,22 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"cbz x12, 15f\n"
"ld1h { z8.h }, p4/Z, [x12]\n"
"ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
- "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
- "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x12, x12, #4\n"
"b 17f\n"
"15:" // Height 2: no bias
"tbz %x[flags], #0, 16f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x9]\n"
"ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #1\n"
"ld1h { z12.h }, p3/Z, [x20]\n"
"ld1h { z13.h }, p2/Z, [x20, #1, MUL VL]\n"
"ld1h { z14.h }, p1/Z, [x20, #2, MUL VL]\n"
@@ -259,8 +257,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -295,8 +293,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z10.h, p4/M, z17.h, z0.h\n"
"fmla z14.h, p4/M, z17.h, z1.h\n"
"fmla z11.h, p4/M, z16.h, z0.h\n"
- "ld1rh { z0.h }, p4/Z, [x26]\n"
"fmla z15.h, p4/M, z16.h, z1.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
"ld1h { z6.h }, p4/Z, [x10]\n"
"ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
@@ -310,19 +308,19 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z13.h, p4/M, z7.h, z1.h\n"
"ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
- "addvl x10, x10, #4\n"
"cmp x28, x20\n"
"fmla z10.h, p4/M, z17.h, z0.h\n"
"fmla z14.h, p4/M, z17.h, z1.h\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, p4/M, z16.h, z0.h\n"
"fmla z15.h, p4/M, z16.h, z1.h\n"
"bne 18b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"tbz %x[flags], #1, 23f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z17.h }, p4/Z, [x21]\n"
"ld1rh { z16.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z17.h\n"
"fmin z9.h, p4/M, z9.h, z17.h\n"
@@ -346,20 +344,20 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
"st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1h { z12.h }, p3, [x26]\n"
- "st1h { z13.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p1, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p0, [x26, #3, MUL VL]\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
"24:" // Height 2: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 14b\n"
"b 74f\n"
"25:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"26:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p3.h, x20, x11\n"
@@ -372,27 +370,27 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"cbz x12, 27f\n"
"ld1h { z8.h }, p4/Z, [x12]\n"
"ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
- "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
- "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 29f\n"
"27:" // Height 3: no bias
"tbz %x[flags], #0, 28f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x9]\n"
"ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
- "add x21, x9, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z12.h }, p3/Z, [x21]\n"
"ld1h { z13.h }, p2/Z, [x21, #1, MUL VL]\n"
"ld1h { z14.h }, p1/Z, [x21, #2, MUL VL]\n"
@@ -419,8 +417,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -451,8 +449,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"add x26, x26, #0x2\n"
"subs x27, x27, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
- "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x2\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
@@ -464,11 +462,11 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z18.h, p4/M, z21.h, z2.h\n"
"fmla z11.h, p4/M, z20.h, z0.h\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
"fmla z15.h, p4/M, z20.h, z1.h\n"
- "ld1rh { z1.h }, p4/Z, [x25]\n"
"fmla z19.h, p4/M, z20.h, z2.h\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
"ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 33b\n"
"34:" // Height 3: Multiply loop: Main loop skip
@@ -477,13 +475,13 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z12.h, p4/M, z6.h, z1.h\n"
"add x28, x28, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
- "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"ld1h { z20.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "cmp x28, x20\n"
"fmla z10.h, p4/M, z21.h, z0.h\n"
"fmla z14.h, p4/M, z21.h, z1.h\n"
"fmla z18.h, p4/M, z21.h, z2.h\n"
@@ -492,12 +490,12 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z19.h, p4/M, z20.h, z2.h\n"
"bne 30b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 35f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z21.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z21.h }, p4/Z, [x21]\n"
"ld1rh { z20.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z21.h\n"
"fmin z9.h, p4/M, z9.h, z21.h\n"
@@ -529,24 +527,24 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
"st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1h { z12.h }, p3, [x26]\n"
- "st1h { z13.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p1, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p0, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p3, [x25]\n"
- "st1h { z17.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p1, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
"36:" // Height 3: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 26b\n"
"b 74f\n"
"37:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"38:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p3.h, x20, x11\n"
@@ -559,18 +557,18 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"cbz x12, 39f\n"
"ld1h { z8.h }, p4/Z, [x12]\n"
"ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
- "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
- "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -578,13 +576,13 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"39:" // Height 4: no bias
"tbz %x[flags], #0, 40f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x9]\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
- "add x22, x9, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z12.h }, p3/Z, [x22]\n"
"ld1h { z13.h }, p2/Z, [x22, #1, MUL VL]\n"
"ld1h { z14.h }, p1/Z, [x22, #2, MUL VL]\n"
@@ -619,8 +617,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"42:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 43f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -670,6 +668,7 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z14.h, p4/M, z25.h, z1.h\n"
"fmla z18.h, p4/M, z25.h, z2.h\n"
"fmla z22.h, p4/M, z25.h, z3.h\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
"fmla z11.h, p4/M, z24.h, z0.h\n"
"fmla z15.h, p4/M, z24.h, z1.h\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -678,7 +677,6 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z23.h, p4/M, z24.h, z3.h\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
"ld1rh { z3.h }, p4/Z, [x23]\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
"ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 45b\n"
"46:" // Height 4: Multiply loop: Main loop skip
@@ -689,15 +687,15 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
"ld1h { z25.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
"ld1h { z24.h }, p4/Z, [x10, #3, MUL VL]\n"
- "cmp x28, x20\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, p4/M, z25.h, z0.h\n"
"fmla z14.h, p4/M, z25.h, z1.h\n"
- "addvl x10, x10, #4\n"
"fmla z18.h, p4/M, z25.h, z2.h\n"
"fmla z22.h, p4/M, z25.h, z3.h\n"
"fmla z11.h, p4/M, z24.h, z0.h\n"
@@ -706,13 +704,13 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z23.h, p4/M, z24.h, z3.h\n"
"bne 42b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 47f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z25.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z25.h }, p4/Z, [x21]\n"
"ld1rh { z24.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z25.h\n"
"fmin z9.h, p4/M, z9.h, z25.h\n"
@@ -752,28 +750,28 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
"st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1h { z12.h }, p3, [x26]\n"
- "st1h { z13.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p1, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p0, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p3, [x25]\n"
- "st1h { z17.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p1, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p0, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p3, [x24]\n"
- "st1h { z21.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p1, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
"48:" // Height 4: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 38b\n"
"b 74f\n"
"49:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"50:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p3.h, x20, x11\n"
@@ -786,18 +784,18 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"cbz x12, 51f\n"
"ld1h { z8.h }, p4/Z, [x12]\n"
"ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
- "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
- "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -809,16 +807,16 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"51:" // Height 5: no bias
"tbz %x[flags], #0, 52f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x9]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
- "add x23, x9, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
"ld1h { z12.h }, p3/Z, [x23]\n"
"ld1h { z13.h }, p2/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z14.h }, p1/Z, [x23, #2, MUL VL]\n"
"ld1h { z15.h }, p0/Z, [x23, #3, MUL VL]\n"
"ld1h { z16.h }, p3/Z, [x22]\n"
@@ -859,8 +857,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"54:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 55f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -903,8 +901,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"add x25, x25, #0x2\n"
"add x24, x24, #0x2\n"
"fmla z24.h, p4/M, z6.h, z4.h\n"
- "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n"
"add x23, x23, #0x2\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
@@ -922,12 +920,12 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1h { z6.h }, p4/Z, [x10]\n"
"fmla z15.h, p4/M, z28.h, z1.h\n"
- "ld1rh { z1.h }, p4/Z, [x25]\n"
"fmla z19.h, p4/M, z28.h, z2.h\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
"fmla z23.h, p4/M, z28.h, z3.h\n"
- "ld1rh { z3.h }, p4/Z, [x23]\n"
"fmla z27.h, p4/M, z28.h, z4.h\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
"ld1rh { z4.h }, p4/Z, [x22]\n"
"ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 57b\n"
@@ -938,12 +936,12 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
+ "cmp x28, x20\n"
"fmla z24.h, p4/M, z6.h, z4.h\n"
- "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
- "cmp x28, x20\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
"fmla z25.h, p4/M, z7.h, z4.h\n"
"ld1h { z28.h }, p4/Z, [x10, #3, MUL VL]\n"
@@ -960,14 +958,14 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z27.h, p4/M, z28.h, z4.h\n"
"bne 54b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 59f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z29.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z29.h }, p4/Z, [x21]\n"
"ld1rh { z28.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z29.h\n"
"fmin z9.h, p4/M, z9.h, z29.h\n"
@@ -1015,22 +1013,22 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
"st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1h { z12.h }, p3, [x26]\n"
- "st1h { z13.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p1, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p0, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p3, [x25]\n"
- "st1h { z17.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p1, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p0, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p3, [x24]\n"
- "st1h { z21.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p1, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p0, [x24, #3, MUL VL]\n"
- "st1h { z24.h }, p3, [x23]\n"
- "st1h { z25.h }, p2, [x23, #1, MUL VL]\n"
- "st1h { z26.h }, p1, [x23, #2, MUL VL]\n"
- "st1h { z27.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [x22, #3, MUL VL]\n"
"60:" // Height 5: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1038,13 +1036,12 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"b 74f\n"
"61:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0xc\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"62:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p3.h, x20, x11\n"
@@ -1057,18 +1054,18 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"cbz x12, 63f\n"
"ld1h { z8.h }, p4/Z, [x12]\n"
"ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
- "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
- "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1084,17 +1081,17 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"63:" // Height 6: no bias
"tbz %x[flags], #0, 64f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x9]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
- "add x24, x9, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
"ld1h { z12.h }, p3/Z, [x24]\n"
"ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
"ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
"ld1h { z16.h }, p3/Z, [x23]\n"
@@ -1143,8 +1140,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"66:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 67f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1232,12 +1229,12 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
+ "cmp x28, x20\n"
"fmla z24.h, p4/M, z6.h, z4.h\n"
"fmla z28.h, p4/M, z6.h, z5.h\n"
"ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
- "cmp x28, x20\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
"fmla z25.h, p4/M, z7.h, z4.h\n"
@@ -1258,15 +1255,15 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"fmla z31.h, p4/M, z7.h, z5.h\n"
"bne 66b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"tbz %x[flags], #1, 71f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z1.h }, p4/Z, [x21]\n"
"ld1rh { z0.h }, p4/Z, [x20]\n"
"fmin z8.h, p4/M, z8.h, z1.h\n"
"fmin z9.h, p4/M, z9.h, z1.h\n"
@@ -1322,26 +1319,26 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
"st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1h { z12.h }, p3, [x26]\n"
- "st1h { z13.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p1, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p0, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p3, [x25]\n"
- "st1h { z17.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p1, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p0, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p3, [x24]\n"
- "st1h { z21.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p1, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p0, [x24, #3, MUL VL]\n"
- "st1h { z24.h }, p3, [x23]\n"
- "st1h { z25.h }, p2, [x23, #1, MUL VL]\n"
- "st1h { z26.h }, p1, [x23, #2, MUL VL]\n"
- "st1h { z27.h }, p0, [x23, #3, MUL VL]\n"
- "st1h { z28.h }, p3, [x22]\n"
- "st1h { z29.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z30.h }, p1, [x22, #2, MUL VL]\n"
- "st1h { z31.h }, p0, [x22, #3, MUL VL]\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [x22, #3, MUL VL]\n"
+ "st1h { z28.h }, p3, [x21]\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z30.h }, p1, [x21, #2, MUL VL]\n"
+ "st1h { z31.h }, p0, [x21, #3, MUL VL]\n"
"72:" // Height 6: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1358,8 +1355,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"74:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
index 231472bcd0..64c4dfcd0e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
@@ -47,19 +47,18 @@ void sve_hybrid_fp16_mla_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const __fp16 *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void sve_hybrid_fp16_mla_6x4VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -104,10 +102,10 @@ void sve_hybrid_fp16_mla_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p4.h, x20, x11\n"
@@ -140,8 +138,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -157,56 +155,53 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "fmla z8.h, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[0]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[1]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z17.h, z0.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[1]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z17.h, z0.h[1]\n"
"fmla z11.h, z16.h, z0.h[1]\n"
"ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z17.h, z0.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z17.h, z0.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z17.h, z0.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z17.h, z0.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
"fmla z11.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z17.h, z0.h[4]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z17.h, z0.h[4]\n"
- "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z17.h, z0.h[5]\n"
- "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"fmla z10.h, z17.h, z0.h[5]\n"
@@ -214,118 +209,121 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z17.h, z0.h[6]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z17.h, z0.h[6]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z17.h, z0.h[7]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
"fmla z10.h, z17.h, z0.h[7]\n"
"fmla z11.h, z16.h, z0.h[7]\n"
+ "add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- "fmla z8.h, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"fmla z10.h, z17.h, z0.h[0]\n"
"fmla z11.h, z16.h, z0.h[0]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[1]\n"
"fmla z11.h, z16.h, z0.h[1]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[2]\n"
"fmla z11.h, z16.h, z0.h[2]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[3]\n"
"fmla z11.h, z16.h, z0.h[3]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[4]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[4]\n"
"fmla z11.h, z16.h, z0.h[4]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[5]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[5]\n"
"fmla z11.h, z16.h, z0.h[5]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[6]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[6]\n"
"fmla z11.h, z16.h, z0.h[6]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z17.h, z0.h[7]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"fmla z10.h, z17.h, z0.h[7]\n"
"fmla z11.h, z16.h, z0.h[7]\n"
+ "addvl x10, x10, #4\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z17.h }, p5/Z, [x21]\n"
"ld1rh { z16.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z17.h\n"
"fmin z9.h, p5/M, z9.h, z17.h\n"
@@ -347,10 +345,10 @@ void sve_hybrid_fp16_mla_6x4VL (
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"15:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p4.h, x20, x11\n"
@@ -363,22 +361,22 @@ void sve_hybrid_fp16_mla_6x4VL (
"cbz x12, 16f\n"
"ld1h { z8.h }, p5/Z, [x12]\n"
"ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x12, x12, #4\n"
"b 18f\n"
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x9]\n"
"ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #1\n"
"ld1h { z12.h }, p4/Z, [x20]\n"
"ld1h { z13.h }, p3/Z, [x20, #1, MUL VL]\n"
"ld1h { z14.h }, p2/Z, [x20, #2, MUL VL]\n"
@@ -397,8 +395,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -417,38 +415,38 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z0.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z17.h, z1.h[0]\n"
"fmla z12.h, z17.h, z0.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z1.h[0]\n"
"fmla z13.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z17.h, z1.h[0]\n"
"fmla z14.h, z17.h, z0.h[0]\n"
"ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "cmp x27, #0x8\n"
"fmla z11.h, z16.h, z1.h[0]\n"
"fmla z15.h, z16.h, z0.h[0]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
"fmla z8.h, z17.h, z1.h[1]\n"
"fmla z12.h, z17.h, z0.h[1]\n"
"ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z16.h, z1.h[1]\n"
"fmla z13.h, z16.h, z0.h[1]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"fmla z10.h, z17.h, z1.h[1]\n"
"fmla z14.h, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z16.h, z1.h[1]\n"
"fmla z15.h, z16.h, z0.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z17.h, z1.h[2]\n"
"fmla z12.h, z17.h, z0.h[2]\n"
@@ -495,9 +493,9 @@ void sve_hybrid_fp16_mla_6x4VL (
"addvl x10, x10, #16\n"
"fmla z10.h, z17.h, z1.h[5]\n"
"fmla z14.h, z17.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z16.h, z1.h[5]\n"
"fmla z15.h, z16.h, z0.h[5]\n"
- "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z17.h, z1.h[6]\n"
"fmla z12.h, z17.h, z0.h[6]\n"
@@ -524,110 +522,110 @@ void sve_hybrid_fp16_mla_6x4VL (
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z17.h, z0.h[0]\n"
"fmla z12.h, z17.h, z1.h[0]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[0]\n"
"fmla z13.h, z16.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"fmla z10.h, z17.h, z0.h[0]\n"
"fmla z14.h, z17.h, z1.h[0]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z16.h, z0.h[0]\n"
"fmla z15.h, z16.h, z1.h[0]\n"
"ble 24f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[1]\n"
"fmla z12.h, z17.h, z1.h[1]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[1]\n"
"fmla z13.h, z16.h, z1.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[1]\n"
"fmla z14.h, z17.h, z1.h[1]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z16.h, z0.h[1]\n"
"fmla z15.h, z16.h, z1.h[1]\n"
"ble 24f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[2]\n"
"fmla z12.h, z17.h, z1.h[2]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[2]\n"
"fmla z13.h, z16.h, z1.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[2]\n"
"fmla z14.h, z17.h, z1.h[2]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z16.h, z0.h[2]\n"
"fmla z15.h, z16.h, z1.h[2]\n"
"ble 24f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[3]\n"
"fmla z12.h, z17.h, z1.h[3]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[3]\n"
"fmla z13.h, z16.h, z1.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[3]\n"
"fmla z14.h, z17.h, z1.h[3]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z16.h, z0.h[3]\n"
"fmla z15.h, z16.h, z1.h[3]\n"
"ble 24f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[4]\n"
"fmla z12.h, z17.h, z1.h[4]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[4]\n"
"fmla z13.h, z16.h, z1.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[4]\n"
"fmla z14.h, z17.h, z1.h[4]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z16.h, z0.h[4]\n"
"fmla z15.h, z16.h, z1.h[4]\n"
"ble 24f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[5]\n"
"fmla z12.h, z17.h, z1.h[5]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[5]\n"
"fmla z13.h, z16.h, z1.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[5]\n"
"fmla z14.h, z17.h, z1.h[5]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z16.h, z0.h[5]\n"
"fmla z15.h, z16.h, z1.h[5]\n"
"ble 24f\n"
"ld1h { z17.h }, p5/Z, [x10]\n"
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z17.h, z0.h[6]\n"
"fmla z12.h, z17.h, z1.h[6]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[6]\n"
"fmla z13.h, z16.h, z1.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.h, z17.h, z0.h[6]\n"
"fmla z14.h, z17.h, z1.h[6]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z16.h, z0.h[6]\n"
"fmla z15.h, z16.h, z1.h[6]\n"
"ble 24f\n"
@@ -635,13 +633,13 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z17.h, z0.h[7]\n"
"fmla z12.h, z17.h, z1.h[7]\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z16.h, z0.h[7]\n"
"fmla z13.h, z16.h, z1.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"fmla z10.h, z17.h, z0.h[7]\n"
"fmla z14.h, z17.h, z1.h[7]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z16.h, z0.h[7]\n"
"fmla z15.h, z16.h, z1.h[7]\n"
"24:" // Height 2: Multiply loop: multiply skip
@@ -650,11 +648,11 @@ void sve_hybrid_fp16_mla_6x4VL (
"cmp x28, x20\n"
"bne 19b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z17.h }, p5/Z, [x21]\n"
"ld1rh { z16.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z17.h\n"
"fmin z9.h, p5/M, z9.h, z17.h\n"
@@ -678,20 +676,20 @@ void sve_hybrid_fp16_mla_6x4VL (
"st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
"st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1h { z12.h }, p4, [x26]\n"
- "st1h { z13.h }, p3, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x26, #3, MUL VL]\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
"26:" // Height 2: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"28:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p4.h, x20, x11\n"
@@ -704,27 +702,27 @@ void sve_hybrid_fp16_mla_6x4VL (
"cbz x12, 29f\n"
"ld1h { z8.h }, p5/Z, [x12]\n"
"ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 31f\n"
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x9]\n"
"ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
- "add x21, x9, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z12.h }, p4/Z, [x21]\n"
"ld1h { z13.h }, p3/Z, [x21, #1, MUL VL]\n"
"ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n"
@@ -751,8 +749,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -774,37 +772,37 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z0.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
"fmla z8.h, z21.h, z2.h[0]\n"
"fmla z12.h, z21.h, z1.h[0]\n"
- "fmla z9.h, z20.h, z2.h[0]\n"
- "fmla z13.h, z20.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z16.h, z21.h, z0.h[0]\n"
+ "fmla z9.h, z20.h, z2.h[0]\n"
"ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
"fmla z17.h, z20.h, z0.h[0]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x27, #0x8\n"
"fmla z10.h, z21.h, z2.h[0]\n"
"fmla z14.h, z21.h, z1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z18.h, z21.h, z0.h[0]\n"
- "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
"fmla z15.h, z20.h, z1.h[0]\n"
"fmla z19.h, z20.h, z0.h[0]\n"
"ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[1]\n"
"fmla z12.h, z21.h, z1.h[1]\n"
"fmla z16.h, z21.h, z0.h[1]\n"
- "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[1]\n"
"fmla z17.h, z20.h, z0.h[1]\n"
"ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
@@ -813,63 +811,63 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z14.h, z21.h, z1.h[1]\n"
"fmla z18.h, z21.h, z0.h[1]\n"
"fmla z11.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z15.h, z20.h, z1.h[1]\n"
"fmla z19.h, z20.h, z0.h[1]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[2]\n"
"fmla z12.h, z21.h, z1.h[2]\n"
"fmla z16.h, z21.h, z0.h[2]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[2]\n"
"fmla z17.h, z20.h, z0.h[2]\n"
"ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z21.h, z2.h[2]\n"
"fmla z14.h, z21.h, z1.h[2]\n"
"fmla z18.h, z21.h, z0.h[2]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z15.h, z20.h, z1.h[2]\n"
"fmla z19.h, z20.h, z0.h[2]\n"
"ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[3]\n"
"fmla z12.h, z21.h, z1.h[3]\n"
"fmla z16.h, z21.h, z0.h[3]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[3]\n"
"fmla z17.h, z20.h, z0.h[3]\n"
"ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z21.h, z2.h[3]\n"
"fmla z14.h, z21.h, z1.h[3]\n"
"fmla z18.h, z21.h, z0.h[3]\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
"fmla z11.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
"fmla z15.h, z20.h, z1.h[3]\n"
"fmla z19.h, z20.h, z0.h[3]\n"
"ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[4]\n"
"fmla z12.h, z21.h, z1.h[4]\n"
"fmla z16.h, z21.h, z0.h[4]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[4]\n"
"fmla z17.h, z20.h, z0.h[4]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z21.h, z2.h[4]\n"
"fmla z14.h, z21.h, z1.h[4]\n"
"fmla z18.h, z21.h, z0.h[4]\n"
- "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z15.h, z20.h, z1.h[4]\n"
"fmla z19.h, z20.h, z0.h[4]\n"
"ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[5]\n"
"fmla z12.h, z21.h, z1.h[5]\n"
"fmla z16.h, z21.h, z0.h[5]\n"
- "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[5]\n"
"fmla z17.h, z20.h, z0.h[5]\n"
"ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
@@ -878,31 +876,31 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z14.h, z21.h, z1.h[5]\n"
"fmla z18.h, z21.h, z0.h[5]\n"
"fmla z11.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z15.h, z20.h, z1.h[5]\n"
"fmla z19.h, z20.h, z0.h[5]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[6]\n"
"fmla z12.h, z21.h, z1.h[6]\n"
"fmla z16.h, z21.h, z0.h[6]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[6]\n"
"fmla z17.h, z20.h, z0.h[6]\n"
"ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z21.h, z2.h[6]\n"
"fmla z14.h, z21.h, z1.h[6]\n"
"fmla z18.h, z21.h, z0.h[6]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z15.h, z20.h, z1.h[6]\n"
"fmla z19.h, z20.h, z0.h[6]\n"
"ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z21.h, z2.h[7]\n"
"fmla z12.h, z21.h, z1.h[7]\n"
"fmla z16.h, z21.h, z0.h[7]\n"
- "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z20.h, z2.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[7]\n"
"fmla z17.h, z20.h, z0.h[7]\n"
"ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
@@ -915,18 +913,18 @@ void sve_hybrid_fp16_mla_6x4VL (
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z21.h }, p5/Z, [x10]\n"
- "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
"fmla z8.h, z21.h, z0.h[0]\n"
"fmla z12.h, z21.h, z1.h[0]\n"
- "fmla z9.h, z20.h, z0.h[0]\n"
- "fmla z13.h, z20.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z16.h, z21.h, z2.h[0]\n"
+ "fmla z9.h, z20.h, z0.h[0]\n"
"ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
"fmla z17.h, z20.h, z2.h[0]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -939,12 +937,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 37f\n"
"ld1h { z21.h }, p5/Z, [x10]\n"
"ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z21.h, z0.h[1]\n"
"fmla z12.h, z21.h, z1.h[1]\n"
"fmla z16.h, z21.h, z2.h[1]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z20.h, z0.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[1]\n"
"fmla z17.h, z20.h, z2.h[1]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -958,12 +956,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 37f\n"
"ld1h { z21.h }, p5/Z, [x10]\n"
"ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z21.h, z0.h[2]\n"
"fmla z12.h, z21.h, z1.h[2]\n"
"fmla z16.h, z21.h, z2.h[2]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z20.h, z0.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[2]\n"
"fmla z17.h, z20.h, z2.h[2]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -977,12 +975,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 37f\n"
"ld1h { z21.h }, p5/Z, [x10]\n"
"ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z21.h, z0.h[3]\n"
"fmla z12.h, z21.h, z1.h[3]\n"
"fmla z16.h, z21.h, z2.h[3]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z20.h, z0.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[3]\n"
"fmla z17.h, z20.h, z2.h[3]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -996,12 +994,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 37f\n"
"ld1h { z21.h }, p5/Z, [x10]\n"
"ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z21.h, z0.h[4]\n"
"fmla z12.h, z21.h, z1.h[4]\n"
"fmla z16.h, z21.h, z2.h[4]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z20.h, z0.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[4]\n"
"fmla z17.h, z20.h, z2.h[4]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1015,12 +1013,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 37f\n"
"ld1h { z21.h }, p5/Z, [x10]\n"
"ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z21.h, z0.h[5]\n"
"fmla z12.h, z21.h, z1.h[5]\n"
"fmla z16.h, z21.h, z2.h[5]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z20.h, z0.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[5]\n"
"fmla z17.h, z20.h, z2.h[5]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1034,12 +1032,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 37f\n"
"ld1h { z21.h }, p5/Z, [x10]\n"
"ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z21.h, z0.h[6]\n"
"fmla z12.h, z21.h, z1.h[6]\n"
"fmla z16.h, z21.h, z2.h[6]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z20.h, z0.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.h, z20.h, z1.h[6]\n"
"fmla z17.h, z20.h, z2.h[6]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1056,8 +1054,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z8.h, z21.h, z0.h[7]\n"
"fmla z12.h, z21.h, z1.h[7]\n"
"fmla z16.h, z21.h, z2.h[7]\n"
- "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z20.h, z0.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z20.h, z1.h[7]\n"
"fmla z17.h, z20.h, z2.h[7]\n"
"ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1074,12 +1072,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"cmp x28, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z21.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z21.h }, p5/Z, [x21]\n"
"ld1rh { z20.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z21.h\n"
"fmin z9.h, p5/M, z9.h, z21.h\n"
@@ -1111,24 +1109,24 @@ void sve_hybrid_fp16_mla_6x4VL (
"st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
"st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1h { z12.h }, p4, [x26]\n"
- "st1h { z13.h }, p3, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x25]\n"
- "st1h { z17.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
"39:" // Height 3: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"41:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p4.h, x20, x11\n"
@@ -1141,18 +1139,18 @@ void sve_hybrid_fp16_mla_6x4VL (
"cbz x12, 42f\n"
"ld1h { z8.h }, p5/Z, [x12]\n"
"ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1160,13 +1158,13 @@ void sve_hybrid_fp16_mla_6x4VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
- "add x22, x9, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z12.h }, p4/Z, [x22]\n"
"ld1h { z13.h }, p3/Z, [x22, #1, MUL VL]\n"
"ld1h { z14.h }, p2/Z, [x22, #2, MUL VL]\n"
@@ -1201,8 +1199,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1227,25 +1225,25 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x10]\n"
- "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z3.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z2.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqh { z0.h }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z25.h, z3.h[0]\n"
"fmla z12.h, z25.h, z2.h[0]\n"
- "fmla z9.h, z24.h, z3.h[0]\n"
- "fmla z13.h, z24.h, z2.h[0]\n"
"fmla z16.h, z25.h, z1.h[0]\n"
"fmla z20.h, z25.h, z0.h[0]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z9.h, z24.h, z3.h[0]\n"
+ "fmla z13.h, z24.h, z2.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z17.h, z24.h, z1.h[0]\n"
"fmla z21.h, z24.h, z0.h[0]\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1274,9 +1272,9 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z14.h, z25.h, z2.h[1]\n"
"fmla z18.h, z25.h, z1.h[1]\n"
"fmla z22.h, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z24.h, z3.h[1]\n"
"fmla z15.h, z24.h, z2.h[1]\n"
- "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z19.h, z24.h, z1.h[1]\n"
"fmla z23.h, z24.h, z0.h[1]\n"
"ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
@@ -1355,9 +1353,9 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z14.h, z25.h, z2.h[5]\n"
"fmla z18.h, z25.h, z1.h[5]\n"
"fmla z22.h, z25.h, z0.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z24.h, z3.h[5]\n"
"fmla z15.h, z24.h, z2.h[5]\n"
- "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z19.h, z24.h, z1.h[5]\n"
"fmla z23.h, z24.h, z0.h[5]\n"
"ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
@@ -1402,20 +1400,20 @@ void sve_hybrid_fp16_mla_6x4VL (
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x10]\n"
- "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z25.h, z0.h[0]\n"
"fmla z12.h, z25.h, z1.h[0]\n"
- "fmla z9.h, z24.h, z0.h[0]\n"
- "fmla z13.h, z24.h, z1.h[0]\n"
"fmla z16.h, z25.h, z2.h[0]\n"
"fmla z20.h, z25.h, z3.h[0]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[0]\n"
+ "fmla z13.h, z24.h, z1.h[0]\n"
"fmla z17.h, z24.h, z2.h[0]\n"
"fmla z21.h, z24.h, z3.h[0]\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1431,12 +1429,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 50f\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z25.h, z0.h[1]\n"
"fmla z12.h, z25.h, z1.h[1]\n"
"fmla z16.h, z25.h, z2.h[1]\n"
"fmla z20.h, z25.h, z3.h[1]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[1]\n"
"fmla z13.h, z24.h, z1.h[1]\n"
"fmla z17.h, z24.h, z2.h[1]\n"
@@ -1454,12 +1452,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 50f\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z25.h, z0.h[2]\n"
"fmla z12.h, z25.h, z1.h[2]\n"
"fmla z16.h, z25.h, z2.h[2]\n"
"fmla z20.h, z25.h, z3.h[2]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[2]\n"
"fmla z13.h, z24.h, z1.h[2]\n"
"fmla z17.h, z24.h, z2.h[2]\n"
@@ -1477,12 +1475,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 50f\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z25.h, z0.h[3]\n"
"fmla z12.h, z25.h, z1.h[3]\n"
"fmla z16.h, z25.h, z2.h[3]\n"
"fmla z20.h, z25.h, z3.h[3]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[3]\n"
"fmla z13.h, z24.h, z1.h[3]\n"
"fmla z17.h, z24.h, z2.h[3]\n"
@@ -1500,12 +1498,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 50f\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z25.h, z0.h[4]\n"
"fmla z12.h, z25.h, z1.h[4]\n"
"fmla z16.h, z25.h, z2.h[4]\n"
"fmla z20.h, z25.h, z3.h[4]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[4]\n"
"fmla z13.h, z24.h, z1.h[4]\n"
"fmla z17.h, z24.h, z2.h[4]\n"
@@ -1523,12 +1521,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 50f\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z25.h, z0.h[5]\n"
"fmla z12.h, z25.h, z1.h[5]\n"
"fmla z16.h, z25.h, z2.h[5]\n"
"fmla z20.h, z25.h, z3.h[5]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[5]\n"
"fmla z13.h, z24.h, z1.h[5]\n"
"fmla z17.h, z24.h, z2.h[5]\n"
@@ -1546,12 +1544,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 50f\n"
"ld1h { z25.h }, p5/Z, [x10]\n"
"ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z25.h, z0.h[6]\n"
"fmla z12.h, z25.h, z1.h[6]\n"
"fmla z16.h, z25.h, z2.h[6]\n"
"fmla z20.h, z25.h, z3.h[6]\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.h, z24.h, z0.h[6]\n"
"fmla z13.h, z24.h, z1.h[6]\n"
"fmla z17.h, z24.h, z2.h[6]\n"
@@ -1594,13 +1592,13 @@ void sve_hybrid_fp16_mla_6x4VL (
"cmp x28, x20\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 51f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z25.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z25.h }, p5/Z, [x21]\n"
"ld1rh { z24.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z25.h\n"
"fmin z9.h, p5/M, z9.h, z25.h\n"
@@ -1640,28 +1638,28 @@ void sve_hybrid_fp16_mla_6x4VL (
"st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
"st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1h { z12.h }, p4, [x26]\n"
- "st1h { z13.h }, p3, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x25]\n"
- "st1h { z17.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p4, [x24]\n"
- "st1h { z21.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
"52:" // Height 4: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"54:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p4.h, x20, x11\n"
@@ -1674,18 +1672,18 @@ void sve_hybrid_fp16_mla_6x4VL (
"cbz x12, 55f\n"
"ld1h { z8.h }, p5/Z, [x12]\n"
"ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1697,16 +1695,16 @@ void sve_hybrid_fp16_mla_6x4VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
- "add x23, x9, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
"ld1h { z12.h }, p4/Z, [x23]\n"
"ld1h { z13.h }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z14.h }, p2/Z, [x23, #2, MUL VL]\n"
"ld1h { z15.h }, p1/Z, [x23, #3, MUL VL]\n"
"ld1h { z16.h }, p4/Z, [x22]\n"
@@ -1747,8 +1745,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1776,29 +1774,29 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z29.h }, p5/Z, [x10]\n"
- "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z4.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z3.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"ld1rqh { z0.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"fmla z8.h, z29.h, z4.h[0]\n"
"fmla z12.h, z29.h, z3.h[0]\n"
- "fmla z9.h, z28.h, z4.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z16.h, z29.h, z2.h[0]\n"
"fmla z20.h, z29.h, z1.h[0]\n"
+ "add x25, x25, #0x10\n"
"fmla z24.h, z29.h, z0.h[0]\n"
- "fmla z13.h, z28.h, z3.h[0]\n"
+ "fmla z9.h, z28.h, z4.h[0]\n"
"ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z28.h, z3.h[0]\n"
"fmla z17.h, z28.h, z2.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z21.h, z28.h, z1.h[0]\n"
"fmla z25.h, z28.h, z0.h[0]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1807,8 +1805,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[0]\n"
"fmla z22.h, z29.h, z1.h[0]\n"
"fmla z26.h, z29.h, z0.h[0]\n"
- "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z15.h, z28.h, z3.h[0]\n"
"fmla z19.h, z28.h, z2.h[0]\n"
"fmla z23.h, z28.h, z1.h[0]\n"
@@ -1819,8 +1817,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[1]\n"
"fmla z20.h, z29.h, z1.h[1]\n"
"fmla z24.h, z29.h, z0.h[1]\n"
- "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[1]\n"
"fmla z17.h, z28.h, z2.h[1]\n"
"fmla z21.h, z28.h, z1.h[1]\n"
@@ -1833,8 +1831,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z22.h, z29.h, z1.h[1]\n"
"fmla z26.h, z29.h, z0.h[1]\n"
"fmla z11.h, z28.h, z4.h[1]\n"
- "fmla z15.h, z28.h, z3.h[1]\n"
"ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[1]\n"
"fmla z19.h, z28.h, z2.h[1]\n"
"fmla z23.h, z28.h, z1.h[1]\n"
"fmla z27.h, z28.h, z0.h[1]\n"
@@ -1844,8 +1842,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[2]\n"
"fmla z20.h, z29.h, z1.h[2]\n"
"fmla z24.h, z29.h, z0.h[2]\n"
- "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[2]\n"
"fmla z17.h, z28.h, z2.h[2]\n"
"fmla z21.h, z28.h, z1.h[2]\n"
@@ -1856,8 +1854,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[2]\n"
"fmla z22.h, z29.h, z1.h[2]\n"
"fmla z26.h, z29.h, z0.h[2]\n"
- "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z15.h, z28.h, z3.h[2]\n"
"fmla z19.h, z28.h, z2.h[2]\n"
"fmla z23.h, z28.h, z1.h[2]\n"
@@ -1868,8 +1866,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[3]\n"
"fmla z20.h, z29.h, z1.h[3]\n"
"fmla z24.h, z29.h, z0.h[3]\n"
- "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[3]\n"
"fmla z17.h, z28.h, z2.h[3]\n"
"fmla z21.h, z28.h, z1.h[3]\n"
@@ -1880,8 +1878,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[3]\n"
"fmla z22.h, z29.h, z1.h[3]\n"
"fmla z26.h, z29.h, z0.h[3]\n"
- "ld1h { z29.h }, p5/Z, [x10]\n"
"fmla z11.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"fmla z15.h, z28.h, z3.h[3]\n"
"fmla z19.h, z28.h, z2.h[3]\n"
"fmla z23.h, z28.h, z1.h[3]\n"
@@ -1892,8 +1890,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[4]\n"
"fmla z20.h, z29.h, z1.h[4]\n"
"fmla z24.h, z29.h, z0.h[4]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[4]\n"
"fmla z17.h, z28.h, z2.h[4]\n"
"fmla z21.h, z28.h, z1.h[4]\n"
@@ -1904,8 +1902,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[4]\n"
"fmla z22.h, z29.h, z1.h[4]\n"
"fmla z26.h, z29.h, z0.h[4]\n"
- "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z15.h, z28.h, z3.h[4]\n"
"fmla z19.h, z28.h, z2.h[4]\n"
"fmla z23.h, z28.h, z1.h[4]\n"
@@ -1916,8 +1914,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[5]\n"
"fmla z20.h, z29.h, z1.h[5]\n"
"fmla z24.h, z29.h, z0.h[5]\n"
- "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[5]\n"
"fmla z17.h, z28.h, z2.h[5]\n"
"fmla z21.h, z28.h, z1.h[5]\n"
@@ -1930,8 +1928,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z22.h, z29.h, z1.h[5]\n"
"fmla z26.h, z29.h, z0.h[5]\n"
"fmla z11.h, z28.h, z4.h[5]\n"
- "fmla z15.h, z28.h, z3.h[5]\n"
"ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[5]\n"
"fmla z19.h, z28.h, z2.h[5]\n"
"fmla z23.h, z28.h, z1.h[5]\n"
"fmla z27.h, z28.h, z0.h[5]\n"
@@ -1941,8 +1939,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[6]\n"
"fmla z20.h, z29.h, z1.h[6]\n"
"fmla z24.h, z29.h, z0.h[6]\n"
- "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[6]\n"
"fmla z17.h, z28.h, z2.h[6]\n"
"fmla z21.h, z28.h, z1.h[6]\n"
@@ -1953,8 +1951,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z18.h, z29.h, z2.h[6]\n"
"fmla z22.h, z29.h, z1.h[6]\n"
"fmla z26.h, z29.h, z0.h[6]\n"
- "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z15.h, z28.h, z3.h[6]\n"
"fmla z19.h, z28.h, z2.h[6]\n"
"fmla z23.h, z28.h, z1.h[6]\n"
@@ -1965,8 +1963,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[7]\n"
"fmla z20.h, z29.h, z1.h[7]\n"
"fmla z24.h, z29.h, z0.h[7]\n"
- "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z28.h, z4.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z13.h, z28.h, z3.h[7]\n"
"fmla z17.h, z28.h, z2.h[7]\n"
"fmla z21.h, z28.h, z1.h[7]\n"
@@ -1985,23 +1983,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z29.h }, p5/Z, [x10]\n"
- "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"fmla z8.h, z29.h, z0.h[0]\n"
"fmla z12.h, z29.h, z1.h[0]\n"
- "fmla z9.h, z28.h, z0.h[0]\n"
- "fmla z13.h, z28.h, z1.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z16.h, z29.h, z2.h[0]\n"
"fmla z20.h, z29.h, z3.h[0]\n"
"fmla z24.h, z29.h, z4.h[0]\n"
- "fmla z17.h, z28.h, z2.h[0]\n"
+ "fmla z9.h, z28.h, z0.h[0]\n"
"ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[0]\n"
+ "fmla z17.h, z28.h, z2.h[0]\n"
"fmla z21.h, z28.h, z3.h[0]\n"
"fmla z25.h, z28.h, z4.h[0]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -2019,21 +2017,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 63f\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
"ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z29.h, z0.h[1]\n"
"fmla z12.h, z29.h, z1.h[1]\n"
"fmla z16.h, z29.h, z2.h[1]\n"
"fmla z20.h, z29.h, z3.h[1]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z29.h, z4.h[1]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z28.h, z0.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z28.h, z1.h[1]\n"
"fmla z17.h, z28.h, z2.h[1]\n"
"fmla z21.h, z28.h, z3.h[1]\n"
"fmla z25.h, z28.h, z4.h[1]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z29.h, z0.h[1]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z29.h, z0.h[1]\n"
"fmla z14.h, z29.h, z1.h[1]\n"
"fmla z18.h, z29.h, z2.h[1]\n"
"fmla z22.h, z29.h, z3.h[1]\n"
@@ -2046,21 +2044,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 63f\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
"ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z29.h, z0.h[2]\n"
"fmla z12.h, z29.h, z1.h[2]\n"
"fmla z16.h, z29.h, z2.h[2]\n"
"fmla z20.h, z29.h, z3.h[2]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z29.h, z4.h[2]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z28.h, z0.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z28.h, z1.h[2]\n"
"fmla z17.h, z28.h, z2.h[2]\n"
"fmla z21.h, z28.h, z3.h[2]\n"
"fmla z25.h, z28.h, z4.h[2]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z29.h, z0.h[2]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z29.h, z0.h[2]\n"
"fmla z14.h, z29.h, z1.h[2]\n"
"fmla z18.h, z29.h, z2.h[2]\n"
"fmla z22.h, z29.h, z3.h[2]\n"
@@ -2073,21 +2071,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 63f\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
"ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z29.h, z0.h[3]\n"
"fmla z12.h, z29.h, z1.h[3]\n"
"fmla z16.h, z29.h, z2.h[3]\n"
"fmla z20.h, z29.h, z3.h[3]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z29.h, z4.h[3]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z28.h, z0.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z28.h, z1.h[3]\n"
"fmla z17.h, z28.h, z2.h[3]\n"
"fmla z21.h, z28.h, z3.h[3]\n"
"fmla z25.h, z28.h, z4.h[3]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z29.h, z0.h[3]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z29.h, z0.h[3]\n"
"fmla z14.h, z29.h, z1.h[3]\n"
"fmla z18.h, z29.h, z2.h[3]\n"
"fmla z22.h, z29.h, z3.h[3]\n"
@@ -2100,21 +2098,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 63f\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
"ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z29.h, z0.h[4]\n"
"fmla z12.h, z29.h, z1.h[4]\n"
"fmla z16.h, z29.h, z2.h[4]\n"
"fmla z20.h, z29.h, z3.h[4]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z29.h, z4.h[4]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z28.h, z0.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z28.h, z1.h[4]\n"
"fmla z17.h, z28.h, z2.h[4]\n"
"fmla z21.h, z28.h, z3.h[4]\n"
"fmla z25.h, z28.h, z4.h[4]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z29.h, z0.h[4]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z29.h, z0.h[4]\n"
"fmla z14.h, z29.h, z1.h[4]\n"
"fmla z18.h, z29.h, z2.h[4]\n"
"fmla z22.h, z29.h, z3.h[4]\n"
@@ -2127,21 +2125,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 63f\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
"ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z29.h, z0.h[5]\n"
"fmla z12.h, z29.h, z1.h[5]\n"
"fmla z16.h, z29.h, z2.h[5]\n"
"fmla z20.h, z29.h, z3.h[5]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z29.h, z4.h[5]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z28.h, z0.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z28.h, z1.h[5]\n"
"fmla z17.h, z28.h, z2.h[5]\n"
"fmla z21.h, z28.h, z3.h[5]\n"
"fmla z25.h, z28.h, z4.h[5]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z29.h, z0.h[5]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z29.h, z0.h[5]\n"
"fmla z14.h, z29.h, z1.h[5]\n"
"fmla z18.h, z29.h, z2.h[5]\n"
"fmla z22.h, z29.h, z3.h[5]\n"
@@ -2154,21 +2152,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 63f\n"
"ld1h { z29.h }, p5/Z, [x10]\n"
"ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z29.h, z0.h[6]\n"
"fmla z12.h, z29.h, z1.h[6]\n"
"fmla z16.h, z29.h, z2.h[6]\n"
"fmla z20.h, z29.h, z3.h[6]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z29.h, z4.h[6]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z28.h, z0.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z28.h, z1.h[6]\n"
"fmla z17.h, z28.h, z2.h[6]\n"
"fmla z21.h, z28.h, z3.h[6]\n"
"fmla z25.h, z28.h, z4.h[6]\n"
"ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z29.h, z0.h[6]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z29.h, z0.h[6]\n"
"fmla z14.h, z29.h, z1.h[6]\n"
"fmla z18.h, z29.h, z2.h[6]\n"
"fmla z22.h, z29.h, z3.h[6]\n"
@@ -2186,8 +2184,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z16.h, z29.h, z2.h[7]\n"
"fmla z20.h, z29.h, z3.h[7]\n"
"fmla z24.h, z29.h, z4.h[7]\n"
- "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z28.h, z0.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z28.h, z1.h[7]\n"
"fmla z17.h, z28.h, z2.h[7]\n"
"fmla z21.h, z28.h, z3.h[7]\n"
@@ -2210,14 +2208,14 @@ void sve_hybrid_fp16_mla_6x4VL (
"cmp x28, x20\n"
"bne 58b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 64f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z29.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z29.h }, p5/Z, [x21]\n"
"ld1rh { z28.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z29.h\n"
"fmin z9.h, p5/M, z9.h, z29.h\n"
@@ -2265,22 +2263,22 @@ void sve_hybrid_fp16_mla_6x4VL (
"st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
"st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1h { z12.h }, p4, [x26]\n"
- "st1h { z13.h }, p3, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x25]\n"
- "st1h { z17.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p4, [x24]\n"
- "st1h { z21.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x24, #3, MUL VL]\n"
- "st1h { z24.h }, p4, [x23]\n"
- "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
- "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
- "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p4, [x22]\n"
+ "st1h { z25.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x22, #3, MUL VL]\n"
"65:" // Height 5: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -2288,13 +2286,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"b 80f\n"
"66:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0xc\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"67:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p4.h, x20, x11\n"
@@ -2307,18 +2304,18 @@ void sve_hybrid_fp16_mla_6x4VL (
"cbz x12, 68f\n"
"ld1h { z8.h }, p5/Z, [x12]\n"
"ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -2334,17 +2331,17 @@ void sve_hybrid_fp16_mla_6x4VL (
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
- "add x24, x9, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
"ld1h { z12.h }, p4/Z, [x24]\n"
"ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #1\n"
- "add x20, x21, x20, LSL #1\n"
"ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
"ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
"ld1h { z16.h }, p4/Z, [x23]\n"
@@ -2393,8 +2390,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov x28, #0x0\n"
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2425,29 +2422,29 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1h { z1.h }, p5/Z, [x10]\n"
- "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x8\n"
- "cmp x27, #0x8\n"
"ld1rqh { z7.h }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqh { z6.h }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z5.h }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqh { z4.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
"ld1rqh { z2.h }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z1.h, z7.h[0]\n"
"fmla z12.h, z1.h, z6.h[0]\n"
- "add x21, x21, #0x10\n"
"fmla z16.h, z1.h, z5.h[0]\n"
"fmla z20.h, z1.h, z4.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z24.h, z1.h, z3.h[0]\n"
"fmla z28.h, z1.h, z2.h[0]\n"
"ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
"fmla z9.h, z0.h, z7.h[0]\n"
"fmla z13.h, z0.h, z6.h[0]\n"
"fmla z17.h, z0.h, z5.h[0]\n"
@@ -2668,24 +2665,24 @@ void sve_hybrid_fp16_mla_6x4VL (
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
"ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z7.h, z0.h[0]\n"
"fmla z12.h, z7.h, z1.h[0]\n"
- "fmla z9.h, z6.h, z0.h[0]\n"
- "fmla z13.h, z6.h, z1.h[0]\n"
"fmla z16.h, z7.h, z2.h[0]\n"
"fmla z20.h, z7.h, z3.h[0]\n"
"fmla z24.h, z7.h, z4.h[0]\n"
"fmla z28.h, z7.h, z5.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[0]\n"
+ "fmla z13.h, z6.h, z1.h[0]\n"
"fmla z17.h, z6.h, z2.h[0]\n"
"fmla z21.h, z6.h, z3.h[0]\n"
"fmla z25.h, z6.h, z4.h[0]\n"
@@ -2707,23 +2704,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 76f\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
"ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z7.h, z0.h[1]\n"
"fmla z12.h, z7.h, z1.h[1]\n"
"fmla z16.h, z7.h, z2.h[1]\n"
"fmla z20.h, z7.h, z3.h[1]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z7.h, z4.h[1]\n"
"fmla z28.h, z7.h, z5.h[1]\n"
- "fmla z9.h, z6.h, z0.h[1]\n"
"ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[1]\n"
"fmla z13.h, z6.h, z1.h[1]\n"
"fmla z17.h, z6.h, z2.h[1]\n"
"fmla z21.h, z6.h, z3.h[1]\n"
"fmla z25.h, z6.h, z4.h[1]\n"
"fmla z29.h, z6.h, z5.h[1]\n"
"ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z7.h, z0.h[1]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z7.h, z0.h[1]\n"
"fmla z14.h, z7.h, z1.h[1]\n"
"fmla z18.h, z7.h, z2.h[1]\n"
"fmla z22.h, z7.h, z3.h[1]\n"
@@ -2738,23 +2735,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 76f\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
"ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z7.h, z0.h[2]\n"
"fmla z12.h, z7.h, z1.h[2]\n"
"fmla z16.h, z7.h, z2.h[2]\n"
"fmla z20.h, z7.h, z3.h[2]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z7.h, z4.h[2]\n"
"fmla z28.h, z7.h, z5.h[2]\n"
- "fmla z9.h, z6.h, z0.h[2]\n"
"ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[2]\n"
"fmla z13.h, z6.h, z1.h[2]\n"
"fmla z17.h, z6.h, z2.h[2]\n"
"fmla z21.h, z6.h, z3.h[2]\n"
"fmla z25.h, z6.h, z4.h[2]\n"
"fmla z29.h, z6.h, z5.h[2]\n"
"ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z7.h, z0.h[2]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z7.h, z0.h[2]\n"
"fmla z14.h, z7.h, z1.h[2]\n"
"fmla z18.h, z7.h, z2.h[2]\n"
"fmla z22.h, z7.h, z3.h[2]\n"
@@ -2769,23 +2766,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 76f\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
"ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z7.h, z0.h[3]\n"
"fmla z12.h, z7.h, z1.h[3]\n"
"fmla z16.h, z7.h, z2.h[3]\n"
"fmla z20.h, z7.h, z3.h[3]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z7.h, z4.h[3]\n"
"fmla z28.h, z7.h, z5.h[3]\n"
- "fmla z9.h, z6.h, z0.h[3]\n"
"ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[3]\n"
"fmla z13.h, z6.h, z1.h[3]\n"
"fmla z17.h, z6.h, z2.h[3]\n"
"fmla z21.h, z6.h, z3.h[3]\n"
"fmla z25.h, z6.h, z4.h[3]\n"
"fmla z29.h, z6.h, z5.h[3]\n"
"ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z7.h, z0.h[3]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z7.h, z0.h[3]\n"
"fmla z14.h, z7.h, z1.h[3]\n"
"fmla z18.h, z7.h, z2.h[3]\n"
"fmla z22.h, z7.h, z3.h[3]\n"
@@ -2800,23 +2797,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 76f\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
"ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z7.h, z0.h[4]\n"
"fmla z12.h, z7.h, z1.h[4]\n"
"fmla z16.h, z7.h, z2.h[4]\n"
"fmla z20.h, z7.h, z3.h[4]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z7.h, z4.h[4]\n"
"fmla z28.h, z7.h, z5.h[4]\n"
- "fmla z9.h, z6.h, z0.h[4]\n"
"ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[4]\n"
"fmla z13.h, z6.h, z1.h[4]\n"
"fmla z17.h, z6.h, z2.h[4]\n"
"fmla z21.h, z6.h, z3.h[4]\n"
"fmla z25.h, z6.h, z4.h[4]\n"
"fmla z29.h, z6.h, z5.h[4]\n"
"ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z7.h, z0.h[4]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z7.h, z0.h[4]\n"
"fmla z14.h, z7.h, z1.h[4]\n"
"fmla z18.h, z7.h, z2.h[4]\n"
"fmla z22.h, z7.h, z3.h[4]\n"
@@ -2831,23 +2828,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 76f\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
"ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z7.h, z0.h[5]\n"
"fmla z12.h, z7.h, z1.h[5]\n"
"fmla z16.h, z7.h, z2.h[5]\n"
"fmla z20.h, z7.h, z3.h[5]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z7.h, z4.h[5]\n"
"fmla z28.h, z7.h, z5.h[5]\n"
- "fmla z9.h, z6.h, z0.h[5]\n"
"ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[5]\n"
"fmla z13.h, z6.h, z1.h[5]\n"
"fmla z17.h, z6.h, z2.h[5]\n"
"fmla z21.h, z6.h, z3.h[5]\n"
"fmla z25.h, z6.h, z4.h[5]\n"
"fmla z29.h, z6.h, z5.h[5]\n"
"ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z7.h, z0.h[5]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z7.h, z0.h[5]\n"
"fmla z14.h, z7.h, z1.h[5]\n"
"fmla z18.h, z7.h, z2.h[5]\n"
"fmla z22.h, z7.h, z3.h[5]\n"
@@ -2862,23 +2859,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"ble 76f\n"
"ld1h { z7.h }, p5/Z, [x10]\n"
"ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.h, z7.h, z0.h[6]\n"
"fmla z12.h, z7.h, z1.h[6]\n"
"fmla z16.h, z7.h, z2.h[6]\n"
"fmla z20.h, z7.h, z3.h[6]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.h, z7.h, z4.h[6]\n"
"fmla z28.h, z7.h, z5.h[6]\n"
- "fmla z9.h, z6.h, z0.h[6]\n"
"ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[6]\n"
"fmla z13.h, z6.h, z1.h[6]\n"
"fmla z17.h, z6.h, z2.h[6]\n"
"fmla z21.h, z6.h, z3.h[6]\n"
"fmla z25.h, z6.h, z4.h[6]\n"
"fmla z29.h, z6.h, z5.h[6]\n"
"ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z7.h, z0.h[6]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.h, z7.h, z0.h[6]\n"
"fmla z14.h, z7.h, z1.h[6]\n"
"fmla z18.h, z7.h, z2.h[6]\n"
"fmla z22.h, z7.h, z3.h[6]\n"
@@ -2926,15 +2923,15 @@ void sve_hybrid_fp16_mla_6x4VL (
"cmp x28, x20\n"
"bne 71b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #1\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"tbz %x[flags], #1, 77f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z1.h }, p5/Z, [x21]\n"
"ld1rh { z0.h }, p5/Z, [x20]\n"
"fmin z8.h, p5/M, z8.h, z1.h\n"
"fmin z9.h, p5/M, z9.h, z1.h\n"
@@ -2990,26 +2987,26 @@ void sve_hybrid_fp16_mla_6x4VL (
"st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
"st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1h { z12.h }, p4, [x26]\n"
- "st1h { z13.h }, p3, [x26, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x26, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x25]\n"
- "st1h { z17.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x25, #3, MUL VL]\n"
- "st1h { z20.h }, p4, [x24]\n"
- "st1h { z21.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x24, #3, MUL VL]\n"
- "st1h { z24.h }, p4, [x23]\n"
- "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
- "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
- "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
- "st1h { z28.h }, p4, [x22]\n"
- "st1h { z29.h }, p3, [x22, #1, MUL VL]\n"
- "st1h { z30.h }, p2, [x22, #2, MUL VL]\n"
- "st1h { z31.h }, p1, [x22, #3, MUL VL]\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p4, [x22]\n"
+ "st1h { z25.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x22, #3, MUL VL]\n"
+ "st1h { z28.h }, p4, [x21]\n"
+ "st1h { z29.h }, p3, [x21, #1, MUL VL]\n"
+ "st1h { z30.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z31.h }, p1, [x21, #3, MUL VL]\n"
"78:" // Height 6: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -3026,8 +3023,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
index 3e040b6197..7936eeb11a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 1> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
index 2b836659a2..430c081288 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
@@ -47,19 +47,18 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -104,10 +102,10 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"cmp %x[M], #0x2\n"
"bgt 25f\n"
"beq 13f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -140,8 +138,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -160,14 +158,14 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop
"fmla z8.s, p4/M, z6.s, z0.s\n"
- "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x26, x26, #0x4\n"
- "subs x27, x27, #0x1\n"
"fmla z10.s, p4/M, z17.s, z0.s\n"
"fmla z11.s, p4/M, z16.s, z0.s\n"
+ "subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1w { z6.s }, p4/Z, [x10]\n"
"ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
@@ -175,19 +173,19 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"10:" // Height 1: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.s, p4/M, z6.s, z0.s\n"
- "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
- "addvl x10, x10, #4\n"
"cmp x28, x20\n"
"fmla z10.s, p4/M, z17.s, z0.s\n"
"fmla z11.s, p4/M, z16.s, z0.s\n"
+ "addvl x10, x10, #4\n"
"bne 6b\n"
"tbz %x[flags], #1, 11f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p4/Z, [x21]\n"
"ld1rw { z16.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z17.s\n"
"fmin z9.s, p4/M, z9.s, z17.s\n"
@@ -209,10 +207,10 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"bgt 2b\n"
"b 74f\n"
"13:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"14:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -225,22 +223,22 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"cbz x12, 15f\n"
"ld1w { z8.s }, p4/Z, [x12]\n"
"ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x12, x12, #4\n"
"b 17f\n"
"15:" // Height 2: no bias
"tbz %x[flags], #0, 16f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x20]\n"
"ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
@@ -259,8 +257,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -295,8 +293,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z10.s, p4/M, z17.s, z0.s\n"
"fmla z14.s, p4/M, z17.s, z1.s\n"
"fmla z11.s, p4/M, z16.s, z0.s\n"
- "ld1rw { z0.s }, p4/Z, [x26]\n"
"fmla z15.s, p4/M, z16.s, z1.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1w { z6.s }, p4/Z, [x10]\n"
"ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
@@ -310,19 +308,19 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z13.s, p4/M, z7.s, z1.s\n"
"ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
- "addvl x10, x10, #4\n"
"cmp x28, x20\n"
"fmla z10.s, p4/M, z17.s, z0.s\n"
"fmla z14.s, p4/M, z17.s, z1.s\n"
+ "addvl x10, x10, #4\n"
"fmla z11.s, p4/M, z16.s, z0.s\n"
"fmla z15.s, p4/M, z16.s, z1.s\n"
"bne 18b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"tbz %x[flags], #1, 23f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p4/Z, [x21]\n"
"ld1rw { z16.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z17.s\n"
"fmin z9.s, p4/M, z9.s, z17.s\n"
@@ -346,20 +344,20 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x26]\n"
- "st1w { z13.s }, p2, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x26, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
"24:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 14b\n"
"b 74f\n"
"25:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"26:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -372,27 +370,27 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"cbz x12, 27f\n"
"ld1w { z8.s }, p4/Z, [x12]\n"
"ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 29f\n"
"27:" // Height 3: no bias
"tbz %x[flags], #0, 28f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x21]\n"
"ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
@@ -419,8 +417,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -451,8 +449,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"add x26, x26, #0x4\n"
"subs x27, x27, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
- "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x4\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
@@ -464,11 +462,11 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z18.s, p4/M, z21.s, z2.s\n"
"fmla z11.s, p4/M, z20.s, z0.s\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
"fmla z15.s, p4/M, z20.s, z1.s\n"
- "ld1rw { z1.s }, p4/Z, [x25]\n"
"fmla z19.s, p4/M, z20.s, z2.s\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
"ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 33b\n"
"34:" // Height 3: Multiply loop: Main loop skip
@@ -477,13 +475,13 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z12.s, p4/M, z6.s, z1.s\n"
"add x28, x28, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
- "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"ld1w { z20.s }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "cmp x28, x20\n"
"fmla z10.s, p4/M, z21.s, z0.s\n"
"fmla z14.s, p4/M, z21.s, z1.s\n"
"fmla z18.s, p4/M, z21.s, z2.s\n"
@@ -492,12 +490,12 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z19.s, p4/M, z20.s, z2.s\n"
"bne 30b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 35f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z21.s }, p4/Z, [x21]\n"
"ld1rw { z20.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z21.s\n"
"fmin z9.s, p4/M, z9.s, z21.s\n"
@@ -529,24 +527,24 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x26]\n"
- "st1w { z13.s }, p2, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x25]\n"
- "st1w { z17.s }, p2, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
"36:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 26b\n"
"b 74f\n"
"37:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"38:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -559,18 +557,18 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"cbz x12, 39f\n"
"ld1w { z8.s }, p4/Z, [x12]\n"
"ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -578,13 +576,13 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"39:" // Height 4: no bias
"tbz %x[flags], #0, 40f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x22]\n"
"ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
@@ -619,8 +617,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"42:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 43f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -670,6 +668,7 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z14.s, p4/M, z25.s, z1.s\n"
"fmla z18.s, p4/M, z25.s, z2.s\n"
"fmla z22.s, p4/M, z25.s, z3.s\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
"fmla z11.s, p4/M, z24.s, z0.s\n"
"fmla z15.s, p4/M, z24.s, z1.s\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -678,7 +677,6 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z23.s, p4/M, z24.s, z3.s\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
"ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 45b\n"
"46:" // Height 4: Multiply loop: Main loop skip
@@ -689,15 +687,15 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
"ld1w { z25.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
"ld1w { z24.s }, p4/Z, [x10, #3, MUL VL]\n"
- "cmp x28, x20\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, p4/M, z25.s, z0.s\n"
"fmla z14.s, p4/M, z25.s, z1.s\n"
- "addvl x10, x10, #4\n"
"fmla z18.s, p4/M, z25.s, z2.s\n"
"fmla z22.s, p4/M, z25.s, z3.s\n"
"fmla z11.s, p4/M, z24.s, z0.s\n"
@@ -706,13 +704,13 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z23.s, p4/M, z24.s, z3.s\n"
"bne 42b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 47f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z25.s }, p4/Z, [x21]\n"
"ld1rw { z24.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z25.s\n"
"fmin z9.s, p4/M, z9.s, z25.s\n"
@@ -752,28 +750,28 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x26]\n"
- "st1w { z13.s }, p2, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x25]\n"
- "st1w { z17.s }, p2, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p3, [x24]\n"
- "st1w { z21.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
"48:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 38b\n"
"b 74f\n"
"49:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"50:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -786,18 +784,18 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"cbz x12, 51f\n"
"ld1w { z8.s }, p4/Z, [x12]\n"
"ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -809,16 +807,16 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"51:" // Height 5: no bias
"tbz %x[flags], #0, 52f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x23, x9, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x23]\n"
"ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
"ld1w { z16.s }, p3/Z, [x22]\n"
@@ -859,8 +857,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"54:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 55f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -903,8 +901,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
"fmla z24.s, p4/M, z6.s, z4.s\n"
- "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n"
"add x23, x23, #0x4\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
@@ -922,12 +920,12 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1w { z6.s }, p4/Z, [x10]\n"
"fmla z15.s, p4/M, z28.s, z1.s\n"
- "ld1rw { z1.s }, p4/Z, [x25]\n"
"fmla z19.s, p4/M, z28.s, z2.s\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"fmla z23.s, p4/M, z28.s, z3.s\n"
- "ld1rw { z3.s }, p4/Z, [x23]\n"
"fmla z27.s, p4/M, z28.s, z4.s\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1rw { z4.s }, p4/Z, [x22]\n"
"ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 57b\n"
@@ -938,12 +936,12 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
+ "cmp x28, x20\n"
"fmla z24.s, p4/M, z6.s, z4.s\n"
- "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
- "cmp x28, x20\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
"fmla z25.s, p4/M, z7.s, z4.s\n"
"ld1w { z28.s }, p4/Z, [x10, #3, MUL VL]\n"
@@ -960,14 +958,14 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z27.s, p4/M, z28.s, z4.s\n"
"bne 54b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 59f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z29.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z29.s }, p4/Z, [x21]\n"
"ld1rw { z28.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z29.s\n"
"fmin z9.s, p4/M, z9.s, z29.s\n"
@@ -1015,22 +1013,22 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x26]\n"
- "st1w { z13.s }, p2, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x25]\n"
- "st1w { z17.s }, p2, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p3, [x24]\n"
- "st1w { z21.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p0, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p3, [x23]\n"
- "st1w { z25.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x22]\n"
+ "st1w { z25.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x22, #3, MUL VL]\n"
"60:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1038,13 +1036,12 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"b 74f\n"
"61:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"62:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -1057,18 +1054,18 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"cbz x12, 63f\n"
"ld1w { z8.s }, p4/Z, [x12]\n"
"ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1084,17 +1081,17 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"63:" // Height 6: no bias
"tbz %x[flags], #0, 64f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x24]\n"
"ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p3/Z, [x23]\n"
@@ -1143,8 +1140,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"mov x28, #0x0\n"
"66:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 67f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1232,12 +1229,12 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
+ "cmp x28, x20\n"
"fmla z24.s, p4/M, z6.s, z4.s\n"
"fmla z28.s, p4/M, z6.s, z5.s\n"
"ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
- "cmp x28, x20\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
"fmla z25.s, p4/M, z7.s, z4.s\n"
@@ -1258,15 +1255,15 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"fmla z31.s, p4/M, z7.s, z5.s\n"
"bne 66b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 71f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p4/Z, [x21]\n"
"ld1rw { z0.s }, p4/Z, [x20]\n"
"fmin z8.s, p4/M, z8.s, z1.s\n"
"fmin z9.s, p4/M, z9.s, z1.s\n"
@@ -1322,26 +1319,26 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x26]\n"
- "st1w { z13.s }, p2, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x25]\n"
- "st1w { z17.s }, p2, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p3, [x24]\n"
- "st1w { z21.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p0, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p3, [x23]\n"
- "st1w { z25.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p0, [x23, #3, MUL VL]\n"
- "st1w { z28.s }, p3, [x22]\n"
- "st1w { z29.s }, p2, [x22, #1, MUL VL]\n"
- "st1w { z30.s }, p1, [x22, #2, MUL VL]\n"
- "st1w { z31.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x22]\n"
+ "st1w { z25.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z28.s }, p3, [x21]\n"
+ "st1w { z29.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p0, [x21, #3, MUL VL]\n"
"72:" // Height 6: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1358,8 +1355,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"74:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
index 99828e8f0c..da4670d7e0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
@@ -47,19 +47,18 @@ void sve_hybrid_fp32_mla_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void sve_hybrid_fp32_mla_6x4VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -104,10 +102,10 @@ void sve_hybrid_fp32_mla_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -140,8 +138,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -157,98 +155,98 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "fmla z8.s, z17.s, z0.s[0]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[0]\n"
"ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z17.s, z0.s[0]\n"
- "ld1w { z17.s }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[1]\n"
"ld1w { z16.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z17.s, z0.s[1]\n"
- "ld1w { z17.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[1]\n"
"ld1w { z16.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z17.s, z0.s[1]\n"
"fmla z11.s, z16.s, z0.s[1]\n"
"ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.s, z17.s, z0.s[2]\n"
- "ld1w { z17.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-6, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.s, z17.s, z0.s[2]\n"
- "ld1w { z17.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-4, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.s, z17.s, z0.s[3]\n"
- "ld1w { z17.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-2, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"fmla z10.s, z17.s, z0.s[3]\n"
"fmla z11.s, z16.s, z0.s[3]\n"
+ "add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
- "fmla z8.s, z17.s, z0.s[0]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"fmla z10.s, z17.s, z0.s[0]\n"
"fmla z11.s, z16.s, z0.s[0]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z17.s, z0.s[1]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.s, z17.s, z0.s[1]\n"
"fmla z11.s, z16.s, z0.s[1]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z17.s, z0.s[2]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.s, z17.s, z0.s[2]\n"
"fmla z11.s, z16.s, z0.s[2]\n"
+ "addvl x10, x10, #4\n"
"ble 11f\n"
"ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.s, z17.s, z0.s[3]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"fmla z10.s, z17.s, z0.s[3]\n"
"fmla z11.s, z16.s, z0.s[3]\n"
+ "addvl x10, x10, #4\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z17.s\n"
"fmin z9.s, p5/M, z9.s, z17.s\n"
@@ -270,10 +268,10 @@ void sve_hybrid_fp32_mla_6x4VL (
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"15:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -286,22 +284,22 @@ void sve_hybrid_fp32_mla_6x4VL (
"cbz x12, 16f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x12, x12, #4\n"
"b 18f\n"
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x20]\n"
"ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
@@ -320,8 +318,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -340,38 +338,38 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqw { z0.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x4\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.s, z17.s, z1.s[0]\n"
"fmla z12.s, z17.s, z0.s[0]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z16.s, z1.s[0]\n"
"fmla z13.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.s, z17.s, z1.s[0]\n"
"fmla z14.s, z17.s, z0.s[0]\n"
"ld1w { z17.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "cmp x27, #0x4\n"
"fmla z11.s, z16.s, z1.s[0]\n"
"fmla z15.s, z16.s, z0.s[0]\n"
"ld1w { z16.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
"fmla z8.s, z17.s, z1.s[1]\n"
"fmla z12.s, z17.s, z0.s[1]\n"
"ld1w { z17.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z16.s, z1.s[1]\n"
"fmla z13.s, z16.s, z0.s[1]\n"
"ld1w { z16.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"fmla z10.s, z17.s, z1.s[1]\n"
"fmla z14.s, z17.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.s, z16.s, z1.s[1]\n"
"fmla z15.s, z16.s, z0.s[1]\n"
- "ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.s, z17.s, z1.s[2]\n"
"fmla z12.s, z17.s, z0.s[2]\n"
@@ -398,50 +396,50 @@ void sve_hybrid_fp32_mla_6x4VL (
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z17.s }, p5/Z, [x10]\n"
- "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.s, z17.s, z0.s[0]\n"
"fmla z12.s, z17.s, z1.s[0]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[0]\n"
"fmla z13.s, z16.s, z1.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"fmla z10.s, z17.s, z0.s[0]\n"
"fmla z14.s, z17.s, z1.s[0]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.s, z16.s, z0.s[0]\n"
"fmla z15.s, z16.s, z1.s[0]\n"
"ble 24f\n"
"ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z17.s, z0.s[1]\n"
"fmla z12.s, z17.s, z1.s[1]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[1]\n"
"fmla z13.s, z16.s, z1.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.s, z17.s, z0.s[1]\n"
"fmla z14.s, z17.s, z1.s[1]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.s, z16.s, z0.s[1]\n"
"fmla z15.s, z16.s, z1.s[1]\n"
"ble 24f\n"
"ld1w { z17.s }, p5/Z, [x10]\n"
"ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z17.s, z0.s[2]\n"
"fmla z12.s, z17.s, z1.s[2]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[2]\n"
"fmla z13.s, z16.s, z1.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
"fmla z10.s, z17.s, z0.s[2]\n"
"fmla z14.s, z17.s, z1.s[2]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.s, z16.s, z0.s[2]\n"
"fmla z15.s, z16.s, z1.s[2]\n"
"ble 24f\n"
@@ -449,13 +447,13 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.s, z17.s, z0.s[3]\n"
"fmla z12.s, z17.s, z1.s[3]\n"
- "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z16.s, z0.s[3]\n"
"fmla z13.s, z16.s, z1.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"fmla z10.s, z17.s, z0.s[3]\n"
"fmla z14.s, z17.s, z1.s[3]\n"
+ "addvl x10, x10, #4\n"
"fmla z11.s, z16.s, z0.s[3]\n"
"fmla z15.s, z16.s, z1.s[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
@@ -464,11 +462,11 @@ void sve_hybrid_fp32_mla_6x4VL (
"cmp x28, x20\n"
"bne 19b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z17.s\n"
"fmin z9.s, p5/M, z9.s, z17.s\n"
@@ -492,20 +490,20 @@ void sve_hybrid_fp32_mla_6x4VL (
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
"26:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"28:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -518,27 +516,27 @@ void sve_hybrid_fp32_mla_6x4VL (
"cbz x12, 29f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 31f\n"
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x21]\n"
"ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
@@ -565,8 +563,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -588,37 +586,37 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z21.s }, p5/Z, [x10]\n"
- "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x4\n"
"ld1rqw { z0.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
"fmla z8.s, z21.s, z2.s[0]\n"
"fmla z12.s, z21.s, z1.s[0]\n"
- "fmla z9.s, z20.s, z2.s[0]\n"
- "fmla z13.s, z20.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z16.s, z21.s, z0.s[0]\n"
+ "fmla z9.s, z20.s, z2.s[0]\n"
"ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
"fmla z17.s, z20.s, z0.s[0]\n"
"ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x27, #0x4\n"
"fmla z10.s, z21.s, z2.s[0]\n"
"fmla z14.s, z21.s, z1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z18.s, z21.s, z0.s[0]\n"
- "ld1w { z21.s }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.s, z20.s, z2.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
"fmla z15.s, z20.s, z1.s[0]\n"
"fmla z19.s, z20.s, z0.s[0]\n"
"ld1w { z20.s }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.s, z21.s, z2.s[1]\n"
"fmla z12.s, z21.s, z1.s[1]\n"
"fmla z16.s, z21.s, z0.s[1]\n"
- "ld1w { z21.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z13.s, z20.s, z1.s[1]\n"
"fmla z17.s, z20.s, z0.s[1]\n"
"ld1w { z20.s }, p5/Z, [x10, #7, MUL VL]\n"
@@ -627,31 +625,31 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z14.s, z21.s, z1.s[1]\n"
"fmla z18.s, z21.s, z0.s[1]\n"
"fmla z11.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z15.s, z20.s, z1.s[1]\n"
"fmla z19.s, z20.s, z0.s[1]\n"
- "ld1w { z21.s }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1w { z20.s }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.s, z21.s, z2.s[2]\n"
"fmla z12.s, z21.s, z1.s[2]\n"
"fmla z16.s, z21.s, z0.s[2]\n"
- "ld1w { z21.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z13.s, z20.s, z1.s[2]\n"
"fmla z17.s, z20.s, z0.s[2]\n"
"ld1w { z20.s }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.s, z21.s, z2.s[2]\n"
"fmla z14.s, z21.s, z1.s[2]\n"
"fmla z18.s, z21.s, z0.s[2]\n"
- "ld1w { z21.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z15.s, z20.s, z1.s[2]\n"
"fmla z19.s, z20.s, z0.s[2]\n"
"ld1w { z20.s }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.s, z21.s, z2.s[3]\n"
"fmla z12.s, z21.s, z1.s[3]\n"
"fmla z16.s, z21.s, z0.s[3]\n"
- "ld1w { z21.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.s, z20.s, z2.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z13.s, z20.s, z1.s[3]\n"
"fmla z17.s, z20.s, z0.s[3]\n"
"ld1w { z20.s }, p5/Z, [x10, #-1, MUL VL]\n"
@@ -664,18 +662,18 @@ void sve_hybrid_fp32_mla_6x4VL (
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z21.s }, p5/Z, [x10]\n"
- "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
"fmla z8.s, z21.s, z0.s[0]\n"
"fmla z12.s, z21.s, z1.s[0]\n"
- "fmla z9.s, z20.s, z0.s[0]\n"
- "fmla z13.s, z20.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z16.s, z21.s, z2.s[0]\n"
+ "fmla z9.s, z20.s, z0.s[0]\n"
"ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
"fmla z17.s, z20.s, z2.s[0]\n"
"ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -688,12 +686,12 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 37f\n"
"ld1w { z21.s }, p5/Z, [x10]\n"
"ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z21.s, z0.s[1]\n"
"fmla z12.s, z21.s, z1.s[1]\n"
"fmla z16.s, z21.s, z2.s[1]\n"
- "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z20.s, z0.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.s, z20.s, z1.s[1]\n"
"fmla z17.s, z20.s, z2.s[1]\n"
"ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
@@ -707,12 +705,12 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 37f\n"
"ld1w { z21.s }, p5/Z, [x10]\n"
"ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z21.s, z0.s[2]\n"
"fmla z12.s, z21.s, z1.s[2]\n"
"fmla z16.s, z21.s, z2.s[2]\n"
- "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z20.s, z0.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z13.s, z20.s, z1.s[2]\n"
"fmla z17.s, z20.s, z2.s[2]\n"
"ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
@@ -729,8 +727,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z8.s, z21.s, z0.s[3]\n"
"fmla z12.s, z21.s, z1.s[3]\n"
"fmla z16.s, z21.s, z2.s[3]\n"
- "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z20.s, z0.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.s, z20.s, z1.s[3]\n"
"fmla z17.s, z20.s, z2.s[3]\n"
"ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
@@ -747,12 +745,12 @@ void sve_hybrid_fp32_mla_6x4VL (
"cmp x28, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z21.s }, p5/Z, [x21]\n"
"ld1rw { z20.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z21.s\n"
"fmin z9.s, p5/M, z9.s, z21.s\n"
@@ -784,24 +782,24 @@ void sve_hybrid_fp32_mla_6x4VL (
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
"39:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"41:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -814,18 +812,18 @@ void sve_hybrid_fp32_mla_6x4VL (
"cbz x12, 42f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -833,13 +831,13 @@ void sve_hybrid_fp32_mla_6x4VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x22]\n"
"ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
@@ -874,8 +872,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -900,25 +898,25 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z25.s }, p5/Z, [x10]\n"
- "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z3.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x4\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqw { z0.s }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.s, z25.s, z3.s[0]\n"
"fmla z12.s, z25.s, z2.s[0]\n"
- "fmla z9.s, z24.s, z3.s[0]\n"
- "fmla z13.s, z24.s, z2.s[0]\n"
"fmla z16.s, z25.s, z1.s[0]\n"
"fmla z20.s, z25.s, z0.s[0]\n"
"ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z9.s, z24.s, z3.s[0]\n"
+ "fmla z13.s, z24.s, z2.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z17.s, z24.s, z1.s[0]\n"
"fmla z21.s, z24.s, z0.s[0]\n"
"ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
@@ -947,9 +945,9 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z14.s, z25.s, z2.s[1]\n"
"fmla z18.s, z25.s, z1.s[1]\n"
"fmla z22.s, z25.s, z0.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.s, z24.s, z3.s[1]\n"
"fmla z15.s, z24.s, z2.s[1]\n"
- "ld1w { z25.s }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z19.s, z24.s, z1.s[1]\n"
"fmla z23.s, z24.s, z0.s[1]\n"
"ld1w { z24.s }, p5/Z, [x10, #-7, MUL VL]\n"
@@ -994,20 +992,20 @@ void sve_hybrid_fp32_mla_6x4VL (
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z25.s }, p5/Z, [x10]\n"
- "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.s, z25.s, z0.s[0]\n"
"fmla z12.s, z25.s, z1.s[0]\n"
- "fmla z9.s, z24.s, z0.s[0]\n"
- "fmla z13.s, z24.s, z1.s[0]\n"
"fmla z16.s, z25.s, z2.s[0]\n"
"fmla z20.s, z25.s, z3.s[0]\n"
"ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z24.s, z0.s[0]\n"
+ "fmla z13.s, z24.s, z1.s[0]\n"
"fmla z17.s, z24.s, z2.s[0]\n"
"fmla z21.s, z24.s, z3.s[0]\n"
"ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1023,12 +1021,12 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 50f\n"
"ld1w { z25.s }, p5/Z, [x10]\n"
"ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z25.s, z0.s[1]\n"
"fmla z12.s, z25.s, z1.s[1]\n"
"fmla z16.s, z25.s, z2.s[1]\n"
"fmla z20.s, z25.s, z3.s[1]\n"
"ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.s, z24.s, z0.s[1]\n"
"fmla z13.s, z24.s, z1.s[1]\n"
"fmla z17.s, z24.s, z2.s[1]\n"
@@ -1046,12 +1044,12 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 50f\n"
"ld1w { z25.s }, p5/Z, [x10]\n"
"ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z25.s, z0.s[2]\n"
"fmla z12.s, z25.s, z1.s[2]\n"
"fmla z16.s, z25.s, z2.s[2]\n"
"fmla z20.s, z25.s, z3.s[2]\n"
"ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
"fmla z9.s, z24.s, z0.s[2]\n"
"fmla z13.s, z24.s, z1.s[2]\n"
"fmla z17.s, z24.s, z2.s[2]\n"
@@ -1094,13 +1092,13 @@ void sve_hybrid_fp32_mla_6x4VL (
"cmp x28, x20\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 51f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z25.s }, p5/Z, [x21]\n"
"ld1rw { z24.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z25.s\n"
"fmin z9.s, p5/M, z9.s, z25.s\n"
@@ -1140,28 +1138,28 @@ void sve_hybrid_fp32_mla_6x4VL (
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x24]\n"
- "st1w { z21.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
"52:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"54:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1174,18 +1172,18 @@ void sve_hybrid_fp32_mla_6x4VL (
"cbz x12, 55f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1197,16 +1195,16 @@ void sve_hybrid_fp32_mla_6x4VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x23, x9, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x22]\n"
@@ -1247,8 +1245,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1276,29 +1274,29 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z29.s }, p5/Z, [x10]\n"
- "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x4\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqw { z1.s }, p0/Z, [x23]\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
"ld1rqw { z0.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
"fmla z8.s, z29.s, z4.s[0]\n"
"fmla z12.s, z29.s, z3.s[0]\n"
- "fmla z9.s, z28.s, z4.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z16.s, z29.s, z2.s[0]\n"
"fmla z20.s, z29.s, z1.s[0]\n"
+ "add x25, x25, #0x10\n"
"fmla z24.s, z29.s, z0.s[0]\n"
- "fmla z13.s, z28.s, z3.s[0]\n"
+ "fmla z9.s, z28.s, z4.s[0]\n"
"ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z28.s, z3.s[0]\n"
"fmla z17.s, z28.s, z2.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z21.s, z28.s, z1.s[0]\n"
"fmla z25.s, z28.s, z0.s[0]\n"
"ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1307,8 +1305,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z18.s, z29.s, z2.s[0]\n"
"fmla z22.s, z29.s, z1.s[0]\n"
"fmla z26.s, z29.s, z0.s[0]\n"
- "ld1w { z29.s }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.s, z28.s, z4.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z15.s, z28.s, z3.s[0]\n"
"fmla z19.s, z28.s, z2.s[0]\n"
"fmla z23.s, z28.s, z1.s[0]\n"
@@ -1319,8 +1317,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z16.s, z29.s, z2.s[1]\n"
"fmla z20.s, z29.s, z1.s[1]\n"
"fmla z24.s, z29.s, z0.s[1]\n"
- "ld1w { z29.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z13.s, z28.s, z3.s[1]\n"
"fmla z17.s, z28.s, z2.s[1]\n"
"fmla z21.s, z28.s, z1.s[1]\n"
@@ -1333,8 +1331,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z22.s, z29.s, z1.s[1]\n"
"fmla z26.s, z29.s, z0.s[1]\n"
"fmla z11.s, z28.s, z4.s[1]\n"
- "fmla z15.s, z28.s, z3.s[1]\n"
"ld1w { z29.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[1]\n"
"fmla z19.s, z28.s, z2.s[1]\n"
"fmla z23.s, z28.s, z1.s[1]\n"
"fmla z27.s, z28.s, z0.s[1]\n"
@@ -1344,8 +1342,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z16.s, z29.s, z2.s[2]\n"
"fmla z20.s, z29.s, z1.s[2]\n"
"fmla z24.s, z29.s, z0.s[2]\n"
- "ld1w { z29.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z13.s, z28.s, z3.s[2]\n"
"fmla z17.s, z28.s, z2.s[2]\n"
"fmla z21.s, z28.s, z1.s[2]\n"
@@ -1356,8 +1354,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z18.s, z29.s, z2.s[2]\n"
"fmla z22.s, z29.s, z1.s[2]\n"
"fmla z26.s, z29.s, z0.s[2]\n"
- "ld1w { z29.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z15.s, z28.s, z3.s[2]\n"
"fmla z19.s, z28.s, z2.s[2]\n"
"fmla z23.s, z28.s, z1.s[2]\n"
@@ -1368,8 +1366,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z16.s, z29.s, z2.s[3]\n"
"fmla z20.s, z29.s, z1.s[3]\n"
"fmla z24.s, z29.s, z0.s[3]\n"
- "ld1w { z29.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.s, z28.s, z4.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z13.s, z28.s, z3.s[3]\n"
"fmla z17.s, z28.s, z2.s[3]\n"
"fmla z21.s, z28.s, z1.s[3]\n"
@@ -1388,23 +1386,23 @@ void sve_hybrid_fp32_mla_6x4VL (
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z29.s }, p5/Z, [x10]\n"
- "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
"ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
"fmla z8.s, z29.s, z0.s[0]\n"
"fmla z12.s, z29.s, z1.s[0]\n"
- "fmla z9.s, z28.s, z0.s[0]\n"
- "fmla z13.s, z28.s, z1.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z16.s, z29.s, z2.s[0]\n"
"fmla z20.s, z29.s, z3.s[0]\n"
"fmla z24.s, z29.s, z4.s[0]\n"
- "fmla z17.s, z28.s, z2.s[0]\n"
+ "fmla z9.s, z28.s, z0.s[0]\n"
"ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z1.s[0]\n"
+ "fmla z17.s, z28.s, z2.s[0]\n"
"fmla z21.s, z28.s, z3.s[0]\n"
"fmla z25.s, z28.s, z4.s[0]\n"
"ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1422,21 +1420,21 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 63f\n"
"ld1w { z29.s }, p5/Z, [x10]\n"
"ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z29.s, z0.s[1]\n"
"fmla z12.s, z29.s, z1.s[1]\n"
"fmla z16.s, z29.s, z2.s[1]\n"
"fmla z20.s, z29.s, z3.s[1]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.s, z29.s, z4.s[1]\n"
- "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z28.s, z0.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.s, z28.s, z1.s[1]\n"
"fmla z17.s, z28.s, z2.s[1]\n"
"fmla z21.s, z28.s, z3.s[1]\n"
"fmla z25.s, z28.s, z4.s[1]\n"
"ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z29.s, z0.s[1]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.s, z29.s, z0.s[1]\n"
"fmla z14.s, z29.s, z1.s[1]\n"
"fmla z18.s, z29.s, z2.s[1]\n"
"fmla z22.s, z29.s, z3.s[1]\n"
@@ -1449,21 +1447,21 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 63f\n"
"ld1w { z29.s }, p5/Z, [x10]\n"
"ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z29.s, z0.s[2]\n"
"fmla z12.s, z29.s, z1.s[2]\n"
"fmla z16.s, z29.s, z2.s[2]\n"
"fmla z20.s, z29.s, z3.s[2]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.s, z29.s, z4.s[2]\n"
- "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z28.s, z0.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.s, z28.s, z1.s[2]\n"
"fmla z17.s, z28.s, z2.s[2]\n"
"fmla z21.s, z28.s, z3.s[2]\n"
"fmla z25.s, z28.s, z4.s[2]\n"
"ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z29.s, z0.s[2]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.s, z29.s, z0.s[2]\n"
"fmla z14.s, z29.s, z1.s[2]\n"
"fmla z18.s, z29.s, z2.s[2]\n"
"fmla z22.s, z29.s, z3.s[2]\n"
@@ -1481,8 +1479,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z16.s, z29.s, z2.s[3]\n"
"fmla z20.s, z29.s, z3.s[3]\n"
"fmla z24.s, z29.s, z4.s[3]\n"
- "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z28.s, z0.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.s, z28.s, z1.s[3]\n"
"fmla z17.s, z28.s, z2.s[3]\n"
"fmla z21.s, z28.s, z3.s[3]\n"
@@ -1505,14 +1503,14 @@ void sve_hybrid_fp32_mla_6x4VL (
"cmp x28, x20\n"
"bne 58b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 64f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z29.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z29.s }, p5/Z, [x21]\n"
"ld1rw { z28.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z29.s\n"
"fmin z9.s, p5/M, z9.s, z29.s\n"
@@ -1560,22 +1558,22 @@ void sve_hybrid_fp32_mla_6x4VL (
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x24]\n"
- "st1w { z21.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
"65:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1583,13 +1581,12 @@ void sve_hybrid_fp32_mla_6x4VL (
"b 80f\n"
"66:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"67:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1602,18 +1599,18 @@ void sve_hybrid_fp32_mla_6x4VL (
"cbz x12, 68f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"mov z12.d, z8.d\n"
"mov z13.d, z9.d\n"
- "mov z16.d, z8.d\n"
- "mov z17.d, z9.d\n"
- "mov z20.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1629,17 +1626,17 @@ void sve_hybrid_fp32_mla_6x4VL (
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x24]\n"
"ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x23]\n"
@@ -1688,8 +1685,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov x28, #0x0\n"
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1720,29 +1717,29 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1w { z1.s }, p5/Z, [x10]\n"
- "ld1w { z0.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z7.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqw { z6.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x4\n"
"ld1rqw { z5.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
"ld1rqw { z3.s }, p0/Z, [x22]\n"
"ld1rqw { z2.s }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1w { z1.s }, p5/Z, [x10]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.s, z1.s, z7.s[0]\n"
"fmla z12.s, z1.s, z6.s[0]\n"
- "add x21, x21, #0x10\n"
"fmla z16.s, z1.s, z5.s[0]\n"
"fmla z20.s, z1.s, z4.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z24.s, z1.s, z3.s[0]\n"
"fmla z28.s, z1.s, z2.s[0]\n"
"ld1w { z1.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
"fmla z9.s, z0.s, z7.s[0]\n"
"fmla z13.s, z0.s, z6.s[0]\n"
"fmla z17.s, z0.s, z5.s[0]\n"
@@ -1850,24 +1847,24 @@ void sve_hybrid_fp32_mla_6x4VL (
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1w { z7.s }, p5/Z, [x10]\n"
- "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
"ld1rqw { z4.s }, p0/Z, [x22]\n"
"ld1rqw { z5.s }, p0/Z, [x21]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.s, z7.s, z0.s[0]\n"
"fmla z12.s, z7.s, z1.s[0]\n"
- "fmla z9.s, z6.s, z0.s[0]\n"
- "fmla z13.s, z6.s, z1.s[0]\n"
"fmla z16.s, z7.s, z2.s[0]\n"
"fmla z20.s, z7.s, z3.s[0]\n"
"fmla z24.s, z7.s, z4.s[0]\n"
"fmla z28.s, z7.s, z5.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z1.s[0]\n"
"fmla z17.s, z6.s, z2.s[0]\n"
"fmla z21.s, z6.s, z3.s[0]\n"
"fmla z25.s, z6.s, z4.s[0]\n"
@@ -1889,23 +1886,23 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 76f\n"
"ld1w { z7.s }, p5/Z, [x10]\n"
"ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z7.s, z0.s[1]\n"
"fmla z12.s, z7.s, z1.s[1]\n"
"fmla z16.s, z7.s, z2.s[1]\n"
"fmla z20.s, z7.s, z3.s[1]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.s, z7.s, z4.s[1]\n"
"fmla z28.s, z7.s, z5.s[1]\n"
- "fmla z9.s, z6.s, z0.s[1]\n"
"ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[1]\n"
"fmla z13.s, z6.s, z1.s[1]\n"
"fmla z17.s, z6.s, z2.s[1]\n"
"fmla z21.s, z6.s, z3.s[1]\n"
"fmla z25.s, z6.s, z4.s[1]\n"
"fmla z29.s, z6.s, z5.s[1]\n"
"ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z7.s, z0.s[1]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.s, z7.s, z0.s[1]\n"
"fmla z14.s, z7.s, z1.s[1]\n"
"fmla z18.s, z7.s, z2.s[1]\n"
"fmla z22.s, z7.s, z3.s[1]\n"
@@ -1920,23 +1917,23 @@ void sve_hybrid_fp32_mla_6x4VL (
"ble 76f\n"
"ld1w { z7.s }, p5/Z, [x10]\n"
"ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x1\n"
"fmla z8.s, z7.s, z0.s[2]\n"
"fmla z12.s, z7.s, z1.s[2]\n"
"fmla z16.s, z7.s, z2.s[2]\n"
"fmla z20.s, z7.s, z3.s[2]\n"
+ "subs x27, x27, #0x1\n"
"fmla z24.s, z7.s, z4.s[2]\n"
"fmla z28.s, z7.s, z5.s[2]\n"
- "fmla z9.s, z6.s, z0.s[2]\n"
"ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[2]\n"
"fmla z13.s, z6.s, z1.s[2]\n"
"fmla z17.s, z6.s, z2.s[2]\n"
"fmla z21.s, z6.s, z3.s[2]\n"
"fmla z25.s, z6.s, z4.s[2]\n"
"fmla z29.s, z6.s, z5.s[2]\n"
"ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z7.s, z0.s[2]\n"
"addvl x10, x10, #4\n"
+ "fmla z10.s, z7.s, z0.s[2]\n"
"fmla z14.s, z7.s, z1.s[2]\n"
"fmla z18.s, z7.s, z2.s[2]\n"
"fmla z22.s, z7.s, z3.s[2]\n"
@@ -1984,15 +1981,15 @@ void sve_hybrid_fp32_mla_6x4VL (
"cmp x28, x20\n"
"bne 71b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 77f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x21]\n"
"ld1rw { z0.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z1.s\n"
"fmin z9.s, p5/M, z9.s, z1.s\n"
@@ -2048,26 +2045,26 @@ void sve_hybrid_fp32_mla_6x4VL (
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x26]\n"
- "st1w { z13.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x24]\n"
- "st1w { z21.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z28.s }, p4, [x22]\n"
- "st1w { z29.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
"78:" // Height 6: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -2084,8 +2081,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
index 5073e9cd7a..06a2d34767 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
@@ -70,7 +70,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 1, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 8, 1, 1> transforms = {};
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp32_mla_8x1VL;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
index ea3d266ec1..17eaa5641d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
@@ -47,19 +47,18 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -107,10 +105,10 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"cmp %x[M], #0x2\n"
"bgt 25f\n"
"beq 13f\n"
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p0.s, x20, x13\n"
@@ -128,8 +126,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"mov x10, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -148,22 +146,22 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, p1/M, z16.s, z0.s\n"
+ "addvl x12, x12, #1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
- "addvl x12, x12, #1\n"
"cmp x10, x20\n"
"fmla z24.s, p1/M, z16.s, z0.s\n"
+ "addvl x12, x12, #1\n"
"bne 6b\n"
"tbz %x[flags], #1, 11f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p1/Z, [x21]\n"
"ld1rw { z16.s }, p1/Z, [x20]\n"
"fmin z24.s, p1/M, z24.s, z17.s\n"
"fmax z24.s, p1/M, z24.s, z16.s\n"
@@ -176,23 +174,23 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"bgt 2b\n"
"b 98f\n"
"13:" // Height 2
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"14:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p0.s, x20, x13\n"
"cbz x14, 15f\n"
"ld1w { z24.s }, p1/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"b 17f\n"
"15:" // Height 2: no bias
"tbz %x[flags], #0, 16f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p0/Z, [x11]\n"
"add x20, x11, x20, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x11]\n"
"ld1w { z25.s }, p0/Z, [x20]\n"
"b 17f\n"
"16:" // Height 2: no accumulate
@@ -202,8 +200,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"mov x10, #0x0\n"
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -226,28 +224,28 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
"addvl x12, x12, #1\n"
- "fmla z24.s, p1/M, z16.s, z0.s\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
- "fmla z25.s, p1/M, z16.s, z1.s\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
"bgt 21b\n"
"22:" // Height 2: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
- "addvl x12, x12, #1\n"
"cmp x10, x20\n"
"fmla z24.s, p1/M, z16.s, z0.s\n"
"fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
"bne 18b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"tbz %x[flags], #1, 23f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p1/Z, [x21]\n"
"ld1rw { z16.s }, p1/Z, [x20]\n"
"fmin z24.s, p1/M, z24.s, z17.s\n"
"fmin z25.s, p1/M, z25.s, z17.s\n"
@@ -256,33 +254,33 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"23:" // Height 2: No activation
"st1w { z24.s }, p0, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p0, [x28]\n"
+ "st1w { z25.s }, p0, [x27]\n"
"24:" // Height 2: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 14b\n"
"b 98f\n"
"25:" // Height 3
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"26:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p0.s, x20, x13\n"
"cbz x14, 27f\n"
"ld1w { z24.s }, p1/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"b 29f\n"
"27:" // Height 3: no bias
"tbz %x[flags], #0, 28f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x11, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x20, x11, x21, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
"ld1w { z26.s }, p0/Z, [x20]\n"
"b 29f\n"
"28:" // Height 3: no accumulate
@@ -293,8 +291,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"mov x10, #0x0\n"
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -321,13 +319,13 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
- "addvl x12, x12, #1\n"
- "fmla z24.s, p1/M, z16.s, z0.s\n"
- "ld1rw { z0.s }, p1/Z, [x28]\n"
"fmla z25.s, p1/M, z16.s, z1.s\n"
"fmla z26.s, p1/M, z16.s, z2.s\n"
+ "addvl x12, x12, #1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
"ld1rw { z2.s }, p1/Z, [x26]\n"
"bgt 33b\n"
@@ -335,19 +333,19 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
- "addvl x12, x12, #1\n"
"cmp x10, x20\n"
"fmla z24.s, p1/M, z16.s, z0.s\n"
"fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, p1/M, z16.s, z2.s\n"
"bne 30b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
"tbz %x[flags], #1, 35f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p1/Z, [x21]\n"
"ld1rw { z16.s }, p1/Z, [x20]\n"
"fmin z24.s, p1/M, z24.s, z17.s\n"
"fmin z25.s, p1/M, z25.s, z17.s\n"
@@ -358,37 +356,37 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"35:" // Height 3: No activation
"st1w { z24.s }, p0, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p0, [x28]\n"
- "st1w { z26.s }, p0, [x27]\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
"36:" // Height 3: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 26b\n"
"b 98f\n"
"37:" // Height 4
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"38:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p0.s, x20, x13\n"
"cbz x14, 39f\n"
"ld1w { z24.s }, p1/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"b 41f\n"
"39:" // Height 4: no bias
"tbz %x[flags], #0, 40f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x11, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x20, x11, x21, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
"ld1w { z27.s }, p0/Z, [x20]\n"
"b 41f\n"
"40:" // Height 4: no accumulate
@@ -400,8 +398,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"mov x10, #0x0\n"
"42:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 43f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -432,38 +430,38 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"add x25, x25, #0x4\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
"addvl x12, x12, #1\n"
- "fmla z24.s, p1/M, z16.s, z0.s\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
- "fmla z25.s, p1/M, z16.s, z1.s\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
- "fmla z26.s, p1/M, z16.s, z2.s\n"
"ld1rw { z2.s }, p1/Z, [x26]\n"
- "fmla z27.s, p1/M, z16.s, z3.s\n"
"ld1rw { z3.s }, p1/Z, [x25]\n"
"bgt 45b\n"
"46:" // Height 4: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
- "addvl x12, x12, #1\n"
"cmp x10, x20\n"
"fmla z24.s, p1/M, z16.s, z0.s\n"
"fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, p1/M, z16.s, z2.s\n"
"fmla z27.s, p1/M, z16.s, z3.s\n"
"bne 42b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
"tbz %x[flags], #1, 47f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p1/Z, [x21]\n"
"ld1rw { z16.s }, p1/Z, [x20]\n"
"fmin z24.s, p1/M, z24.s, z17.s\n"
"fmin z25.s, p1/M, z25.s, z17.s\n"
@@ -476,41 +474,41 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"47:" // Height 4: No activation
"st1w { z24.s }, p0, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p0, [x28]\n"
- "st1w { z26.s }, p0, [x27]\n"
- "st1w { z27.s }, p0, [x26]\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x25]\n"
"48:" // Height 4: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 38b\n"
"b 98f\n"
"49:" // Height 5
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"50:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p0.s, x20, x13\n"
"cbz x14, 51f\n"
"ld1w { z24.s }, p1/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"b 53f\n"
"51:" // Height 5: no bias
"tbz %x[flags], #0, 52f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x11, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x20, x11, x21, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x23]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"ld1w { z28.s }, p0/Z, [x20]\n"
"b 53f\n"
"52:" // Height 5: no accumulate
@@ -523,8 +521,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"mov x10, #0x0\n"
"54:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 55f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -559,19 +557,19 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
- "add x25, x25, #0x4\n"
- "add x24, x24, #0x4\n"
- "addvl x12, x12, #1\n"
- "fmla z24.s, p1/M, z16.s, z0.s\n"
- "ld1rw { z0.s }, p1/Z, [x28]\n"
"fmla z25.s, p1/M, z16.s, z1.s\n"
"fmla z26.s, p1/M, z16.s, z2.s\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
"fmla z27.s, p1/M, z16.s, z3.s\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
"ld1rw { z2.s }, p1/Z, [x26]\n"
- "fmla z28.s, p1/M, z16.s, z4.s\n"
"ld1rw { z3.s }, p1/Z, [x25]\n"
"ld1rw { z4.s }, p1/Z, [x24]\n"
"bgt 57b\n"
@@ -579,23 +577,23 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
- "addvl x12, x12, #1\n"
"cmp x10, x20\n"
"fmla z24.s, p1/M, z16.s, z0.s\n"
"fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, p1/M, z16.s, z2.s\n"
"fmla z27.s, p1/M, z16.s, z3.s\n"
"fmla z28.s, p1/M, z16.s, z4.s\n"
"bne 54b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 59f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p1/Z, [x21]\n"
"ld1rw { z16.s }, p1/Z, [x20]\n"
"fmin z24.s, p1/M, z24.s, z17.s\n"
"fmin z25.s, p1/M, z25.s, z17.s\n"
@@ -610,44 +608,44 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"59:" // Height 5: No activation
"st1w { z24.s }, p0, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p0, [x28]\n"
- "st1w { z26.s }, p0, [x27]\n"
- "st1w { z27.s }, p0, [x26]\n"
- "st1w { z28.s }, p0, [x25]\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x25]\n"
+ "st1w { z28.s }, p0, [x24]\n"
"60:" // Height 5: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 50b\n"
"b 98f\n"
"61:" // Height 6
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"62:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p0.s, x20, x13\n"
"cbz x14, 63f\n"
"ld1w { z24.s }, p1/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
"b 65f\n"
"63:" // Height 6: no bias
"tbz %x[flags], #0, 64f\n"
- "ldr x22, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x11, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x20, x11, x22, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, x22, LSL #2\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x23]\n"
"ld1w { z26.s }, p0/Z, [x20]\n"
- "add x20, x20, x22, LSL #2\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "add x21, x20, x22, LSL #2\n"
- "add x20, x21, x22, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x22]\n"
"ld1w { z28.s }, p0/Z, [x21]\n"
"ld1w { z29.s }, p0/Z, [x20]\n"
"b 65f\n"
@@ -662,8 +660,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"mov x10, #0x0\n"
"66:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 67f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -702,48 +700,48 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
"add x23, x23, #0x4\n"
"addvl x12, x12, #1\n"
- "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
- "fmla z25.s, p1/M, z16.s, z1.s\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
- "fmla z26.s, p1/M, z16.s, z2.s\n"
"ld1rw { z2.s }, p1/Z, [x26]\n"
- "fmla z27.s, p1/M, z16.s, z3.s\n"
- "fmla z28.s, p1/M, z16.s, z4.s\n"
"ld1rw { z3.s }, p1/Z, [x25]\n"
"ld1rw { z4.s }, p1/Z, [x24]\n"
- "fmla z29.s, p1/M, z16.s, z5.s\n"
"ld1rw { z5.s }, p1/Z, [x23]\n"
"bgt 69b\n"
"70:" // Height 6: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
- "addvl x12, x12, #1\n"
"cmp x10, x20\n"
"fmla z24.s, p1/M, z16.s, z0.s\n"
"fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, p1/M, z16.s, z2.s\n"
"fmla z27.s, p1/M, z16.s, z3.s\n"
"fmla z28.s, p1/M, z16.s, z4.s\n"
"fmla z29.s, p1/M, z16.s, z5.s\n"
"bne 66b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 71f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p1/Z, [x21]\n"
"ld1rw { z16.s }, p1/Z, [x20]\n"
"fmin z24.s, p1/M, z24.s, z17.s\n"
"fmin z25.s, p1/M, z25.s, z17.s\n"
@@ -760,29 +758,29 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"71:" // Height 6: No activation
"st1w { z24.s }, p0, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p0, [x28]\n"
- "st1w { z26.s }, p0, [x27]\n"
- "st1w { z27.s }, p0, [x26]\n"
- "st1w { z28.s }, p0, [x25]\n"
- "st1w { z29.s }, p0, [x24]\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x25]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x23]\n"
"72:" // Height 6: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 62b\n"
"b 98f\n"
"73:" // Height 7
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"74:" // Height 7: Column loop
"mov x20, #0x0\n"
"whilelt p0.s, x20, x13\n"
"cbz x14, 75f\n"
"ld1w { z24.s }, p1/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
@@ -790,17 +788,17 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 77f\n"
"75:" // Height 7: no bias
"tbz %x[flags], #0, 76f\n"
- "ldr x23, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x11, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x20, x11, x23, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, x23, LSL #2\n"
+ "add x23, x20, x24, LSL #2\n"
+ "add x22, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
"ld1w { z26.s }, p0/Z, [x20]\n"
- "add x20, x20, x23, LSL #2\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "add x22, x20, x23, LSL #2\n"
- "add x21, x22, x23, LSL #2\n"
- "add x20, x21, x23, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x23]\n"
"ld1w { z28.s }, p0/Z, [x22]\n"
"ld1w { z29.s }, p0/Z, [x21]\n"
"ld1w { z30.s }, p0/Z, [x20]\n"
@@ -817,8 +815,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"mov x10, #0x0\n"
"78:" // Height 7: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 79f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -861,25 +859,25 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
"add x23, x23, #0x4\n"
"add x22, x22, #0x4\n"
- "fmla z24.s, p1/M, z16.s, z0.s\n"
- "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
"addvl x12, x12, #1\n"
- "fmla z25.s, p1/M, z16.s, z1.s\n"
- "fmla z26.s, p1/M, z16.s, z2.s\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
- "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z30.s, p1/M, z16.s, z6.s\n"
"ld1rw { z2.s }, p1/Z, [x26]\n"
- "fmla z28.s, p1/M, z16.s, z4.s\n"
"ld1rw { z3.s }, p1/Z, [x25]\n"
- "fmla z29.s, p1/M, z16.s, z5.s\n"
"ld1rw { z4.s }, p1/Z, [x24]\n"
- "fmla z30.s, p1/M, z16.s, z6.s\n"
"ld1rw { z5.s }, p1/Z, [x23]\n"
"ld1rw { z6.s }, p1/Z, [x22]\n"
"bgt 81b\n"
@@ -887,10 +885,10 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
- "addvl x12, x12, #1\n"
"cmp x10, x20\n"
"fmla z24.s, p1/M, z16.s, z0.s\n"
"fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, p1/M, z16.s, z2.s\n"
"fmla z27.s, p1/M, z16.s, z3.s\n"
"fmla z28.s, p1/M, z16.s, z4.s\n"
@@ -898,16 +896,16 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"fmla z30.s, p1/M, z16.s, z6.s\n"
"bne 78b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 83f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p1/Z, [x21]\n"
"ld1rw { z16.s }, p1/Z, [x20]\n"
"fmin z24.s, p1/M, z24.s, z17.s\n"
"fmin z25.s, p1/M, z25.s, z17.s\n"
@@ -926,12 +924,12 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"83:" // Height 7: No activation
"st1w { z24.s }, p0, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p0, [x28]\n"
- "st1w { z26.s }, p0, [x27]\n"
- "st1w { z27.s }, p0, [x26]\n"
- "st1w { z28.s }, p0, [x25]\n"
- "st1w { z29.s }, p0, [x24]\n"
- "st1w { z30.s }, p0, [x23]\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x25]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x23]\n"
+ "st1w { z30.s }, p0, [x22]\n"
"84:" // Height 7: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
@@ -939,21 +937,20 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 98f\n"
"85:" // Height 8
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x20\n"
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x11\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"86:" // Height 8: Column loop
"mov x20, #0x0\n"
"whilelt p0.s, x20, x13\n"
"cbz x14, 87f\n"
"ld1w { z24.s }, p1/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
@@ -962,20 +959,20 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 89f\n"
"87:" // Height 8: no bias
"tbz %x[flags], #0, 88f\n"
- "ldr x23, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x11, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x20, x11, x23, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, x23, LSL #2\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- "add x20, x20, x23, LSL #2\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "add x22, x20, x23, LSL #2\n"
- "add x20, x22, x23, LSL #2\n"
- "add x21, x20, x23, LSL #2\n"
- "ld1w { z28.s }, p0/Z, [x22]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
- "add x20, x21, x23, LSL #2\n"
+ "add x23, x21, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x23]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
"ld1w { z30.s }, p0/Z, [x21]\n"
"ld1w { z31.s }, p0/Z, [x20]\n"
"b 89f\n"
@@ -992,8 +989,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"mov x10, #0x0\n"
"90:" // Height 8: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 91f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1040,28 +1037,28 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
"add x23, x23, #0x4\n"
"add x22, x22, #0x4\n"
- "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"add x21, x21, #0x4\n"
"addvl x12, x12, #1\n"
- "fmla z25.s, p1/M, z16.s, z1.s\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
- "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z30.s, p1/M, z16.s, z6.s\n"
+ "fmla z31.s, p1/M, z16.s, z7.s\n"
"ld1rw { z2.s }, p1/Z, [x26]\n"
- "fmla z27.s, p1/M, z16.s, z3.s\n"
"ld1rw { z3.s }, p1/Z, [x25]\n"
- "fmla z28.s, p1/M, z16.s, z4.s\n"
"ld1rw { z4.s }, p1/Z, [x24]\n"
- "fmla z29.s, p1/M, z16.s, z5.s\n"
"ld1rw { z5.s }, p1/Z, [x23]\n"
- "fmla z30.s, p1/M, z16.s, z6.s\n"
- "fmla z31.s, p1/M, z16.s, z7.s\n"
"ld1rw { z6.s }, p1/Z, [x22]\n"
"ld1rw { z7.s }, p1/Z, [x21]\n"
"bgt 93b\n"
@@ -1069,10 +1066,10 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
- "addvl x12, x12, #1\n"
"cmp x10, x20\n"
"fmla z24.s, p1/M, z16.s, z0.s\n"
"fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, p1/M, z16.s, z2.s\n"
"fmla z27.s, p1/M, z16.s, z3.s\n"
"fmla z28.s, p1/M, z16.s, z4.s\n"
@@ -1081,17 +1078,17 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"fmla z31.s, p1/M, z16.s, z7.s\n"
"bne 90b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 95f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p1/Z, [x21]\n"
"ld1rw { z16.s }, p1/Z, [x20]\n"
"fmin z24.s, p1/M, z24.s, z17.s\n"
"fmin z25.s, p1/M, z25.s, z17.s\n"
@@ -1112,13 +1109,13 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"95:" // Height 8: No activation
"st1w { z24.s }, p0, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p0, [x28]\n"
- "st1w { z26.s }, p0, [x27]\n"
- "st1w { z27.s }, p0, [x26]\n"
- "st1w { z28.s }, p0, [x25]\n"
- "st1w { z29.s }, p0, [x24]\n"
- "st1w { z30.s }, p0, [x23]\n"
- "st1w { z31.s }, p0, [x22]\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x25]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x23]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x21]\n"
"96:" // Height 8: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
@@ -1135,8 +1132,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"98:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
index edfb3c0828..b79934094e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -47,19 +47,18 @@ void sve_hybrid_fp32_mla_8x1VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -80,7 +79,6 @@ void sve_hybrid_fp32_mla_8x1VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -107,10 +105,10 @@ void sve_hybrid_fp32_mla_8x1VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p1.s, x20, x13\n"
@@ -128,8 +126,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov x10, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -145,50 +143,50 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1w { z19.s }, p2/Z, [x12]\n"
- "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x9, x9, #0x4\n"
- "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
"ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "sub x9, x9, #0x4\n"
"cmp x9, #0x4\n"
- "addvl x12, x12, #4\n"
- "ld1rqw { z0.s }, p0/Z, [x28]\n"
- "add x28, x28, #0x10\n"
- "fmla z24.s, z19.s, z0.s[0]\n"
- "fmla z24.s, z18.s, z0.s[1]\n"
- "fmla z24.s, z17.s, z0.s[2]\n"
"fmla z24.s, z16.s, z0.s[3]\n"
+ "add x28, x28, #0x10\n"
+ "addvl x12, x12, #4\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
- "ld1rqw { z0.s }, p0/Z, [x28]\n"
"fmla z24.s, z16.s, z0.s[0]\n"
+ "addvl x12, x12, #1\n"
"ble 11f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[1]\n"
+ "addvl x12, x12, #1\n"
"ble 11f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[2]\n"
+ "addvl x12, x12, #1\n"
"ble 11f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x10, x10, #0x1\n"
"cmp x10, x20\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
"fmin z24.s, p2/M, z24.s, z17.s\n"
"fmax z24.s, p2/M, z24.s, z16.s\n"
@@ -201,23 +199,23 @@ void sve_hybrid_fp32_mla_8x1VL (
"bgt 2b\n"
"b 106f\n"
"14:" // Height 2
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"15:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p1.s, x20, x13\n"
"cbz x14, 16f\n"
"ld1w { z24.s }, p2/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"b 18f\n"
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p1/Z, [x11]\n"
"add x20, x11, x20, LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x11]\n"
"ld1w { z25.s }, p1/Z, [x20]\n"
"b 18f\n"
"17:" // Height 2: no accumulate
@@ -227,8 +225,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov x10, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -247,63 +245,63 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1w { z19.s }, p2/Z, [x12]\n"
- "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
"sub x9, x9, #0x4\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z1.s[0]\n"
+ "fmla z25.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z24.s, z16.s, z1.s[1]\n"
+ "fmla z25.s, z16.s, z0.s[1]\n"
"ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
"ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x9, #0x4\n"
- "addvl x12, x12, #4\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x28, x28, #0x10\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "add x27, x27, #0x10\n"
- "fmla z24.s, z19.s, z1.s[0]\n"
- "fmla z25.s, z19.s, z0.s[0]\n"
- "fmla z24.s, z18.s, z1.s[1]\n"
- "fmla z25.s, z18.s, z0.s[1]\n"
"fmla z24.s, z17.s, z1.s[2]\n"
"fmla z25.s, z17.s, z0.s[2]\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"fmla z24.s, z16.s, z1.s[3]\n"
"fmla z25.s, z16.s, z0.s[3]\n"
+ "add x27, x27, #0x10\n"
+ "addvl x12, x12, #4\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
- "ld1w { z16.s }, p2/Z, [x12]\n"
- "subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"ld1rqw { z0.s }, p0/Z, [x28]\n"
"ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"fmla z24.s, z16.s, z0.s[0]\n"
"fmla z25.s, z16.s, z1.s[0]\n"
+ "addvl x12, x12, #1\n"
"ble 24f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[1]\n"
"fmla z25.s, z16.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
"ble 24f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[2]\n"
"fmla z25.s, z16.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
"ble 24f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[3]\n"
"fmla z25.s, z16.s, z1.s[3]\n"
+ "addvl x12, x12, #1\n"
"24:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x10, x10, #0x1\n"
"cmp x10, x20\n"
"bne 19b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
"fmin z24.s, p2/M, z24.s, z17.s\n"
"fmin z25.s, p2/M, z25.s, z17.s\n"
@@ -312,33 +310,33 @@ void sve_hybrid_fp32_mla_8x1VL (
"25:" // Height 2: No activation
"st1w { z24.s }, p1, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p1, [x28]\n"
+ "st1w { z25.s }, p1, [x27]\n"
"26:" // Height 2: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 15b\n"
"b 106f\n"
"27:" // Height 3
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"28:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p1.s, x20, x13\n"
"cbz x14, 29f\n"
"ld1w { z24.s }, p2/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"b 31f\n"
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x11, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x20, x11, x21, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
"ld1w { z26.s }, p1/Z, [x20]\n"
"b 31f\n"
"30:" // Height 3: no accumulate
@@ -349,8 +347,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov x10, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -372,62 +370,62 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1w { z19.s }, p2/Z, [x12]\n"
- "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1rqw { z2.s }, p0/Z, [x28]\n"
+ "ld1rqw { z1.s }, p0/Z, [x27]\n"
"sub x9, x9, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z2.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
+ "fmla z26.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z24.s, z16.s, z2.s[1]\n"
"ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z0.s[1]\n"
"ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
"cmp x9, #0x4\n"
- "addvl x12, x12, #4\n"
- "ld1rqw { z2.s }, p0/Z, [x28]\n"
- "add x28, x28, #0x10\n"
- "ld1rqw { z1.s }, p0/Z, [x27]\n"
- "add x27, x27, #0x10\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "fmla z24.s, z19.s, z2.s[0]\n"
- "fmla z25.s, z19.s, z1.s[0]\n"
- "fmla z26.s, z19.s, z0.s[0]\n"
- "fmla z24.s, z18.s, z2.s[1]\n"
- "fmla z25.s, z18.s, z1.s[1]\n"
- "fmla z26.s, z18.s, z0.s[1]\n"
"fmla z24.s, z17.s, z2.s[2]\n"
"fmla z25.s, z17.s, z1.s[2]\n"
+ "add x28, x28, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z26.s, z17.s, z0.s[2]\n"
"fmla z24.s, z16.s, z2.s[3]\n"
+ "add x26, x26, #0x10\n"
+ "addvl x12, x12, #4\n"
"fmla z25.s, z16.s, z1.s[3]\n"
"fmla z26.s, z16.s, z0.s[3]\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
- "ld1w { z16.s }, p2/Z, [x12]\n"
- "subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"ld1rqw { z0.s }, p0/Z, [x28]\n"
"ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"fmla z24.s, z16.s, z0.s[0]\n"
"fmla z25.s, z16.s, z1.s[0]\n"
"fmla z26.s, z16.s, z2.s[0]\n"
+ "addvl x12, x12, #1\n"
"ble 37f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[1]\n"
"fmla z25.s, z16.s, z1.s[1]\n"
"fmla z26.s, z16.s, z2.s[1]\n"
+ "addvl x12, x12, #1\n"
"ble 37f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[2]\n"
"fmla z25.s, z16.s, z1.s[2]\n"
"fmla z26.s, z16.s, z2.s[2]\n"
+ "addvl x12, x12, #1\n"
"ble 37f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[3]\n"
"fmla z25.s, z16.s, z1.s[3]\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, z16.s, z2.s[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -435,12 +433,12 @@ void sve_hybrid_fp32_mla_8x1VL (
"cmp x10, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
"fmin z24.s, p2/M, z24.s, z17.s\n"
"fmin z25.s, p2/M, z25.s, z17.s\n"
@@ -451,37 +449,37 @@ void sve_hybrid_fp32_mla_8x1VL (
"38:" // Height 3: No activation
"st1w { z24.s }, p1, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p1, [x28]\n"
- "st1w { z26.s }, p1, [x27]\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
"39:" // Height 3: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 28b\n"
"b 106f\n"
"40:" // Height 4
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"41:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p1.s, x20, x13\n"
"cbz x14, 42f\n"
"ld1w { z24.s }, p2/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"b 44f\n"
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x11, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x20, x11, x21, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"ld1w { z27.s }, p1/Z, [x20]\n"
"b 44f\n"
"43:" // Height 4: no accumulate
@@ -493,8 +491,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov x10, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -519,29 +517,29 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1w { z19.s }, p2/Z, [x12]\n"
- "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x9, x9, #0x4\n"
- "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x9, #0x4\n"
- "addvl x12, x12, #4\n"
"ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x28, x28, #0x10\n"
"ld1rqw { z2.s }, p0/Z, [x27]\n"
- "add x27, x27, #0x10\n"
+ "sub x9, x9, #0x4\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
"ld1rqw { z0.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
- "fmla z24.s, z19.s, z3.s[0]\n"
- "fmla z25.s, z19.s, z2.s[0]\n"
- "fmla z26.s, z19.s, z1.s[0]\n"
- "fmla z27.s, z19.s, z0.s[0]\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z3.s[0]\n"
+ "fmla z25.s, z16.s, z2.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z26.s, z16.s, z1.s[0]\n"
+ "fmla z27.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
"fmla z24.s, z18.s, z3.s[1]\n"
"fmla z25.s, z18.s, z2.s[1]\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z18.s, z1.s[1]\n"
"fmla z27.s, z18.s, z0.s[1]\n"
+ "add x25, x25, #0x10\n"
+ "addvl x12, x12, #4\n"
"fmla z24.s, z17.s, z3.s[2]\n"
"fmla z25.s, z17.s, z2.s[2]\n"
"fmla z26.s, z17.s, z1.s[2]\n"
@@ -553,38 +551,38 @@ void sve_hybrid_fp32_mla_8x1VL (
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
- "ld1w { z16.s }, p2/Z, [x12]\n"
- "subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"ld1rqw { z0.s }, p0/Z, [x28]\n"
"ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"fmla z24.s, z16.s, z0.s[0]\n"
"fmla z25.s, z16.s, z1.s[0]\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, z16.s, z2.s[0]\n"
"fmla z27.s, z16.s, z3.s[0]\n"
"ble 50f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[1]\n"
"fmla z25.s, z16.s, z1.s[1]\n"
"fmla z26.s, z16.s, z2.s[1]\n"
"fmla z27.s, z16.s, z3.s[1]\n"
+ "addvl x12, x12, #1\n"
"ble 50f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[2]\n"
"fmla z25.s, z16.s, z1.s[2]\n"
"fmla z26.s, z16.s, z2.s[2]\n"
"fmla z27.s, z16.s, z3.s[2]\n"
+ "addvl x12, x12, #1\n"
"ble 50f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[3]\n"
"fmla z25.s, z16.s, z1.s[3]\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, z16.s, z2.s[3]\n"
"fmla z27.s, z16.s, z3.s[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
@@ -593,13 +591,13 @@ void sve_hybrid_fp32_mla_8x1VL (
"cmp x10, x20\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
"tbz %x[flags], #1, 51f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
"fmin z24.s, p2/M, z24.s, z17.s\n"
"fmin z25.s, p2/M, z25.s, z17.s\n"
@@ -612,41 +610,41 @@ void sve_hybrid_fp32_mla_8x1VL (
"51:" // Height 4: No activation
"st1w { z24.s }, p1, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p1, [x28]\n"
- "st1w { z26.s }, p1, [x27]\n"
- "st1w { z27.s }, p1, [x26]\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x25]\n"
"52:" // Height 4: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 41b\n"
"b 106f\n"
"53:" // Height 5
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"54:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p1.s, x20, x13\n"
"cbz x14, 55f\n"
"ld1w { z24.s }, p2/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"b 57f\n"
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x11, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x20, x11, x21, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
- "add x20, x20, x21, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x23]\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"ld1w { z28.s }, p1/Z, [x20]\n"
"b 57f\n"
"56:" // Height 5: no accumulate
@@ -659,8 +657,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov x10, #0x0\n"
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -688,33 +686,33 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1w { z19.s }, p2/Z, [x12]\n"
- "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x9, x9, #0x4\n"
- "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x9, #0x4\n"
- "addvl x12, x12, #4\n"
"ld1rqw { z4.s }, p0/Z, [x28]\n"
- "add x28, x28, #0x10\n"
"ld1rqw { z3.s }, p0/Z, [x27]\n"
- "add x27, x27, #0x10\n"
+ "sub x9, x9, #0x4\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"ld1rqw { z0.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "fmla z24.s, z19.s, z4.s[0]\n"
- "fmla z25.s, z19.s, z3.s[0]\n"
- "fmla z26.s, z19.s, z2.s[0]\n"
- "fmla z27.s, z19.s, z1.s[0]\n"
- "fmla z28.s, z19.s, z0.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z4.s[0]\n"
+ "fmla z25.s, z16.s, z3.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z1.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
"fmla z24.s, z18.s, z4.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "add x27, x27, #0x10\n"
"fmla z25.s, z18.s, z3.s[1]\n"
"fmla z26.s, z18.s, z2.s[1]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z18.s, z1.s[1]\n"
"fmla z28.s, z18.s, z0.s[1]\n"
+ "add x24, x24, #0x10\n"
+ "addvl x12, x12, #4\n"
"fmla z24.s, z17.s, z4.s[2]\n"
"fmla z25.s, z17.s, z3.s[2]\n"
"fmla z26.s, z17.s, z2.s[2]\n"
@@ -728,42 +726,42 @@ void sve_hybrid_fp32_mla_8x1VL (
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
- "ld1w { z16.s }, p2/Z, [x12]\n"
- "subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"ld1rqw { z0.s }, p0/Z, [x28]\n"
"ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
"ld1rqw { z4.s }, p0/Z, [x24]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"fmla z24.s, z16.s, z0.s[0]\n"
"fmla z25.s, z16.s, z1.s[0]\n"
"fmla z26.s, z16.s, z2.s[0]\n"
"fmla z27.s, z16.s, z3.s[0]\n"
+ "addvl x12, x12, #1\n"
"fmla z28.s, z16.s, z4.s[0]\n"
"ble 63f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[1]\n"
"fmla z25.s, z16.s, z1.s[1]\n"
"fmla z26.s, z16.s, z2.s[1]\n"
"fmla z27.s, z16.s, z3.s[1]\n"
+ "addvl x12, x12, #1\n"
"fmla z28.s, z16.s, z4.s[1]\n"
"ble 63f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[2]\n"
"fmla z25.s, z16.s, z1.s[2]\n"
"fmla z26.s, z16.s, z2.s[2]\n"
"fmla z27.s, z16.s, z3.s[2]\n"
+ "addvl x12, x12, #1\n"
"fmla z28.s, z16.s, z4.s[2]\n"
"ble 63f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[3]\n"
"fmla z25.s, z16.s, z1.s[3]\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, z16.s, z2.s[3]\n"
"fmla z27.s, z16.s, z3.s[3]\n"
"fmla z28.s, z16.s, z4.s[3]\n"
@@ -773,14 +771,14 @@ void sve_hybrid_fp32_mla_8x1VL (
"cmp x10, x20\n"
"bne 58b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 64f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
"fmin z24.s, p2/M, z24.s, z17.s\n"
"fmin z25.s, p2/M, z25.s, z17.s\n"
@@ -795,44 +793,44 @@ void sve_hybrid_fp32_mla_8x1VL (
"64:" // Height 5: No activation
"st1w { z24.s }, p1, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p1, [x28]\n"
- "st1w { z26.s }, p1, [x27]\n"
- "st1w { z27.s }, p1, [x26]\n"
- "st1w { z28.s }, p1, [x25]\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ "st1w { z28.s }, p1, [x24]\n"
"65:" // Height 5: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 54b\n"
"b 106f\n"
"66:" // Height 6
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"67:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p1.s, x20, x13\n"
"cbz x14, 68f\n"
"ld1w { z24.s }, p2/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
"b 70f\n"
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
- "ldr x22, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x11, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x20, x11, x22, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
- "add x20, x20, x22, LSL #2\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x23]\n"
"ld1w { z26.s }, p1/Z, [x20]\n"
- "add x20, x20, x22, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
- "add x21, x20, x22, LSL #2\n"
- "add x20, x21, x22, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x22]\n"
"ld1w { z28.s }, p1/Z, [x21]\n"
"ld1w { z29.s }, p1/Z, [x20]\n"
"b 70f\n"
@@ -847,8 +845,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov x10, #0x0\n"
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -879,33 +877,33 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1w { z19.s }, p2/Z, [x12]\n"
- "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x9, x9, #0x4\n"
- "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x9, #0x4\n"
- "addvl x12, x12, #4\n"
"ld1rqw { z5.s }, p0/Z, [x28]\n"
- "add x28, x28, #0x10\n"
"ld1rqw { z4.s }, p0/Z, [x27]\n"
- "add x27, x27, #0x10\n"
+ "sub x9, x9, #0x4\n"
"ld1rqw { z3.s }, p0/Z, [x26]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
"ld1rqw { z0.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
"fmla z24.s, z19.s, z5.s[0]\n"
"fmla z25.s, z19.s, z4.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
"fmla z26.s, z19.s, z3.s[0]\n"
"fmla z27.s, z19.s, z2.s[0]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
"fmla z28.s, z19.s, z1.s[0]\n"
"fmla z29.s, z19.s, z0.s[0]\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z24.s, z18.s, z5.s[1]\n"
"fmla z25.s, z18.s, z4.s[1]\n"
+ "add x23, x23, #0x10\n"
+ "addvl x12, x12, #4\n"
"fmla z26.s, z18.s, z3.s[1]\n"
"fmla z27.s, z18.s, z2.s[1]\n"
"fmla z28.s, z18.s, z1.s[1]\n"
@@ -925,17 +923,17 @@ void sve_hybrid_fp32_mla_8x1VL (
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
- "ld1w { z16.s }, p2/Z, [x12]\n"
- "subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"ld1rqw { z0.s }, p0/Z, [x28]\n"
"ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
"ld1rqw { z4.s }, p0/Z, [x24]\n"
"ld1rqw { z5.s }, p0/Z, [x23]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"fmla z24.s, z16.s, z0.s[0]\n"
"fmla z25.s, z16.s, z1.s[0]\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, z16.s, z2.s[0]\n"
"fmla z27.s, z16.s, z3.s[0]\n"
"fmla z28.s, z16.s, z4.s[0]\n"
@@ -943,28 +941,28 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 76f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[1]\n"
"fmla z25.s, z16.s, z1.s[1]\n"
"fmla z26.s, z16.s, z2.s[1]\n"
"fmla z27.s, z16.s, z3.s[1]\n"
+ "addvl x12, x12, #1\n"
"fmla z28.s, z16.s, z4.s[1]\n"
"fmla z29.s, z16.s, z5.s[1]\n"
"ble 76f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[2]\n"
"fmla z25.s, z16.s, z1.s[2]\n"
"fmla z26.s, z16.s, z2.s[2]\n"
"fmla z27.s, z16.s, z3.s[2]\n"
+ "addvl x12, x12, #1\n"
"fmla z28.s, z16.s, z4.s[2]\n"
"fmla z29.s, z16.s, z5.s[2]\n"
"ble 76f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[3]\n"
"fmla z25.s, z16.s, z1.s[3]\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, z16.s, z2.s[3]\n"
"fmla z27.s, z16.s, z3.s[3]\n"
"fmla z28.s, z16.s, z4.s[3]\n"
@@ -975,15 +973,15 @@ void sve_hybrid_fp32_mla_8x1VL (
"cmp x10, x20\n"
"bne 71b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 77f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
"fmin z24.s, p2/M, z24.s, z17.s\n"
"fmin z25.s, p2/M, z25.s, z17.s\n"
@@ -1000,29 +998,29 @@ void sve_hybrid_fp32_mla_8x1VL (
"77:" // Height 6: No activation
"st1w { z24.s }, p1, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p1, [x28]\n"
- "st1w { z26.s }, p1, [x27]\n"
- "st1w { z27.s }, p1, [x26]\n"
- "st1w { z28.s }, p1, [x25]\n"
- "st1w { z29.s }, p1, [x24]\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ "st1w { z28.s }, p1, [x24]\n"
+ "st1w { z29.s }, p1, [x23]\n"
"78:" // Height 6: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 67b\n"
"b 106f\n"
"79:" // Height 7
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
"80:" // Height 7: Column loop
"mov x20, #0x0\n"
"whilelt p1.s, x20, x13\n"
"cbz x14, 81f\n"
"ld1w { z24.s }, p2/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
@@ -1030,17 +1028,17 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 83f\n"
"81:" // Height 7: no bias
"tbz %x[flags], #0, 82f\n"
- "ldr x23, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x11, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x20, x11, x23, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
- "add x20, x20, x23, LSL #2\n"
+ "add x23, x20, x24, LSL #2\n"
+ "add x22, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
"ld1w { z26.s }, p1/Z, [x20]\n"
- "add x20, x20, x23, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
- "add x22, x20, x23, LSL #2\n"
- "add x21, x22, x23, LSL #2\n"
- "add x20, x21, x23, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
"ld1w { z28.s }, p1/Z, [x22]\n"
"ld1w { z29.s }, p1/Z, [x21]\n"
"ld1w { z30.s }, p1/Z, [x20]\n"
@@ -1057,8 +1055,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov x10, #0x0\n"
"84:" // Height 7: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 85f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1092,37 +1090,37 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 88f\n"
"87:" // Height 7: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1w { z19.s }, p2/Z, [x12]\n"
- "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x9, x9, #0x4\n"
- "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x9, #0x4\n"
- "addvl x12, x12, #4\n"
"ld1rqw { z6.s }, p0/Z, [x28]\n"
- "add x28, x28, #0x10\n"
"ld1rqw { z5.s }, p0/Z, [x27]\n"
- "add x27, x27, #0x10\n"
+ "sub x9, x9, #0x4\n"
"ld1rqw { z4.s }, p0/Z, [x26]\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z1.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"ld1rqw { z0.s }, p0/Z, [x22]\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
"fmla z24.s, z19.s, z6.s[0]\n"
"fmla z25.s, z19.s, z5.s[0]\n"
- "add x22, x22, #0x10\n"
"fmla z26.s, z19.s, z4.s[0]\n"
"fmla z27.s, z19.s, z3.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
"fmla z28.s, z19.s, z2.s[0]\n"
"fmla z29.s, z19.s, z1.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"fmla z30.s, z19.s, z0.s[0]\n"
"fmla z24.s, z18.s, z6.s[1]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z25.s, z18.s, z5.s[1]\n"
"fmla z26.s, z18.s, z4.s[1]\n"
+ "add x22, x22, #0x10\n"
+ "addvl x12, x12, #4\n"
"fmla z27.s, z18.s, z3.s[1]\n"
"fmla z28.s, z18.s, z2.s[1]\n"
"fmla z29.s, z18.s, z1.s[1]\n"
@@ -1144,50 +1142,50 @@ void sve_hybrid_fp32_mla_8x1VL (
"bgt 87b\n"
"88:" // Height 7: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
- "ld1w { z16.s }, p2/Z, [x12]\n"
- "subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"ld1rqw { z0.s }, p0/Z, [x28]\n"
"ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
"ld1rqw { z4.s }, p0/Z, [x24]\n"
"ld1rqw { z5.s }, p0/Z, [x23]\n"
"ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"fmla z24.s, z16.s, z0.s[0]\n"
"fmla z25.s, z16.s, z1.s[0]\n"
"fmla z26.s, z16.s, z2.s[0]\n"
"fmla z27.s, z16.s, z3.s[0]\n"
+ "addvl x12, x12, #1\n"
"fmla z28.s, z16.s, z4.s[0]\n"
"fmla z29.s, z16.s, z5.s[0]\n"
"fmla z30.s, z16.s, z6.s[0]\n"
"ble 89f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[1]\n"
"fmla z25.s, z16.s, z1.s[1]\n"
"fmla z26.s, z16.s, z2.s[1]\n"
"fmla z27.s, z16.s, z3.s[1]\n"
+ "addvl x12, x12, #1\n"
"fmla z28.s, z16.s, z4.s[1]\n"
"fmla z29.s, z16.s, z5.s[1]\n"
"fmla z30.s, z16.s, z6.s[1]\n"
"ble 89f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[2]\n"
"fmla z25.s, z16.s, z1.s[2]\n"
"fmla z26.s, z16.s, z2.s[2]\n"
"fmla z27.s, z16.s, z3.s[2]\n"
+ "addvl x12, x12, #1\n"
"fmla z28.s, z16.s, z4.s[2]\n"
"fmla z29.s, z16.s, z5.s[2]\n"
"fmla z30.s, z16.s, z6.s[2]\n"
"ble 89f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[3]\n"
"fmla z25.s, z16.s, z1.s[3]\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, z16.s, z2.s[3]\n"
"fmla z27.s, z16.s, z3.s[3]\n"
"fmla z28.s, z16.s, z4.s[3]\n"
@@ -1199,16 +1197,16 @@ void sve_hybrid_fp32_mla_8x1VL (
"cmp x10, x20\n"
"bne 84b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 90f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
"fmin z24.s, p2/M, z24.s, z17.s\n"
"fmin z25.s, p2/M, z25.s, z17.s\n"
@@ -1227,12 +1225,12 @@ void sve_hybrid_fp32_mla_8x1VL (
"90:" // Height 7: No activation
"st1w { z24.s }, p1, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p1, [x28]\n"
- "st1w { z26.s }, p1, [x27]\n"
- "st1w { z27.s }, p1, [x26]\n"
- "st1w { z28.s }, p1, [x25]\n"
- "st1w { z29.s }, p1, [x24]\n"
- "st1w { z30.s }, p1, [x23]\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ "st1w { z28.s }, p1, [x24]\n"
+ "st1w { z29.s }, p1, [x23]\n"
+ "st1w { z30.s }, p1, [x22]\n"
"91:" // Height 7: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
@@ -1240,21 +1238,20 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 106f\n"
"92:" // Height 8
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x20\n"
- "ldr x14, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x11\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"93:" // Height 8: Column loop
"mov x20, #0x0\n"
"whilelt p1.s, x20, x13\n"
"cbz x14, 94f\n"
"ld1w { z24.s }, p2/Z, [x14]\n"
- "addvl x14, x14, #1\n"
"mov z25.d, z24.d\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
@@ -1263,20 +1260,20 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 96f\n"
"94:" // Height 8: no bias
"tbz %x[flags], #0, 95f\n"
- "ldr x23, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x11, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x20, x11, x23, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
- "add x20, x20, x23, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
- "add x20, x20, x23, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
- "add x22, x20, x23, LSL #2\n"
- "add x20, x22, x23, LSL #2\n"
- "add x21, x20, x23, LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x22]\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
- "add x20, x21, x23, LSL #2\n"
+ "add x23, x21, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x22]\n"
"ld1w { z30.s }, p1/Z, [x21]\n"
"ld1w { z31.s }, p1/Z, [x20]\n"
"b 96f\n"
@@ -1293,8 +1290,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov x10, #0x0\n"
"97:" // Height 8: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 98f\n"
"ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1331,37 +1328,37 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 101f\n"
"100:" // Height 8: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1w { z19.s }, p2/Z, [x12]\n"
- "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x9, x9, #0x4\n"
- "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x9, #0x4\n"
- "addvl x12, x12, #4\n"
"ld1rqw { z7.s }, p0/Z, [x28]\n"
- "add x28, x28, #0x10\n"
"ld1rqw { z6.s }, p0/Z, [x27]\n"
- "add x27, x27, #0x10\n"
+ "sub x9, x9, #0x4\n"
"ld1rqw { z5.s }, p0/Z, [x26]\n"
"ld1rqw { z4.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
"ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"ld1rqw { z1.s }, p0/Z, [x22]\n"
"ld1rqw { z0.s }, p0/Z, [x21]\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
"fmla z24.s, z19.s, z7.s[0]\n"
"fmla z25.s, z19.s, z6.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
"fmla z26.s, z19.s, z5.s[0]\n"
"fmla z27.s, z19.s, z4.s[0]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
"fmla z28.s, z19.s, z3.s[0]\n"
"fmla z29.s, z19.s, z2.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z30.s, z19.s, z1.s[0]\n"
"fmla z31.s, z19.s, z0.s[0]\n"
+ "add x21, x21, #0x10\n"
+ "addvl x12, x12, #4\n"
"fmla z24.s, z18.s, z7.s[1]\n"
"fmla z25.s, z18.s, z6.s[1]\n"
"fmla z26.s, z18.s, z5.s[1]\n"
@@ -1389,19 +1386,19 @@ void sve_hybrid_fp32_mla_8x1VL (
"bgt 100b\n"
"101:" // Height 8: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
- "ld1w { z16.s }, p2/Z, [x12]\n"
- "subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"ld1rqw { z0.s }, p0/Z, [x28]\n"
"ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
"ld1rqw { z4.s }, p0/Z, [x24]\n"
"ld1rqw { z5.s }, p0/Z, [x23]\n"
"ld1rqw { z6.s }, p0/Z, [x22]\n"
"ld1rqw { z7.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"fmla z24.s, z16.s, z0.s[0]\n"
"fmla z25.s, z16.s, z1.s[0]\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, z16.s, z2.s[0]\n"
"fmla z27.s, z16.s, z3.s[0]\n"
"fmla z28.s, z16.s, z4.s[0]\n"
@@ -1411,11 +1408,11 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 102f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[1]\n"
"fmla z25.s, z16.s, z1.s[1]\n"
"fmla z26.s, z16.s, z2.s[1]\n"
"fmla z27.s, z16.s, z3.s[1]\n"
+ "addvl x12, x12, #1\n"
"fmla z28.s, z16.s, z4.s[1]\n"
"fmla z29.s, z16.s, z5.s[1]\n"
"fmla z30.s, z16.s, z6.s[1]\n"
@@ -1423,20 +1420,20 @@ void sve_hybrid_fp32_mla_8x1VL (
"ble 102f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[2]\n"
"fmla z25.s, z16.s, z1.s[2]\n"
"fmla z26.s, z16.s, z2.s[2]\n"
"fmla z27.s, z16.s, z3.s[2]\n"
+ "addvl x12, x12, #1\n"
"fmla z28.s, z16.s, z4.s[2]\n"
"fmla z29.s, z16.s, z5.s[2]\n"
"fmla z30.s, z16.s, z6.s[2]\n"
"fmla z31.s, z16.s, z7.s[2]\n"
"ble 102f\n"
"ld1w { z16.s }, p2/Z, [x12]\n"
- "addvl x12, x12, #1\n"
"fmla z24.s, z16.s, z0.s[3]\n"
"fmla z25.s, z16.s, z1.s[3]\n"
+ "addvl x12, x12, #1\n"
"fmla z26.s, z16.s, z2.s[3]\n"
"fmla z27.s, z16.s, z3.s[3]\n"
"fmla z28.s, z16.s, z4.s[3]\n"
@@ -1449,17 +1446,17 @@ void sve_hybrid_fp32_mla_8x1VL (
"cmp x10, x20\n"
"bne 97b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x28, x11, x20, LSL #2\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"add x26, x27, x20, LSL #2\n"
"add x25, x26, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 103f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x21]\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
"fmin z24.s, p2/M, z24.s, z17.s\n"
"fmin z25.s, p2/M, z25.s, z17.s\n"
@@ -1480,13 +1477,13 @@ void sve_hybrid_fp32_mla_8x1VL (
"103:" // Height 8: No activation
"st1w { z24.s }, p1, [x11]\n"
"addvl x11, x11, #1\n"
- "st1w { z25.s }, p1, [x28]\n"
- "st1w { z26.s }, p1, [x27]\n"
- "st1w { z27.s }, p1, [x26]\n"
- "st1w { z28.s }, p1, [x25]\n"
- "st1w { z29.s }, p1, [x24]\n"
- "st1w { z30.s }, p1, [x23]\n"
- "st1w { z31.s }, p1, [x22]\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ "st1w { z28.s }, p1, [x24]\n"
+ "st1w { z29.s }, p1, [x23]\n"
+ "st1w { z30.s }, p1, [x22]\n"
+ "st1w { z31.s }, p1, [x21]\n"
"104:" // Height 8: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
@@ -1503,8 +1500,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"106:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
index d7436b15f4..7dc786fd66 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 4, 12, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 12, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
index be245f9ecc..b0d6f756d7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
@@ -48,19 +48,18 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -81,7 +80,6 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -102,10 +100,10 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p6.s, x20, x9\n"
@@ -122,19 +120,19 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"cbz x10, 3f\n"
"ld1w { z8.s }, p7/Z, [x10]\n"
"ld1w { z9.s }, p7/Z, [x10, #1, MUL VL]\n"
- "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
- "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
- "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
- "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
- "addvl x10, x10, #6\n"
"zip2 z14.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
"zip2 z15.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
"zip2 z16.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
"zip2 z17.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
+ "addvl x10, x10, #6\n"
"zip2 z18.d, z12.d, z12.d\n"
"zip1 z12.d, z12.d, z12.d\n"
"zip2 z19.d, z13.d, z13.d\n"
@@ -142,16 +140,16 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"b 5f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 4f\n"
- "ld1w { z25.s }, p6/Z, [x27]\n"
- "ld1w { z24.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x27]\n"
+ "ld1w { z20.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "zip1 z8.d, z21.d, z14.d\n"
+ "zip2 z14.d, z21.d, z14.d\n"
"ld1w { z23.s }, p4/Z, [x27, #2, MUL VL]\n"
"ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "zip1 z9.d, z20.d, z15.d\n"
+ "zip2 z15.d, z20.d, z15.d\n"
"ld1w { z21.s }, p2/Z, [x27, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
- "zip1 z8.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "zip1 z9.d, z24.d, z15.d\n"
- "zip2 z15.d, z24.d, z15.d\n"
"zip1 z10.d, z23.d, z16.d\n"
"zip2 z16.d, z23.d, z16.d\n"
"zip1 z11.d, z22.d, z17.d\n"
@@ -178,8 +176,8 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"mov x26, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -195,70 +193,70 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1h { z23.h }, p7/Z, [x28]\n"
- "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
- "sub x25, x25, #0x4\n"
- "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
- "cmp x25, #0x4\n"
"ld1rqw { z24.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
"uzp1 z24.h, z24.h, z24.h\n"
- ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
- ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
- "ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n"
+ "ld1h { z21.h }, p7/Z, [x28]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e708 // bfmmla z8.s, z24.h, z21.h\n"
+ ".inst 0x6474e70e // bfmmla z14.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
- "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6475e70a // bfmmla z10.s, z24.h, z21.h\n"
+ ".inst 0x6474e710 // bfmmla z16.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
"ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
- ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
"ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
"ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ "sub x25, x25, #0x4\n"
"ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
"ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
- "addvl x28, x28, #-4\n"
+ "cmp x25, #0x4\n"
".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ "add x24, x24, #0x10\n"
+ "addvl x28, x28, #-4\n"
".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1h { z23.h }, p7/Z, [x28]\n"
- "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "ld1rqw { z23.s }, p0/Z, [x24]\n"
+ ".inst 0x658abef7 // bfcvt z23.h, p7/M, z23.s\n"
+ "uzp1 z23.h, z23.h, z23.h\n"
+ "ld1h { z21.h }, p7/Z, [x28]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e6e8 // bfmmla z8.s, z23.h, z21.h\n"
+ ".inst 0x6474e6ee // bfmmla z14.s, z23.h, z20.h\n"
"ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
"ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
- "ld1rqw { z24.s }, p0/Z, [x24]\n"
- ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
- "uzp1 z24.h, z24.h, z24.h\n"
- ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
- ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
- "ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6475e6e9 // bfmmla z9.s, z23.h, z21.h\n"
+ ".inst 0x6474e6ef // bfmmla z15.s, z23.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6475e6ea // bfmmla z10.s, z23.h, z21.h\n"
+ ".inst 0x6474e6f0 // bfmmla z16.s, z23.h, z20.h\n"
"ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
- ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
"ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
- ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
- ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
- ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
- "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6475e6eb // bfmmla z11.s, z23.h, z21.h\n"
+ ".inst 0x6474e6f1 // bfmmla z17.s, z23.h, z20.h\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-8, MUL VL]\n"
"ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x6474e6ec // bfmmla z12.s, z23.h, z20.h\n"
"ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
"ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6476e6f2 // bfmmla z18.s, z23.h, z22.h\n"
+ ".inst 0x6475e6ed // bfmmla z13.s, z23.h, z21.h\n"
+ ".inst 0x6474e6f3 // bfmmla z19.s, z23.h, z20.h\n"
"addvl x28, x28, #-4\n"
- ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
- ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
- ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
- ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -271,9 +269,9 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"uzp1 z12.d, z12.d, z18.d\n"
"uzp1 z13.d, z13.d, z19.d\n"
"tbz %x[flags], #1, 12f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z21.s }, p7/Z, [x21]\n"
"ld1rw { z20.s }, p7/Z, [x20]\n"
"fmin z8.s, p7/M, z8.s, z21.s\n"
"fmin z9.s, p7/M, z9.s, z21.s\n"
@@ -301,10 +299,10 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"bgt 2b\n"
"b 54f\n"
"14:" // Height 2
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"15:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p6.s, x20, x9\n"
@@ -321,19 +319,19 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"cbz x10, 16f\n"
"ld1w { z8.s }, p7/Z, [x10]\n"
"ld1w { z9.s }, p7/Z, [x10, #1, MUL VL]\n"
- "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
- "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
- "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
- "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
- "addvl x10, x10, #6\n"
"zip2 z14.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
"zip2 z15.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
"zip2 z16.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
"zip2 z17.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
+ "addvl x10, x10, #6\n"
"zip2 z18.d, z12.d, z12.d\n"
"zip1 z12.d, z12.d, z12.d\n"
"zip2 z19.d, z13.d, z13.d\n"
@@ -342,25 +340,25 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z25.s }, p6/Z, [x27]\n"
- "ld1w { z24.s }, p5/Z, [x27, #1, MUL VL]\n"
- "ld1w { z23.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "add x20, x27, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x27]\n"
+ "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
"ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
"ld1w { z21.s }, p2/Z, [x27, #4, MUL VL]\n"
- "add x20, x27, x20, LSL #2\n"
"ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
"ld1w { z14.s }, p6/Z, [x20]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
"ld1w { z15.s }, p5/Z, [x20, #1, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
"ld1w { z17.s }, p3/Z, [x20, #3, MUL VL]\n"
"ld1w { z18.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
"ld1w { z19.s }, p1/Z, [x20, #5, MUL VL]\n"
- "zip1 z8.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "zip1 z9.d, z24.d, z15.d\n"
- "zip2 z15.d, z24.d, z15.d\n"
- "zip1 z10.d, z23.d, z16.d\n"
- "zip2 z16.d, z23.d, z16.d\n"
"zip1 z11.d, z22.d, z17.d\n"
"zip2 z17.d, z22.d, z17.d\n"
"zip1 z12.d, z21.d, z18.d\n"
@@ -385,8 +383,8 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"mov x26, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -405,77 +403,77 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1h { z23.h }, p7/Z, [x28]\n"
- "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
- "sub x25, x25, #0x4\n"
- "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z25.h }, p7/Z, [x28, #3, MUL VL]\n"
- "cmp x25, #0x4\n"
"ld1rqw { z24.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqw { z20.s }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
"uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
"uzp1 z20.h, z20.h, z20.h\n"
"trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
"ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n"
".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
"ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
- ".inst 0x6479e70f // bfmmla z15.s, z24.h, z25.h\n"
"ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
- "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
"ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
"ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
"ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
- "addvl x28, x28, #-4\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
+ "addvl x28, x28, #-4\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1h { z23.h }, p7/Z, [x28]\n"
- "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
- "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z25.h }, p7/Z, [x28, #3, MUL VL]\n"
"ld1rqw { z24.s }, p0/Z, [x24]\n"
"ld1rqw { z20.s }, p0/Z, [x23]\n"
".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
"uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
"uzp1 z20.h, z20.h, z20.h\n"
"trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
- "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
"ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n"
".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
"ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
- ".inst 0x6479e70f // bfmmla z15.s, z24.h, z25.h\n"
"ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
- "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
"ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
"ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
"ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
- "addvl x28, x28, #-4\n"
".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ "addvl x28, x28, #-4\n"
".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
"24:" // Height 2: Multiply loop: multiply skip
@@ -486,21 +484,21 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z4.d, z8.d, z14.d\n"
"uzp2 z8.d, z8.d, z14.d\n"
+ "add x23, x27, x20, LSL #2\n"
"uzp1 z14.d, z9.d, z15.d\n"
"uzp2 z9.d, z9.d, z15.d\n"
"uzp1 z15.d, z10.d, z16.d\n"
"uzp2 z10.d, z10.d, z16.d\n"
"uzp1 z16.d, z11.d, z17.d\n"
"uzp2 z11.d, z11.d, z17.d\n"
- "add x24, x27, x20, LSL #2\n"
"uzp1 z17.d, z12.d, z18.d\n"
"uzp2 z12.d, z12.d, z18.d\n"
"uzp1 z18.d, z13.d, z19.d\n"
"uzp2 z13.d, z13.d, z19.d\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z20.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z20.s }, p7/Z, [x21]\n"
"ld1rw { z19.s }, p7/Z, [x20]\n"
"fmin z4.s, p7/M, z4.s, z20.s\n"
"fmin z14.s, p7/M, z14.s, z20.s\n"
@@ -534,22 +532,22 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"st1w { z17.s }, p2, [x27, #4, MUL VL]\n"
"st1w { z18.s }, p1, [x27, #5, MUL VL]\n"
"addvl x27, x27, #6\n"
- "st1w { z8.s }, p6, [x24]\n"
- "st1w { z9.s }, p5, [x24, #1, MUL VL]\n"
- "st1w { z10.s }, p4, [x24, #2, MUL VL]\n"
- "st1w { z11.s }, p3, [x24, #3, MUL VL]\n"
- "st1w { z12.s }, p2, [x24, #4, MUL VL]\n"
- "st1w { z13.s }, p1, [x24, #5, MUL VL]\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
"26:" // Height 2: Writeback done
"decw x9, ALL, MUL #6\n"
"cmp x9, XZR\n"
"bgt 15b\n"
"b 54f\n"
"27:" // Height 3
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"28:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p6.s, x20, x9\n"
@@ -566,19 +564,19 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"cbz x10, 29f\n"
"ld1w { z8.s }, p7/Z, [x10]\n"
"ld1w { z9.s }, p7/Z, [x10, #1, MUL VL]\n"
- "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
- "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
- "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
- "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
- "addvl x10, x10, #6\n"
"zip2 z14.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
"zip2 z15.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
"zip2 z16.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
"zip2 z17.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
+ "addvl x10, x10, #6\n"
"zip2 z18.d, z12.d, z12.d\n"
"zip1 z12.d, z12.d, z12.d\n"
"zip2 z19.d, z13.d, z13.d\n"
@@ -599,38 +597,38 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z22.s }, p6/Z, [x27]\n"
- "ld1w { z24.s }, p5/Z, [x27, #1, MUL VL]\n"
- "ld1w { z0.s }, p4/Z, [x27, #2, MUL VL]\n"
- "ld1w { z2.s }, p3/Z, [x27, #3, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x27, #4, MUL VL]\n"
"add x21, x27, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x27]\n"
+ "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x27, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
"ld1w { z14.s }, p6/Z, [x21]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
"ld1w { z15.s }, p5/Z, [x21, #1, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
"ld1w { z17.s }, p3/Z, [x21, #3, MUL VL]\n"
"ld1w { z18.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
"ld1w { z19.s }, p1/Z, [x21, #5, MUL VL]\n"
"ld1w { z21.s }, p6/Z, [x20]\n"
- "zip1 z8.d, z22.d, z14.d\n"
- "zip2 z14.d, z22.d, z14.d\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
"ld1w { z22.s }, p5/Z, [x20, #1, MUL VL]\n"
"ld1w { z23.s }, p4/Z, [x20, #2, MUL VL]\n"
- "zip1 z9.d, z24.d, z15.d\n"
- "zip2 z15.d, z24.d, z15.d\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
"ld1w { z24.s }, p3/Z, [x20, #3, MUL VL]\n"
"ld1w { z25.s }, p2/Z, [x20, #4, MUL VL]\n"
- "zip1 z10.d, z0.d, z16.d\n"
- "zip2 z16.d, z0.d, z16.d\n"
- "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n"
- "zip1 z11.d, z2.d, z17.d\n"
- "zip2 z17.d, z2.d, z17.d\n"
- "zip1 z12.d, z1.d, z18.d\n"
- "zip2 z18.d, z1.d, z18.d\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n"
"zip1 z20.d, z21.d, z26.d\n"
"zip2 z26.d, z21.d, z26.d\n"
"zip1 z21.d, z22.d, z27.d\n"
@@ -673,8 +671,8 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"mov x26, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -696,52 +694,52 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1h { z3.h }, p7/Z, [x28]\n"
- "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
- "sub x25, x25, #0x4\n"
- "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z1.h }, p7/Z, [x28, #3, MUL VL]\n"
- "cmp x25, #0x4\n"
"ld1rqw { z5.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqw { z0.s }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "add x22, x22, #0x10\n"
".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
"uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
"uzp1 z0.h, z0.h, z0.h\n"
".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
"trn1 z5.d, z5.d, z0.d\n"
"uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
- ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
- ".inst 0x6466e4a9 // bfmmla z9.s, z5.h, z6.h\n"
- ".inst 0x6461e4af // bfmmla z15.s, z5.h, z1.h\n"
".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "sub x25, x25, #0x4\n"
".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6466e495 // bfmmla z21.s, z4.h, z6.h\n"
- ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n"
+ "cmp x25, #0x4\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
- ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
- "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
- "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
"ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
- "addvl x28, x28, #-4\n"
".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
+ "addvl x28, x28, #-4\n"
".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
@@ -752,47 +750,47 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1h { z3.h }, p7/Z, [x28]\n"
- "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
- "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z1.h }, p7/Z, [x28, #3, MUL VL]\n"
"ld1rqw { z5.s }, p0/Z, [x24]\n"
"ld1rqw { z0.s }, p0/Z, [x23]\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
"uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
"uzp1 z0.h, z0.h, z0.h\n"
".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
"trn1 z5.d, z5.d, z0.d\n"
"uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
- ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
- ".inst 0x6466e4a9 // bfmmla z9.s, z5.h, z6.h\n"
- ".inst 0x6461e4af // bfmmla z15.s, z5.h, z1.h\n"
".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6466e495 // bfmmla z21.s, z4.h, z6.h\n"
- ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
- ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
- "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
- "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
"ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
- "addvl x28, x28, #-4\n"
".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
+ "addvl x28, x28, #-4\n"
".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
@@ -806,16 +804,16 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"cmp x26, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
"uzp1 z4.d, z8.d, z14.d\n"
"uzp2 z8.d, z8.d, z14.d\n"
"uzp1 z14.d, z9.d, z15.d\n"
"uzp2 z9.d, z9.d, z15.d\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp1 z15.d, z10.d, z16.d\n"
"uzp2 z10.d, z10.d, z16.d\n"
- "add x24, x27, x20, LSL #2\n"
"uzp1 z16.d, z11.d, z17.d\n"
"uzp2 z11.d, z11.d, z17.d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 z17.d, z12.d, z18.d\n"
"uzp2 z12.d, z12.d, z18.d\n"
"uzp1 z18.d, z13.d, z19.d\n"
@@ -827,9 +825,9 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"uzp1 z24.d, z24.d, z30.d\n"
"uzp1 z25.d, z25.d, z31.d\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p7/Z, [x21]\n"
"ld1rw { z19.s }, p7/Z, [x20]\n"
"fmin z4.s, p7/M, z4.s, z0.s\n"
"fmin z14.s, p7/M, z14.s, z0.s\n"
@@ -875,18 +873,18 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"st1w { z17.s }, p2, [x27, #4, MUL VL]\n"
"st1w { z18.s }, p1, [x27, #5, MUL VL]\n"
"addvl x27, x27, #6\n"
- "st1w { z8.s }, p6, [x24]\n"
- "st1w { z9.s }, p5, [x24, #1, MUL VL]\n"
- "st1w { z10.s }, p4, [x24, #2, MUL VL]\n"
- "st1w { z11.s }, p3, [x24, #3, MUL VL]\n"
- "st1w { z12.s }, p2, [x24, #4, MUL VL]\n"
- "st1w { z13.s }, p1, [x24, #5, MUL VL]\n"
- "st1w { z20.s }, p6, [x23]\n"
- "st1w { z21.s }, p5, [x23, #1, MUL VL]\n"
- "st1w { z22.s }, p4, [x23, #2, MUL VL]\n"
- "st1w { z23.s }, p3, [x23, #3, MUL VL]\n"
- "st1w { z24.s }, p2, [x23, #4, MUL VL]\n"
- "st1w { z25.s }, p1, [x23, #5, MUL VL]\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
+ "st1w { z20.s }, p6, [x22]\n"
+ "st1w { z21.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z25.s }, p1, [x22, #5, MUL VL]\n"
"39:" // Height 3: Writeback done
"decw x9, ALL, MUL #6\n"
"cmp x9, XZR\n"
@@ -894,13 +892,12 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"b 54f\n"
"40:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x10\n"
- "ldr x10, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x10, %x[bias]\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"41:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p6.s, x20, x9\n"
@@ -917,19 +914,19 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"cbz x10, 42f\n"
"ld1w { z8.s }, p7/Z, [x10]\n"
"ld1w { z9.s }, p7/Z, [x10, #1, MUL VL]\n"
- "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
- "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
- "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
- "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
- "addvl x10, x10, #6\n"
"zip2 z14.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
"zip2 z15.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
"zip2 z16.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
"zip2 z17.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
+ "addvl x10, x10, #6\n"
"zip2 z18.d, z12.d, z12.d\n"
"zip1 z12.d, z12.d, z12.d\n"
"zip2 z19.d, z13.d, z13.d\n"
@@ -950,51 +947,51 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z22.s }, p6/Z, [x27]\n"
- "ld1w { z24.s }, p5/Z, [x27, #1, MUL VL]\n"
- "ld1w { z26.s }, p4/Z, [x27, #2, MUL VL]\n"
- "ld1w { z27.s }, p3/Z, [x27, #3, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x27, #4, MUL VL]\n"
"add x22, x27, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x27]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x27, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
"ld1w { z14.s }, p6/Z, [x22]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
"ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
"ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
"ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
"ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
"ld1w { z21.s }, p6/Z, [x21]\n"
- "zip1 z8.d, z22.d, z14.d\n"
- "zip2 z14.d, z22.d, z14.d\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
"ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n"
"ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n"
- "zip1 z9.d, z24.d, z15.d\n"
- "zip2 z15.d, z24.d, z15.d\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
"ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n"
"ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n"
- "zip1 z10.d, z26.d, z16.d\n"
- "zip2 z16.d, z26.d, z16.d\n"
- "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n"
- "ld1w { z26.s }, p6/Z, [x20]\n"
- "zip1 z11.d, z27.d, z17.d\n"
- "zip2 z17.d, z27.d, z17.d\n"
- "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
- "zip1 z12.d, z29.d, z18.d\n"
- "zip2 z18.d, z29.d, z18.d\n"
- "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
- "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z26.s }, p6/Z, [x20]\n"
"zip1 z20.d, z21.d, z26.d\n"
"zip2 z26.d, z21.d, z26.d\n"
+ "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
"zip1 z21.d, z22.d, z27.d\n"
"zip2 z27.d, z22.d, z27.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
"zip1 z22.d, z23.d, z28.d\n"
"zip2 z28.d, z23.d, z28.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
"zip1 z23.d, z24.d, z29.d\n"
"zip2 z29.d, z24.d, z29.d\n"
"zip1 z24.d, z25.d, z30.d\n"
@@ -1031,8 +1028,8 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"mov x26, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1057,136 +1054,136 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1h { z3.h }, p7/Z, [x28]\n"
- "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
- "sub x25, x25, #0x4\n"
- "ld1h { z7.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z6.h }, p7/Z, [x28, #3, MUL VL]\n"
- "cmp x25, #0x4\n"
- "ld1rqw { z5.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1rqw { z0.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
"uzp1 z5.h, z5.h, z5.h\n"
- "uzp1 z1.h, z1.h, z1.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
"uzp1 z4.h, z4.h, z4.h\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "trn1 z5.d, z5.d, z1.d\n"
- "trn1 z4.d, z4.d, z0.d\n"
- ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
- ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
+ "sub x25, x25, #0x4\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
- ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
- ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6467e4a9 // bfmmla z9.s, z5.h, z7.h\n"
- ".inst 0x6467e495 // bfmmla z21.s, z4.h, z7.h\n"
- ".inst 0x6466e4af // bfmmla z15.s, z5.h, z6.h\n"
+ "cmp x25, #0x4\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
- ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
- ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
- ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
- ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
- ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
- ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
- ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
"addvl x28, x28, #-4\n"
- ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
- ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
- ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
- ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
- ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
- ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
- ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
- ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1h { z3.h }, p7/Z, [x28]\n"
- "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z6.h }, p7/Z, [x28, #3, MUL VL]\n"
- "ld1rqw { z5.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1rqw { z0.s }, p0/Z, [x21]\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
"uzp1 z5.h, z5.h, z5.h\n"
- "uzp1 z1.h, z1.h, z1.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
"uzp1 z4.h, z4.h, z4.h\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "trn1 z5.d, z5.d, z1.d\n"
- "trn1 z4.d, z4.d, z0.d\n"
- ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
- ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
- ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
- ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6467e4a9 // bfmmla z9.s, z5.h, z7.h\n"
- ".inst 0x6467e495 // bfmmla z21.s, z4.h, z7.h\n"
- ".inst 0x6466e4af // bfmmla z15.s, z5.h, z6.h\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
- ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
- ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
- ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
- ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
- ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
- ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
- ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
"ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
"ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
"ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
"ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
"addvl x28, x28, #-4\n"
- ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
- ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
- ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
- ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
- ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
- ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
- ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
- ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"50:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp1 z4.d, z8.d, z14.d\n"
"uzp2 z8.d, z8.d, z14.d\n"
"uzp1 z14.d, z9.d, z15.d\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z15.d\n"
"uzp1 z15.d, z10.d, z16.d\n"
"uzp2 z10.d, z10.d, z16.d\n"
- "add x24, x27, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 z16.d, z11.d, z17.d\n"
"uzp2 z11.d, z11.d, z17.d\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 z17.d, z12.d, z18.d\n"
"uzp2 z12.d, z12.d, z18.d\n"
"uzp1 z18.d, z13.d, z19.d\n"
@@ -1204,9 +1201,9 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"uzp1 z30.d, z25.d, z31.d\n"
"uzp2 z25.d, z25.d, z31.d\n"
"tbz %x[flags], #1, 51f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p7/Z, [x21]\n"
"ld1rw { z0.s }, p7/Z, [x20]\n"
"fmin z4.s, p7/M, z4.s, z1.s\n"
"fmin z14.s, p7/M, z14.s, z1.s\n"
@@ -1264,24 +1261,24 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"st1w { z17.s }, p2, [x27, #4, MUL VL]\n"
"st1w { z18.s }, p1, [x27, #5, MUL VL]\n"
"addvl x27, x27, #6\n"
- "st1w { z8.s }, p6, [x24]\n"
- "st1w { z9.s }, p5, [x24, #1, MUL VL]\n"
- "st1w { z10.s }, p4, [x24, #2, MUL VL]\n"
- "st1w { z11.s }, p3, [x24, #3, MUL VL]\n"
- "st1w { z12.s }, p2, [x24, #4, MUL VL]\n"
- "st1w { z13.s }, p1, [x24, #5, MUL VL]\n"
- "st1w { z19.s }, p6, [x23]\n"
- "st1w { z26.s }, p5, [x23, #1, MUL VL]\n"
- "st1w { z27.s }, p4, [x23, #2, MUL VL]\n"
- "st1w { z28.s }, p3, [x23, #3, MUL VL]\n"
- "st1w { z29.s }, p2, [x23, #4, MUL VL]\n"
- "st1w { z30.s }, p1, [x23, #5, MUL VL]\n"
- "st1w { z20.s }, p6, [x22]\n"
- "st1w { z21.s }, p5, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p4, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p3, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p2, [x22, #4, MUL VL]\n"
- "st1w { z25.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
+ "st1w { z19.s }, p6, [x22]\n"
+ "st1w { z26.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z27.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z28.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z29.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z30.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z20.s }, p6, [x21]\n"
+ "st1w { z21.s }, p5, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p4, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p3, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p2, [x21, #4, MUL VL]\n"
+ "st1w { z25.s }, p1, [x21, #5, MUL VL]\n"
"52:" // Height 4: Writeback done
"decw x9, ALL, MUL #6\n"
"cmp x9, XZR\n"
@@ -1298,8 +1295,8 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"54:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
index 823d839289..230c0b77d7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 8, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
index 11526b6f65..d31e68993e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
@@ -48,19 +48,18 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
- const float *bias = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -81,7 +80,6 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
ka.string_lengths = string_lengths;
ka.N = N;
ka.B_ptr = B_ptr;
- ka.bias = bias;
switch(act.type) {
default:
case Activation::Type::None:
@@ -105,10 +103,10 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -121,26 +119,26 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"cbz x12, 3f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"b 5f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 4f\n"
- "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
"ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z16.d, z12.d\n"
+ "zip2 z12.d, z16.d, z12.d\n"
"ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "zip1 z8.d, z19.d, z12.d\n"
- "zip2 z12.d, z19.d, z12.d\n"
"zip1 z9.d, z18.d, z13.d\n"
"zip2 z13.d, z18.d, z13.d\n"
"zip1 z10.d, z17.d, z14.d\n"
@@ -161,8 +159,8 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -178,52 +176,52 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
"ld1rqw { z18.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
"uzp1 z18.h, z18.h, z18.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "addvl x10, x10, #8\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
"ld1rqw { z18.s }, p0/Z, [x26]\n"
".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
"uzp1 z18.h, z18.h, z18.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
- "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "addvl x10, x10, #8\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -234,9 +232,9 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"uzp1 z10.d, z10.d, z14.d\n"
"uzp1 z11.d, z11.d, z15.d\n"
"tbz %x[flags], #1, 12f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
"fmin z8.s, p5/M, z8.s, z17.s\n"
"fmin z9.s, p5/M, z9.s, z17.s\n"
@@ -258,10 +256,10 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"15:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -274,34 +272,34 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"cbz x12, 16f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"b 18f\n"
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z19.s }, p4/Z, [x9]\n"
"ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x20]\n"
- "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z8.d, z19.d, z12.d\n"
"zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z9.d, z18.d, z13.d\n"
"zip2 z13.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z10.d, z17.d, z14.d\n"
"zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
@@ -320,8 +318,8 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -340,61 +338,61 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z19.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
- "ld1rqw { z18.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqw { z16.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqw { z19.s }, p0/Z, [x26]\n"
+ "ld1rqw { z18.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab673 // bfcvt z19.h, p5/M, z19.s\n"
".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
- ".inst 0x658ab610 // bfcvt z16.h, p5/M, z16.s\n"
+ "uzp1 z19.h, z19.h, z19.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
"uzp1 z18.h, z18.h, z18.h\n"
- "uzp1 z16.h, z16.h, z16.h\n"
- "trn1 z18.d, z18.d, z16.d\n"
- ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ "trn1 z19.d, z19.d, z18.d\n"
+ ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n"
"ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6473e64c // bfmmla z12.s, z18.h, z19.h\n"
+ ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6471e669 // bfmmla z9.s, z19.h, z17.h\n"
"ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6471e66a // bfmmla z10.s, z19.h, z17.h\n"
"ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n"
+ ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"addvl x10, x10, #8\n"
- ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
- ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1h { z17.h }, p5/Z, [x10]\n"
- "ld1h { z19.h }, p5/Z, [x10, #1, MUL VL]\n"
- "ld1rqw { z18.s }, p0/Z, [x26]\n"
- "ld1rqw { z16.s }, p0/Z, [x25]\n"
+ "ld1rqw { z19.s }, p0/Z, [x26]\n"
+ "ld1rqw { z18.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab673 // bfcvt z19.h, p5/M, z19.s\n"
".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
- ".inst 0x658ab610 // bfcvt z16.h, p5/M, z16.s\n"
+ "uzp1 z19.h, z19.h, z19.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
"uzp1 z18.h, z18.h, z18.h\n"
- "uzp1 z16.h, z16.h, z16.h\n"
- "trn1 z18.d, z18.d, z16.d\n"
- ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ "trn1 z19.d, z19.d, z18.d\n"
+ ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n"
"ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6473e64c // bfmmla z12.s, z18.h, z19.h\n"
+ ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6471e669 // bfmmla z9.s, z19.h, z17.h\n"
"ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6471e66a // bfmmla z10.s, z19.h, z17.h\n"
"ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n"
"ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
- ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n"
"24:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -403,17 +401,17 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z6.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
+ "add x25, x9, x20, LSL #2\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x26, x9, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p5/Z, [x21]\n"
"ld1rw { z16.s }, p5/Z, [x20]\n"
"fmin z6.s, p5/M, z6.s, z17.s\n"
"fmin z12.s, p5/M, z12.s, z17.s\n"
@@ -437,20 +435,20 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
"26:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"28:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -463,15 +461,15 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"cbz x12, 29f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -486,28 +484,28 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p4/Z, [x9]\n"
- "ld1w { z26.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x21, x9, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
- "zip1 z8.d, z24.d, z12.d\n"
- "zip2 z12.d, z24.d, z12.d\n"
- "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z9.d, z26.d, z13.d\n"
- "zip2 z13.d, z26.d, z13.d\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
@@ -538,8 +536,8 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -561,83 +559,83 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x10]\n"
- "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
- "ld1rqw { z27.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqw { z24.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqw { z28.s }, p0/Z, [x26]\n"
+ "ld1rqw { z27.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
"ld1rqw { z26.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
- ".inst 0x658ab718 // bfcvt z24.h, p5/M, z24.s\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"uzp1 z27.h, z27.h, z27.h\n"
".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
- "uzp1 z24.h, z24.h, z24.h\n"
- "trn1 z27.d, z27.d, z24.d\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
+ "trn1 z28.d, z28.d, z27.d\n"
"uzp1 z26.h, z26.h, z26.h\n"
- ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
+ "cmp x27, #0x4\n"
".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647ce76c // bfmmla z12.s, z27.h, z28.h\n"
- ".inst 0x647ce754 // bfmmla z20.s, z26.h, z28.h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
+ "add x25, x25, #0x10\n"
".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ "add x24, x24, #0x10\n"
".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
- ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1h { z25.h }, p5/Z, [x10]\n"
- "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "ld1rqw { z27.s }, p0/Z, [x26]\n"
- "ld1rqw { z24.s }, p0/Z, [x25]\n"
+ "ld1rqw { z28.s }, p0/Z, [x26]\n"
+ "ld1rqw { z27.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
"ld1rqw { z26.s }, p0/Z, [x24]\n"
".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
- ".inst 0x658ab718 // bfcvt z24.h, p5/M, z24.s\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"uzp1 z27.h, z27.h, z27.h\n"
- "uzp1 z24.h, z24.h, z24.h\n"
".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
- "trn1 z27.d, z27.d, z24.d\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "trn1 z28.d, z28.d, z27.d\n"
"uzp1 z26.h, z26.h, z26.h\n"
- ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
- ".inst 0x647ce76c // bfmmla z12.s, z27.h, z28.h\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647ce754 // bfmmla z20.s, z26.h, z28.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
- ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"37:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -645,24 +643,24 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"uzp1 z6.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x9, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x25, x26, x20, LSL #2\n"
"uzp1 z16.d, z16.d, z20.d\n"
"uzp1 z17.d, z17.d, z21.d\n"
"uzp1 z18.d, z18.d, z22.d\n"
"uzp1 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 38f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z25.s }, p5/Z, [x21]\n"
"ld1rw { z24.s }, p5/Z, [x20]\n"
"fmin z6.s, p5/M, z6.s, z25.s\n"
"fmin z12.s, p5/M, z12.s, z25.s\n"
@@ -694,24 +692,24 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x25]\n"
- "st1w { z17.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
"39:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"41:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -724,15 +722,15 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"cbz x12, 42f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -747,37 +745,37 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x22, x9, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x20]\n"
- "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
@@ -804,8 +802,8 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -830,110 +828,110 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1h { z29.h }, p5/Z, [x10]\n"
- "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
- "ld1rqw { z27.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqw { z25.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqw { z26.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqw { z24.s }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
+ "ld1rqw { z29.s }, p0/Z, [x26]\n"
+ "ld1rqw { z28.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab7bd // bfcvt z29.h, p5/M, z29.s\n"
+ "ld1rqw { z27.s }, p0/Z, [x24]\n"
+ "ld1rqw { z26.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
- ".inst 0x658ab739 // bfcvt z25.h, p5/M, z25.s\n"
".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
- ".inst 0x658ab718 // bfcvt z24.h, p5/M, z24.s\n"
+ "uzp1 z29.h, z29.h, z29.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
"uzp1 z27.h, z27.h, z27.h\n"
- "uzp1 z25.h, z25.h, z25.h\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"uzp1 z26.h, z26.h, z26.h\n"
- "uzp1 z24.h, z24.h, z24.h\n"
- "trn1 z27.d, z27.d, z25.d\n"
- "trn1 z26.d, z26.d, z24.d\n"
- ".inst 0x647de768 // bfmmla z8.s, z27.h, z29.h\n"
- ".inst 0x647de750 // bfmmla z16.s, z26.h, z29.h\n"
+ "trn1 z29.d, z29.d, z28.d\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ "add x26, x26, #0x10\n"
+ "trn1 z27.d, z27.d, z26.d\n"
+ ".inst 0x6479e770 // bfmmla z16.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647ce76c // bfmmla z12.s, z27.h, z28.h\n"
- ".inst 0x647ce754 // bfmmla z20.s, z26.h, z28.h\n"
+ ".inst 0x6478e774 // bfmmla z20.s, z27.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
- ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6479e771 // bfmmla z17.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
- ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6478e775 // bfmmla z21.s, z27.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
- ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6479e772 // bfmmla z18.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
- ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ ".inst 0x6478e776 // bfmmla z22.s, z27.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
- ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
- ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
- ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ ".inst 0x6479e773 // bfmmla z19.s, z27.h, z25.h\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e777 // bfmmla z23.s, z27.h, z24.h\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1h { z29.h }, p5/Z, [x10]\n"
- "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
- "ld1rqw { z27.s }, p0/Z, [x26]\n"
- "ld1rqw { z25.s }, p0/Z, [x25]\n"
- "ld1rqw { z26.s }, p0/Z, [x24]\n"
- "ld1rqw { z24.s }, p0/Z, [x23]\n"
+ "ld1rqw { z29.s }, p0/Z, [x26]\n"
+ "ld1rqw { z28.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab7bd // bfcvt z29.h, p5/M, z29.s\n"
+ "ld1rqw { z27.s }, p0/Z, [x24]\n"
+ "ld1rqw { z26.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
- ".inst 0x658ab739 // bfcvt z25.h, p5/M, z25.s\n"
".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
- ".inst 0x658ab718 // bfcvt z24.h, p5/M, z24.s\n"
+ "uzp1 z29.h, z29.h, z29.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
"uzp1 z27.h, z27.h, z27.h\n"
- "uzp1 z25.h, z25.h, z25.h\n"
"uzp1 z26.h, z26.h, z26.h\n"
- "uzp1 z24.h, z24.h, z24.h\n"
- "trn1 z27.d, z27.d, z25.d\n"
- "trn1 z26.d, z26.d, z24.d\n"
- ".inst 0x647de768 // bfmmla z8.s, z27.h, z29.h\n"
- ".inst 0x647de750 // bfmmla z16.s, z26.h, z29.h\n"
+ "trn1 z29.d, z29.d, z28.d\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ "trn1 z27.d, z27.d, z26.d\n"
+ ".inst 0x6479e770 // bfmmla z16.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647ce76c // bfmmla z12.s, z27.h, z28.h\n"
- ".inst 0x647ce754 // bfmmla z20.s, z26.h, z28.h\n"
+ ".inst 0x6478e774 // bfmmla z20.s, z27.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
- ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ ".inst 0x6479e771 // bfmmla z17.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
- ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ ".inst 0x6478e775 // bfmmla z21.s, z27.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
- ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e772 // bfmmla z18.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
"ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
- ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ ".inst 0x6478e776 // bfmmla z22.s, z27.h, z24.h\n"
"ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
- ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
- ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
- ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ ".inst 0x6479e773 // bfmmla z19.s, z27.h, z25.h\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e777 // bfmmla z23.s, z27.h, z24.h\n"
"50:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z6.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
@@ -943,9 +941,9 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 51f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z24.s }, p5/Z, [x21]\n"
"ld1rw { z23.s }, p5/Z, [x20]\n"
"fmin z6.s, p5/M, z6.s, z24.s\n"
"fmin z12.s, p5/M, z12.s, z24.s\n"
@@ -985,28 +983,28 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x25]\n"
- "st1w { z20.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x24]\n"
- "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
"52:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"54:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1019,15 +1017,15 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"cbz x12, 55f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -1050,46 +1048,46 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x21]\n"
- "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x20]\n"
- "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z19.d, z24.d, z23.d\n"
"zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z24.d, z25.d, z28.d\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
@@ -1128,8 +1126,8 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1157,115 +1155,115 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z5.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
- "ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqw { z0.s }, p0/Z, [x23]\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "ld1rqw { z6.s }, p0/Z, [x26]\n"
+ "ld1rqw { z5.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x24]\n"
+ "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x22]\n"
".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
"uzp1 z4.h, z4.h, z4.h\n"
- ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- "uzp1 z1.h, z1.h, z1.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
"uzp1 z3.h, z3.h, z3.h\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "trn1 z4.d, z4.d, z1.d\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "trn1 z6.d, z6.d, z5.d\n"
+ "trn1 z4.d, z4.d, z3.d\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ "add x25, x25, #0x10\n"
"uzp1 z2.h, z2.h, z2.h\n"
- "trn1 z3.d, z3.d, z0.d\n"
- ".inst 0x6466e488 // bfmmla z8.s, z4.h, z6.h\n"
- ".inst 0x6466e458 // bfmmla z24.s, z2.h, z6.h\n"
- ".inst 0x6465e48c // bfmmla z12.s, z4.h, z5.h\n"
- ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
- ".inst 0x6466e470 // bfmmla z16.s, z3.h, z6.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6465e474 // bfmmla z20.s, z3.h, z5.h\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6461e489 // bfmmla z9.s, z4.h, z1.h\n"
- ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6460e48d // bfmmla z13.s, z4.h, z0.h\n"
- ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6461e48a // bfmmla z10.s, z4.h, z1.h\n"
- ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6460e48e // bfmmla z14.s, z4.h, z0.h\n"
- ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6461e48b // bfmmla z11.s, z4.h, z1.h\n"
- ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
- ".inst 0x6460e48f // bfmmla z15.s, z4.h, z0.h\n"
- ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z5.h }, p5/Z, [x10, #1, MUL VL]\n"
- "ld1rqw { z4.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "ld1rqw { z0.s }, p0/Z, [x23]\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
+ "ld1rqw { z6.s }, p0/Z, [x26]\n"
+ "ld1rqw { z5.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x24]\n"
+ "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x22]\n"
".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
"uzp1 z4.h, z4.h, z4.h\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
"uzp1 z3.h, z3.h, z3.h\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "trn1 z4.d, z4.d, z1.d\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "trn1 z6.d, z6.d, z5.d\n"
+ "trn1 z4.d, z4.d, z3.d\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
"uzp1 z2.h, z2.h, z2.h\n"
- "trn1 z3.d, z3.d, z0.d\n"
- ".inst 0x6466e488 // bfmmla z8.s, z4.h, z6.h\n"
- ".inst 0x6465e48c // bfmmla z12.s, z4.h, z5.h\n"
- ".inst 0x6466e470 // bfmmla z16.s, z3.h, z6.h\n"
- ".inst 0x6466e458 // bfmmla z24.s, z2.h, z6.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6465e474 // bfmmla z20.s, z3.h, z5.h\n"
- ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6461e489 // bfmmla z9.s, z4.h, z1.h\n"
- ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6460e48d // bfmmla z13.s, z4.h, z0.h\n"
- ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6461e48a // bfmmla z10.s, z4.h, z1.h\n"
- ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
"ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6460e48e // bfmmla z14.s, z4.h, z0.h\n"
- ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6461e48b // bfmmla z11.s, z4.h, z1.h\n"
- ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
- ".inst 0x6460e48f // bfmmla z15.s, z4.h, z0.h\n"
- ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
"63:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1273,20 +1271,20 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 58b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z6.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
@@ -1298,9 +1296,9 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"uzp1 z26.d, z26.d, z30.d\n"
"uzp1 z27.d, z27.d, z31.d\n"
"tbz %x[flags], #1, 64f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x21]\n"
"ld1rw { z23.s }, p5/Z, [x20]\n"
"fmin z6.s, p5/M, z6.s, z0.s\n"
"fmin z12.s, p5/M, z12.s, z0.s\n"
@@ -1348,22 +1346,22 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x25]\n"
- "st1w { z20.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x24]\n"
- "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
"65:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1371,13 +1369,12 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"b 80f\n"
"66:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
- "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"67:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1390,15 +1387,15 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"cbz x12, 68f\n"
"ld1w { z8.s }, p5/Z, [x12]\n"
"ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"zip2 z12.d, z8.d, z8.d\n"
"zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"zip2 z13.d, z9.d, z9.d\n"
"zip1 z9.d, z9.d, z9.d\n"
"zip2 z14.d, z10.d, z10.d\n"
"zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
"zip2 z15.d, z11.d, z11.d\n"
"zip1 z11.d, z11.d, z11.d\n"
"mov z16.d, z8.d\n"
@@ -1421,54 +1418,54 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
"ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
"ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x21]\n"
- "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z28.s }, p4/Z, [x20]\n"
- "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
"zip2 z23.d, z24.d, z23.d\n"
"zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
@@ -1504,8 +1501,8 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"mov x28, #0x0\n"
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1536,146 +1533,146 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x4\n"
- "cmp x27, #0x4\n"
- "ld1rqw { z5.s }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- "ld1rqw { z0.s }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "ld1rqw { z7.s }, p0/Z, [x26]\n"
+ "ld1rqw { z6.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4e7 // bfcvt z7.h, p5/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
- "add x21, x21, #0x10\n"
".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
"uzp1 z5.h, z5.h, z5.h\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
"uzp1 z4.h, z4.h, z4.h\n"
"uzp1 z3.h, z3.h, z3.h\n"
- "uzp1 z1.h, z1.h, z1.h\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"uzp1 z2.h, z2.h, z2.h\n"
- "uzp1 z0.h, z0.h, z0.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ "add x24, x24, #0x10\n"
"trn1 z5.d, z5.d, z4.d\n"
- "trn1 z3.d, z3.d, z1.d\n"
- "trn1 z2.d, z2.d, z0.d\n"
- ".inst 0x6467e4a8 // bfmmla z8.s, z5.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6466e4ac // bfmmla z12.s, z5.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
+ "trn1 z3.d, z3.d, z2.d\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
"ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6466e45c // bfmmla z28.s, z2.h, z6.h\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
- ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
- ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
"ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6460e4ad // bfmmla z13.s, z5.h, z0.h\n"
- ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
- ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6461e4aa // bfmmla z10.s, z5.h, z1.h\n"
- ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
- ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
"ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6460e4ae // bfmmla z14.s, z5.h, z0.h\n"
- ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
- ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
- ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
- ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
- ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
- ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
- ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "ld1rqw { z5.s }, p0/Z, [x26]\n"
- "ld1rqw { z4.s }, p0/Z, [x25]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- "ld1rqw { z0.s }, p0/Z, [x21]\n"
+ "ld1rqw { z7.s }, p0/Z, [x26]\n"
+ "ld1rqw { z6.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4e7 // bfcvt z7.h, p5/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
"uzp1 z5.h, z5.h, z5.h\n"
"uzp1 z4.h, z4.h, z4.h\n"
"uzp1 z3.h, z3.h, z3.h\n"
- "uzp1 z1.h, z1.h, z1.h\n"
"uzp1 z2.h, z2.h, z2.h\n"
- "uzp1 z0.h, z0.h, z0.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
"trn1 z5.d, z5.d, z4.d\n"
- "trn1 z3.d, z3.d, z1.d\n"
- "trn1 z2.d, z2.d, z0.d\n"
- ".inst 0x6467e4a8 // bfmmla z8.s, z5.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6466e4ac // bfmmla z12.s, z5.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
+ "trn1 z3.d, z3.d, z2.d\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
"ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6466e45c // bfmmla z28.s, z2.h, z6.h\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
- ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
- ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
"ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6460e4ad // bfmmla z13.s, z5.h, z0.h\n"
- ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
- ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6461e4aa // bfmmla z10.s, z5.h, z1.h\n"
- ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
- ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
"ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6460e4ae // bfmmla z14.s, z5.h, z0.h\n"
- ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
- ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
"ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
- ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
- ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
- ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
- ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
- ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
"76:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 71b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"uzp1 z6.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x26, x9, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x23, x24, x20, LSL #2\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
@@ -1691,9 +1688,9 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"uzp1 z30.d, z27.d, z31.d\n"
"uzp2 z27.d, z27.d, z31.d\n"
"tbz %x[flags], #1, 77f\n"
- "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x21]\n"
"ld1rw { z0.s }, p5/Z, [x20]\n"
"fmin z6.s, p5/M, z6.s, z1.s\n"
"fmin z12.s, p5/M, z12.s, z1.s\n"
@@ -1749,26 +1746,26 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x26]\n"
- "st1w { z9.s }, p3, [x26, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x26, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x26, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x25]\n"
- "st1w { z20.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x25, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x24]\n"
- "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z23.s }, p4, [x23]\n"
- "st1w { z28.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z29.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z30.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x22]\n"
- "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z23.s }, p4, [x22]\n"
+ "st1w { z28.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z29.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
"78:" // Height 6: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1785,8 +1782,8 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
index a8e82516a1..dc008866a1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
@@ -70,7 +70,7 @@ public:
return false;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 4, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 4, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
index 7de5e09bd5..b06e0bd3c3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
@@ -45,18 +45,18 @@ void sve_hybrid_s8qa_dot_4x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -91,20 +91,20 @@ void sve_hybrid_s8qa_dot_4x4VL (
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"3:" // Height 1: setup done
"mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -120,41 +120,41 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ble 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z21.b }, p2/Z, [x28]\n"
- "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z23.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1b { z22.b }, p2/Z, [x28, #6, MUL VL]\n"
- "add x24, x24, #0x10\n"
- "sdot z16.s, z21.b, z0.b[0]\n"
- "ld1b { z21.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "sdot z17.s, z26.b, z0.b[0]\n"
- "sdot z18.s, z25.b, z0.b[0]\n"
- "sdot z19.s, z24.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z17.s, z21.b, z0.b[0]\n"
+ "sdot z18.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
"sdot z16.s, z20.b, z0.b[1]\n"
- "ld1b { z20.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z17.s, z23.b, z0.b[1]\n"
- "ld1b { z23.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z18.s, z22.b, z0.b[1]\n"
- "ld1b { z22.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z19.s, z21.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "sdot z17.s, z21.b, z0.b[1]\n"
+ "sdot z18.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ "sdot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "sdot z16.s, z22.b, z0.b[2]\n"
+ "sdot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[2]\n"
+ "sdot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-3, MUL VL]\n"
"ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z16.s, z20.b, z0.b[2]\n"
+ "sdot z16.s, z22.b, z0.b[3]\n"
+ "sdot z17.s, z20.b, z0.b[3]\n"
"ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z17.s, z26.b, z0.b[2]\n"
- "sdot z18.s, z25.b, z0.b[2]\n"
- "sdot z19.s, z24.b, z0.b[2]\n"
- "sdot z16.s, z23.b, z0.b[3]\n"
- "sdot z17.s, z22.b, z0.b[3]\n"
"sdot z18.s, z21.b, z0.b[3]\n"
"sdot z19.s, z20.b, z0.b[3]\n"
+ "add x24, x24, #0x10\n"
"tbnz %x[flags], #31, 8f\n"
"sdot z11.s, z0.b, z15.b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
@@ -163,49 +163,49 @@ void sve_hybrid_s8qa_dot_4x4VL (
"bgt 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
- "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z22.b }, p2/Z, [x28]\n"
"subs x25, x25, #0x4\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
"ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z16.s, z22.b, z0.b[0]\n"
+ "sdot z17.s, z20.b, z0.b[0]\n"
"ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "sdot z16.s, z23.b, z0.b[0]\n"
- "sdot z17.s, z22.b, z0.b[0]\n"
"sdot z18.s, z21.b, z0.b[0]\n"
"sdot z19.s, z20.b, z0.b[0]\n"
+ "addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
"ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "sdot z16.s, z20.b, z0.b[1]\n"
"ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z16.s, z23.b, z0.b[1]\n"
"sdot z17.s, z22.b, z0.b[1]\n"
"sdot z18.s, z21.b, z0.b[1]\n"
"sdot z19.s, z20.b, z0.b[1]\n"
+ "addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
"ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "sdot z16.s, z20.b, z0.b[2]\n"
"ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z16.s, z23.b, z0.b[2]\n"
"sdot z17.s, z22.b, z0.b[2]\n"
"sdot z18.s, z21.b, z0.b[2]\n"
"sdot z19.s, z20.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
- "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z0.b[3]\n"
+ "sdot z17.s, z20.b, z0.b[3]\n"
"ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z16.s, z23.b, z0.b[3]\n"
- "sdot z17.s, z22.b, z0.b[3]\n"
"sdot z18.s, z21.b, z0.b[3]\n"
"sdot z19.s, z20.b, z0.b[3]\n"
+ "addvl x28, x28, #4\n"
"10:" // Height 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -215,35 +215,35 @@ void sve_hybrid_s8qa_dot_4x4VL (
"cmp x26, x20\n"
"bne 4b\n"
"tbnz %x[flags], #31, 12f\n"
- "mov x21, #0x4\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z20.s, p2/M, z20.s\n"
"saddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
+ "neg z20.s, p2/M, z20.s\n"
"mul z11.s, p2/M, z11.s, z20.s\n"
"12:" // Height 1: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
"ld1w { z23.s }, p2/Z, [x10]\n"
- "ld1w { z20.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z22.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z16.s, z16.s, z23.s\n"
- "add z17.s, z17.s, z20.s\n"
+ "add z17.s, z17.s, z22.s\n"
+ "add z18.s, z18.s, z21.s\n"
+ "add z19.s, z19.s, z20.s\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- "addvl x10, x10, #4\n"
- "add z18.s, z18.s, z22.s\n"
- "add z19.s, z19.s, z21.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n"
".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n"
+ "addvl x10, x10, #4\n"
".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n"
".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n"
"tbz %x[flags], #5, 13f\n"
@@ -261,19 +261,19 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sqadd z19.s, z19.s, z20.s\n"
"13:" // Height 1: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "ld1rw { z22.s }, p2/Z, [x20]\n"
+ "add z16.s, z16.s, z20.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z20.s\n"
+ "add z18.s, z18.s, z20.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
"ld1rw { z21.s }, p2/Z, [x20]\n"
- "add z16.s, z16.s, z22.s\n"
+ "add z19.s, z19.s, z20.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z22.s\n"
- "add z18.s, z18.s, z22.s\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z22.s\n"
"smin z16.s, p2/M, z16.s, z21.s\n"
"smin z17.s, p2/M, z17.s, z21.s\n"
"smin z18.s, p2/M, z18.s, z21.s\n"
@@ -281,8 +281,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"smax z16.s, p2/M, z16.s, z20.s\n"
"smax z17.s, p2/M, z17.s, z20.s\n"
"smax z18.s, p2/M, z18.s, z20.s\n"
- "smax z19.s, p2/M, z19.s, z20.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z20.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
"st1b { z16.b }, p1, [x27]\n"
@@ -300,24 +300,24 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov z15.b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"16:" // Height 2: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"17:" // Height 2: setup done
"mov x26, #0x0\n"
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -336,45 +336,45 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ble 23f\n"
"21:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z25.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[0]\n"
+ "sdot z20.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z26.b, z0.b[0]\n"
+ "sdot z21.s, z26.b, z1.b[0]\n"
+ "sdot z18.s, z24.b, z0.b[0]\n"
+ "sdot z22.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "sdot z19.s, z25.b, z0.b[0]\n"
+ "sdot z23.s, z25.b, z1.b[0]\n"
"ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z16.s, z25.b, z0.b[0]\n"
- "sdot z20.s, z25.b, z1.b[0]\n"
"ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "sdot z17.s, z30.b, z0.b[0]\n"
- "sdot z21.s, z30.b, z1.b[0]\n"
- "sdot z18.s, z29.b, z0.b[0]\n"
- "sdot z22.s, z29.b, z1.b[0]\n"
- "sdot z19.s, z28.b, z0.b[0]\n"
- "sdot z23.s, z28.b, z1.b[0]\n"
"sdot z16.s, z24.b, z0.b[1]\n"
"sdot z20.s, z24.b, z1.b[1]\n"
"ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z17.s, z27.b, z0.b[1]\n"
"sdot z21.s, z27.b, z1.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
"ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z18.s, z26.b, z0.b[1]\n"
"sdot z22.s, z26.b, z1.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
"ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z19.s, z25.b, z0.b[1]\n"
"sdot z23.s, z25.b, z1.b[1]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
"ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z16.s, z24.b, z0.b[2]\n"
"sdot z20.s, z24.b, z1.b[2]\n"
"ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "add x23, x23, #0x10\n"
"sdot z17.s, z30.b, z0.b[2]\n"
"sdot z21.s, z30.b, z1.b[2]\n"
"sdot z18.s, z29.b, z0.b[2]\n"
@@ -398,34 +398,34 @@ void sve_hybrid_s8qa_dot_4x4VL (
"bgt 21b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z27.b }, p2/Z, [x28]\n"
- "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x25, x25, #0x4\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[0]\n"
+ "sdot z20.s, z24.b, z1.b[0]\n"
"ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1rqb { z1.b }, p0/Z, [x23]\n"
- "sdot z16.s, z27.b, z0.b[0]\n"
- "sdot z20.s, z27.b, z1.b[0]\n"
"sdot z17.s, z26.b, z0.b[0]\n"
"sdot z21.s, z26.b, z1.b[0]\n"
"sdot z18.s, z25.b, z0.b[0]\n"
"sdot z22.s, z25.b, z1.b[0]\n"
+ "addvl x28, x28, #4\n"
"sdot z19.s, z24.b, z0.b[0]\n"
"sdot z23.s, z24.b, z1.b[0]\n"
"ble 24f\n"
"ld1b { z27.b }, p2/Z, [x28]\n"
"ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "sdot z16.s, z27.b, z0.b[1]\n"
"ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z16.s, z27.b, z0.b[1]\n"
"sdot z20.s, z27.b, z1.b[1]\n"
"sdot z17.s, z26.b, z0.b[1]\n"
"sdot z21.s, z26.b, z1.b[1]\n"
"sdot z18.s, z25.b, z0.b[1]\n"
+ "addvl x28, x28, #4\n"
"sdot z22.s, z25.b, z1.b[1]\n"
"sdot z19.s, z24.b, z0.b[1]\n"
"sdot z23.s, z24.b, z1.b[1]\n"
@@ -433,29 +433,29 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1b { z27.b }, p2/Z, [x28]\n"
"ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "sdot z16.s, z27.b, z0.b[2]\n"
"ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z16.s, z27.b, z0.b[2]\n"
"sdot z20.s, z27.b, z1.b[2]\n"
"sdot z17.s, z26.b, z0.b[2]\n"
"sdot z21.s, z26.b, z1.b[2]\n"
"sdot z18.s, z25.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
"sdot z22.s, z25.b, z1.b[2]\n"
"sdot z19.s, z24.b, z0.b[2]\n"
"sdot z23.s, z24.b, z1.b[2]\n"
"ble 24f\n"
- "ld1b { z27.b }, p2/Z, [x28]\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
"ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[3]\n"
+ "sdot z20.s, z24.b, z1.b[3]\n"
"ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z16.s, z27.b, z0.b[3]\n"
- "sdot z20.s, z27.b, z1.b[3]\n"
"sdot z17.s, z26.b, z0.b[3]\n"
"sdot z21.s, z26.b, z1.b[3]\n"
"sdot z18.s, z25.b, z0.b[3]\n"
"sdot z22.s, z25.b, z1.b[3]\n"
+ "addvl x28, x28, #4\n"
"sdot z19.s, z24.b, z0.b[3]\n"
"sdot z23.s, z24.b, z1.b[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
@@ -468,18 +468,18 @@ void sve_hybrid_s8qa_dot_4x4VL (
"cmp x26, x20\n"
"bne 18b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x27, x20\n"
+ "add x23, x27, x20\n"
"tbnz %x[flags], #31, 26f\n"
- "mov x21, #0x4\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
"ld1rw { z24.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z24.s, p2/M, z24.s\n"
"saddv d11, p0, z11.s\n"
- "saddv d12, p0, z12.s\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z24.s\n"
+ "saddv d12, p0, z12.s\n"
+ "neg z24.s, p2/M, z24.s\n"
"mov z12.s, z12.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z24.s\n"
"mul z12.s, p2/M, z12.s, z24.s\n"
"26:" // Height 2: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
@@ -518,24 +518,24 @@ void sve_hybrid_s8qa_dot_4x4VL (
".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
"tbz %x[flags], #5, 27f\n"
"and z24.d, z16.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z24.s\n"
"and z30.d, z17.d, z0.d\n"
"and z29.d, z18.d, z0.d\n"
"and z28.d, z19.d, z0.d\n"
"and z27.d, z20.d, z0.d\n"
"and z26.d, z21.d, z0.d\n"
- "asr z24.s, z24.s, #0x1f\n"
"and z25.d, z22.d, z0.d\n"
+ "and z24.d, z23.d, z0.d\n"
"asr z30.s, z30.s, #0x1f\n"
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
"asr z27.s, z27.s, #0x1f\n"
- "sqadd z16.s, z16.s, z24.s\n"
- "and z24.d, z23.d, z0.d\n"
"asr z26.s, z26.s, #0x1f\n"
"asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
"sqadd z17.s, z17.s, z30.s\n"
"sqadd z18.s, z18.s, z29.s\n"
- "asr z24.s, z24.s, #0x1f\n"
"sqadd z19.s, z19.s, z28.s\n"
"sqadd z20.s, z20.s, z27.s\n"
"sqadd z21.s, z21.s, z26.s\n"
@@ -543,27 +543,27 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sqadd z23.s, z23.s, z24.s\n"
"27:" // Height 2: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "ld1rw { z26.s }, p2/Z, [x20]\n"
+ "add z16.s, z16.s, z24.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add z20.s, z20.s, z24.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z16.s, z16.s, z26.s\n"
+ "add z21.s, z21.s, z24.s\n"
+ "add z22.s, z22.s, z24.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z17.s, z17.s, z26.s\n"
- "add z18.s, z18.s, z26.s\n"
"ld1rw { z25.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z26.s\n"
- "add z20.s, z20.s, z26.s\n"
+ "add z23.s, z23.s, z24.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z21.s, z21.s, z26.s\n"
- "add z22.s, z22.s, z26.s\n"
"ld1rw { z24.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z26.s\n"
"smin z16.s, p2/M, z16.s, z25.s\n"
"smin z17.s, p2/M, z17.s, z25.s\n"
"smin z18.s, p2/M, z18.s, z25.s\n"
@@ -575,20 +575,20 @@ void sve_hybrid_s8qa_dot_4x4VL (
"smax z16.s, p2/M, z16.s, z24.s\n"
"smax z17.s, p2/M, z17.s, z24.s\n"
"smax z18.s, p2/M, z18.s, z24.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
"smax z19.s, p2/M, z19.s, z24.s\n"
"smax z20.s, p2/M, z20.s, z24.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
"smax z21.s, p2/M, z21.s, z24.s\n"
"smax z22.s, p2/M, z22.s, z24.s\n"
- "smax z23.s, p2/M, z23.s, z24.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z18.h, z18.h, z19.h\n"
"uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z17.h, z22.h, z23.h\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "uzp1 z20.b, z20.b, z17.b\n"
"st1b { z16.b }, p1, [x27]\n"
+ "smax z23.s, p2/M, z23.s, z24.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "st1b { z20.b }, p1, [x23]\n"
"addvl x27, x27, #1\n"
- "st1b { z20.b }, p1, [x24]\n"
"28:" // Height 2: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -603,16 +603,16 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov z15.b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"30:" // Height 3: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
@@ -623,8 +623,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov x26, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -646,57 +646,57 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ble 37f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z28.b, z0.b[0]\n"
+ "sdot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z28.b, z2.b[0]\n"
+ "sdot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z21.s, z30.b, z1.b[0]\n"
+ "sdot z25.s, z30.b, z2.b[0]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "sdot z18.s, z29.b, z0.b[0]\n"
+ "sdot z22.s, z29.b, z1.b[0]\n"
+ "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n"
- "add x22, x22, #0x10\n"
- "sdot z16.s, z5.b, z0.b[0]\n"
- "sdot z20.s, z5.b, z1.b[0]\n"
- "sdot z17.s, z29.b, z0.b[0]\n"
- "sdot z21.s, z29.b, z1.b[0]\n"
- "sdot z18.s, z4.b, z0.b[0]\n"
- "sdot z24.s, z5.b, z2.b[0]\n"
- "sdot z25.s, z29.b, z2.b[0]\n"
+ "sdot z26.s, z29.b, z2.b[0]\n"
+ "sdot z19.s, z28.b, z0.b[0]\n"
"ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "sdot z22.s, z4.b, z1.b[0]\n"
- "sdot z26.s, z4.b, z2.b[0]\n"
- "sdot z19.s, z28.b, z0.b[0]\n"
"sdot z23.s, z28.b, z1.b[0]\n"
"sdot z27.s, z28.b, z2.b[0]\n"
- "sdot z16.s, z3.b, z0.b[1]\n"
"ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n"
"ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "sdot z16.s, z3.b, z0.b[1]\n"
"sdot z20.s, z3.b, z1.b[1]\n"
- "sdot z24.s, z3.b, z2.b[1]\n"
"ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ "sdot z24.s, z3.b, z2.b[1]\n"
"sdot z17.s, z31.b, z0.b[1]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "add x22, x22, #0x10\n"
"sdot z21.s, z31.b, z1.b[1]\n"
"sdot z25.s, z31.b, z2.b[1]\n"
- "sdot z18.s, z30.b, z0.b[1]\n"
"ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "sdot z18.s, z30.b, z0.b[1]\n"
"sdot z22.s, z30.b, z1.b[1]\n"
"sdot z26.s, z30.b, z2.b[1]\n"
- "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z19.s, z29.b, z0.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z23.s, z29.b, z1.b[1]\n"
"sdot z27.s, z29.b, z2.b[1]\n"
- "sdot z16.s, z28.b, z0.b[2]\n"
"ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "sdot z16.s, z28.b, z0.b[2]\n"
"sdot z20.s, z28.b, z1.b[2]\n"
"sdot z24.s, z28.b, z2.b[2]\n"
- "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z17.s, z5.b, z0.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z21.s, z5.b, z1.b[2]\n"
"sdot z25.s, z5.b, z2.b[2]\n"
"sdot z18.s, z4.b, z0.b[2]\n"
@@ -727,23 +727,23 @@ void sve_hybrid_s8qa_dot_4x4VL (
"bgt 35b\n"
"37:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z31.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x25, x25, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
- "sdot z16.s, z31.b, z0.b[0]\n"
- "sdot z20.s, z31.b, z1.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z28.b, z0.b[0]\n"
+ "sdot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z28.b, z2.b[0]\n"
"sdot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z21.s, z30.b, z1.b[0]\n"
+ "sdot z25.s, z30.b, z2.b[0]\n"
+ "addvl x28, x28, #4\n"
"sdot z18.s, z29.b, z0.b[0]\n"
"sdot z22.s, z29.b, z1.b[0]\n"
- "sdot z24.s, z31.b, z2.b[0]\n"
- "sdot z25.s, z30.b, z2.b[0]\n"
"sdot z26.s, z29.b, z2.b[0]\n"
"sdot z19.s, z28.b, z0.b[0]\n"
"sdot z23.s, z28.b, z1.b[0]\n"
@@ -752,14 +752,14 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1b { z31.b }, p2/Z, [x28]\n"
"ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "sdot z16.s, z31.b, z0.b[1]\n"
"ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z16.s, z31.b, z0.b[1]\n"
"sdot z20.s, z31.b, z1.b[1]\n"
"sdot z24.s, z31.b, z2.b[1]\n"
"sdot z17.s, z30.b, z0.b[1]\n"
"sdot z21.s, z30.b, z1.b[1]\n"
+ "addvl x28, x28, #4\n"
"sdot z25.s, z30.b, z2.b[1]\n"
"sdot z18.s, z29.b, z0.b[1]\n"
"sdot z22.s, z29.b, z1.b[1]\n"
@@ -771,14 +771,14 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1b { z31.b }, p2/Z, [x28]\n"
"ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "sdot z16.s, z31.b, z0.b[2]\n"
"ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z16.s, z31.b, z0.b[2]\n"
"sdot z20.s, z31.b, z1.b[2]\n"
"sdot z24.s, z31.b, z2.b[2]\n"
"sdot z17.s, z30.b, z0.b[2]\n"
"sdot z21.s, z30.b, z1.b[2]\n"
+ "addvl x28, x28, #4\n"
"sdot z25.s, z30.b, z2.b[2]\n"
"sdot z18.s, z29.b, z0.b[2]\n"
"sdot z22.s, z29.b, z1.b[2]\n"
@@ -789,15 +789,15 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ble 38f\n"
"ld1b { z31.b }, p2/Z, [x28]\n"
"ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
"sdot z16.s, z31.b, z0.b[3]\n"
"sdot z20.s, z31.b, z1.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z24.s, z31.b, z2.b[3]\n"
"sdot z17.s, z30.b, z0.b[3]\n"
"sdot z21.s, z30.b, z1.b[3]\n"
"sdot z25.s, z30.b, z2.b[3]\n"
+ "addvl x28, x28, #4\n"
"sdot z18.s, z29.b, z0.b[3]\n"
"sdot z22.s, z29.b, z1.b[3]\n"
"sdot z26.s, z29.b, z2.b[3]\n"
@@ -815,22 +815,22 @@ void sve_hybrid_s8qa_dot_4x4VL (
"cmp x26, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"tbnz %x[flags], #31, 40f\n"
- "mov x21, #0x4\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
"ld1rw { z28.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z28.s, p2/M, z28.s\n"
"saddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
"saddv d12, p0, z12.s\n"
"saddv d13, p0, z13.s\n"
- "mov z11.s, z11.s[0]\n"
"mov z12.s, z12.s[0]\n"
+ "mov z13.s, z13.s[0]\n"
+ "neg z28.s, p2/M, z28.s\n"
"mul z11.s, p2/M, z11.s, z28.s\n"
"mul z12.s, p2/M, z12.s, z28.s\n"
- "mov z13.s, z13.s[0]\n"
"mul z13.s, p2/M, z13.s, z28.s\n"
"40:" // Height 3: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
@@ -885,18 +885,18 @@ void sve_hybrid_s8qa_dot_4x4VL (
"and z30.d, z18.d, z0.d\n"
"and z29.d, z19.d, z0.d\n"
"and z28.d, z20.d, z0.d\n"
- "and z3.d, z21.d, z0.d\n"
"asr z1.s, z1.s, #0x1f\n"
"asr z31.s, z31.s, #0x1f\n"
"asr z30.s, z30.s, #0x1f\n"
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
- "and z2.d, z22.d, z0.d\n"
"sqadd z16.s, z16.s, z1.s\n"
"sqadd z17.s, z17.s, z31.s\n"
"sqadd z18.s, z18.s, z30.s\n"
"sqadd z19.s, z19.s, z29.s\n"
"sqadd z20.s, z20.s, z28.s\n"
+ "and z3.d, z21.d, z0.d\n"
+ "and z2.d, z22.d, z0.d\n"
"and z1.d, z23.d, z0.d\n"
"and z31.d, z24.d, z0.d\n"
"and z30.d, z25.d, z0.d\n"
@@ -918,35 +918,35 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sqadd z27.s, z27.s, z28.s\n"
"41:" // Height 3: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "ld1rw { z30.s }, p2/Z, [x20]\n"
+ "add z16.s, z16.s, z28.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z28.s\n"
+ "add z18.s, z18.s, z28.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z20.s, z20.s, z28.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z16.s, z16.s, z30.s\n"
+ "add z21.s, z21.s, z28.s\n"
+ "add z22.s, z22.s, z28.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z30.s\n"
- "add z18.s, z18.s, z30.s\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z24.s, z24.s, z28.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z19.s, z19.s, z30.s\n"
- "add z20.s, z20.s, z30.s\n"
+ "add z25.s, z25.s, z28.s\n"
+ "add z26.s, z26.s, z28.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z21.s, z21.s, z30.s\n"
- "add z22.s, z22.s, z30.s\n"
"ld1rw { z29.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z30.s\n"
- "add z24.s, z24.s, z30.s\n"
+ "add z27.s, z27.s, z28.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z30.s\n"
- "add z26.s, z26.s, z30.s\n"
"ld1rw { z28.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z30.s\n"
"smin z16.s, p2/M, z16.s, z29.s\n"
"smin z17.s, p2/M, z17.s, z29.s\n"
"smin z18.s, p2/M, z18.s, z29.s\n"
@@ -962,28 +962,28 @@ void sve_hybrid_s8qa_dot_4x4VL (
"smax z16.s, p2/M, z16.s, z28.s\n"
"smax z17.s, p2/M, z17.s, z28.s\n"
"smax z18.s, p2/M, z18.s, z28.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
"smax z19.s, p2/M, z19.s, z28.s\n"
"smax z20.s, p2/M, z20.s, z28.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
"smax z21.s, p2/M, z21.s, z28.s\n"
"smax z22.s, p2/M, z22.s, z28.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "st1b { z16.b }, p1, [x27]\n"
"smax z23.s, p2/M, z23.s, z28.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
"smax z24.s, p2/M, z24.s, z28.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
"smax z25.s, p2/M, z25.s, z28.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
"smax z26.s, p2/M, z26.s, z28.s\n"
- "smax z27.s, p2/M, z27.s, z28.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z18.h, z22.h, z23.h\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "uzp1 z20.b, z20.b, z18.b\n"
- "st1b { z16.b }, p1, [x27]\n"
+ "st1b { z20.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z28.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x22]\n"
"addvl x27, x27, #1\n"
- "uzp1 z24.b, z24.b, z17.b\n"
- "st1b { z20.b }, p1, [x24]\n"
- "st1b { z24.b }, p1, [x23]\n"
"42:" // Height 3: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -991,28 +991,27 @@ void sve_hybrid_s8qa_dot_4x4VL (
"b 58f\n"
"43:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x4\n"
"mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov z13.s, #0x0\n"
"mov z14.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
"mov z15.b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"44:" // Height 4: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
@@ -1027,8 +1026,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov x26, #0x0\n"
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1053,37 +1052,37 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ble 51f\n"
"49:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
"ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
"sdot z16.s, z5.b, z0.b[0]\n"
"sdot z20.s, z5.b, z1.b[0]\n"
- "sdot z17.s, z10.b, z0.b[0]\n"
- "sdot z21.s, z10.b, z1.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z24.s, z5.b, z2.b[0]\n"
"sdot z28.s, z5.b, z3.b[0]\n"
+ "sdot z17.s, z4.b, z0.b[0]\n"
+ "sdot z21.s, z4.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "sdot z25.s, z4.b, z2.b[0]\n"
+ "sdot z29.s, z4.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
"addvl x28, x28, #16\n"
- "sdot z25.s, z10.b, z2.b[0]\n"
- "sdot z29.s, z10.b, z3.b[0]\n"
- "sdot z18.s, z4.b, z0.b[0]\n"
- "sdot z22.s, z4.b, z1.b[0]\n"
- "sdot z26.s, z4.b, z2.b[0]\n"
- "sdot z30.s, z4.b, z3.b[0]\n"
"ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
"ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "add x21, x21, #0x10\n"
"sdot z19.s, z9.b, z0.b[0]\n"
"sdot z23.s, z9.b, z1.b[0]\n"
"sdot z27.s, z9.b, z2.b[0]\n"
@@ -1153,26 +1152,26 @@ void sve_hybrid_s8qa_dot_4x4VL (
"bgt 49b\n"
"51:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z7.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x25, x25, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
"ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
"sdot z16.s, z7.b, z0.b[0]\n"
"sdot z20.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z6.b, z0.b[0]\n"
- "sdot z21.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z5.b, z0.b[0]\n"
- "sdot z22.s, z5.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z24.s, z7.b, z2.b[0]\n"
"sdot z28.s, z7.b, z3.b[0]\n"
+ "sdot z17.s, z6.b, z0.b[0]\n"
+ "sdot z21.s, z6.b, z1.b[0]\n"
+ "addvl x28, x28, #4\n"
"sdot z25.s, z6.b, z2.b[0]\n"
"sdot z29.s, z6.b, z3.b[0]\n"
+ "sdot z18.s, z5.b, z0.b[0]\n"
+ "sdot z22.s, z5.b, z1.b[0]\n"
"sdot z26.s, z5.b, z2.b[0]\n"
"sdot z30.s, z5.b, z3.b[0]\n"
"sdot z19.s, z4.b, z0.b[0]\n"
@@ -1183,14 +1182,14 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1b { z7.b }, p2/Z, [x28]\n"
"ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "sdot z16.s, z7.b, z0.b[1]\n"
"ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z16.s, z7.b, z0.b[1]\n"
"sdot z20.s, z7.b, z1.b[1]\n"
"sdot z24.s, z7.b, z2.b[1]\n"
"sdot z28.s, z7.b, z3.b[1]\n"
"sdot z17.s, z6.b, z0.b[1]\n"
+ "addvl x28, x28, #4\n"
"sdot z21.s, z6.b, z1.b[1]\n"
"sdot z25.s, z6.b, z2.b[1]\n"
"sdot z29.s, z6.b, z3.b[1]\n"
@@ -1206,14 +1205,14 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1b { z7.b }, p2/Z, [x28]\n"
"ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
"ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z16.s, z7.b, z0.b[2]\n"
"sdot z20.s, z7.b, z1.b[2]\n"
"sdot z24.s, z7.b, z2.b[2]\n"
"sdot z28.s, z7.b, z3.b[2]\n"
"sdot z17.s, z6.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
"sdot z21.s, z6.b, z1.b[2]\n"
"sdot z25.s, z6.b, z2.b[2]\n"
"sdot z29.s, z6.b, z3.b[2]\n"
@@ -1228,15 +1227,15 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ble 52f\n"
"ld1b { z7.b }, p2/Z, [x28]\n"
"ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
"sdot z16.s, z7.b, z0.b[3]\n"
"sdot z20.s, z7.b, z1.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z24.s, z7.b, z2.b[3]\n"
"sdot z28.s, z7.b, z3.b[3]\n"
"sdot z17.s, z6.b, z0.b[3]\n"
"sdot z21.s, z6.b, z1.b[3]\n"
+ "addvl x28, x28, #4\n"
"sdot z25.s, z6.b, z2.b[3]\n"
"sdot z29.s, z6.b, z3.b[3]\n"
"sdot z18.s, z5.b, z0.b[3]\n"
@@ -1259,25 +1258,25 @@ void sve_hybrid_s8qa_dot_4x4VL (
"cmp x26, x20\n"
"bne 46b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
+ "add x23, x27, x20\n"
"add x22, x23, x20\n"
+ "add x21, x22, x20\n"
"tbnz %x[flags], #31, 54f\n"
- "mov x21, #0x4\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z0.s, p2/M, z0.s\n"
"saddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
"saddv d12, p0, z12.s\n"
"saddv d13, p0, z13.s\n"
- "saddv d14, p0, z14.s\n"
- "mov z11.s, z11.s[0]\n"
"mov z12.s, z12.s[0]\n"
- "mul z11.s, p2/M, z11.s, z0.s\n"
- "mul z12.s, p2/M, z12.s, z0.s\n"
"mov z13.s, z13.s[0]\n"
+ "saddv d14, p0, z14.s\n"
+ "neg z0.s, p2/M, z0.s\n"
"mov z14.s, z14.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
+ "mul z12.s, p2/M, z12.s, z0.s\n"
"mul z13.s, p2/M, z13.s, z0.s\n"
"mul z14.s, p2/M, z14.s, z0.s\n"
"54:" // Height 4: skip row sum fixup
@@ -1342,32 +1341,32 @@ void sve_hybrid_s8qa_dot_4x4VL (
"tbz %x[flags], #5, 55f\n"
"and z2.d, z16.d, z0.d\n"
"and z1.d, z17.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z2.s\n"
+ "sqadd z17.s, z17.s, z1.s\n"
"and z7.d, z18.d, z0.d\n"
"and z6.d, z19.d, z0.d\n"
"and z5.d, z20.d, z0.d\n"
"and z4.d, z21.d, z0.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
"and z3.d, z22.d, z0.d\n"
+ "and z2.d, z23.d, z0.d\n"
+ "and z1.d, z24.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z16.s, z16.s, z2.s\n"
- "sqadd z17.s, z17.s, z1.s\n"
- "and z2.d, z23.d, z0.d\n"
- "and z1.d, z24.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"asr z3.s, z3.s, #0x1f\n"
- "sqadd z18.s, z18.s, z7.s\n"
- "sqadd z19.s, z19.s, z6.s\n"
"asr z2.s, z2.s, #0x1f\n"
"asr z1.s, z1.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z7.s\n"
+ "sqadd z19.s, z19.s, z6.s\n"
"sqadd z20.s, z20.s, z5.s\n"
"sqadd z21.s, z21.s, z4.s\n"
"sqadd z22.s, z22.s, z3.s\n"
- "and z7.d, z25.d, z0.d\n"
"sqadd z23.s, z23.s, z2.s\n"
"sqadd z24.s, z24.s, z1.s\n"
+ "and z7.d, z25.d, z0.d\n"
"and z6.d, z26.d, z0.d\n"
"and z5.d, z27.d, z0.d\n"
"and z4.d, z28.d, z0.d\n"
@@ -1390,43 +1389,43 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sqadd z31.s, z31.s, z1.s\n"
"55:" // Height 4: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
"ld1rw { z2.s }, p2/Z, [x20]\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z16.s, z16.s, z2.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z20.s, z20.s, z2.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z16.s, z16.s, z2.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z2.s\n"
- "add z18.s, z18.s, z2.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z19.s, z19.s, z2.s\n"
- "add z20.s, z20.s, z2.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z2.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
- "add z23.s, z23.s, z2.s\n"
- "add z24.s, z24.s, z2.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z2.s\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z25.s, z25.s, z2.s\n"
- "add z26.s, z26.s, z2.s\n"
"ld1rw { z1.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z2.s\n"
- "add z28.s, z28.s, z2.s\n"
+ "add z31.s, z31.s, z2.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z29.s, z29.s, z2.s\n"
- "add z30.s, z30.s, z2.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z31.s, z31.s, z2.s\n"
"smin z16.s, p2/M, z16.s, z1.s\n"
"smin z17.s, p2/M, z17.s, z1.s\n"
"smin z18.s, p2/M, z18.s, z1.s\n"
@@ -1446,36 +1445,36 @@ void sve_hybrid_s8qa_dot_4x4VL (
"smax z16.s, p2/M, z16.s, z0.s\n"
"smax z17.s, p2/M, z17.s, z0.s\n"
"smax z18.s, p2/M, z18.s, z0.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
"smax z19.s, p2/M, z19.s, z0.s\n"
"smax z20.s, p2/M, z20.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
"smax z21.s, p2/M, z21.s, z0.s\n"
"smax z22.s, p2/M, z22.s, z0.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "st1b { z16.b }, p1, [x27]\n"
"smax z23.s, p2/M, z23.s, z0.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
"smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
"smax z25.s, p2/M, z25.s, z0.s\n"
- "uzp1 z18.h, z18.h, z19.h\n"
"smax z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z20.b }, p1, [x23]\n"
"smax z27.s, p2/M, z27.s, z0.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
"smax z28.s, p2/M, z28.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"smax z29.s, p2/M, z29.s, z0.s\n"
- "uzp1 z17.h, z22.h, z23.h\n"
"smax z30.s, p2/M, z30.s, z0.s\n"
- "smax z31.s, p2/M, z31.s, z0.s\n"
- "uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "uzp1 z18.h, z26.h, z27.h\n"
"uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z20.b, z20.b, z17.b\n"
- "uzp1 z17.h, z30.h, z31.h\n"
- "st1b { z16.b }, p1, [x27]\n"
+ "st1b { z24.b }, p1, [x22]\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "uzp1 z16.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z16.b\n"
+ "st1b { z28.b }, p1, [x21]\n"
"addvl x27, x27, #1\n"
- "uzp1 z24.b, z24.b, z18.b\n"
- "uzp1 z28.b, z28.b, z17.b\n"
- "st1b { z20.b }, p1, [x24]\n"
- "st1b { z24.b }, p1, [x23]\n"
- "st1b { z28.b }, p1, [x22]\n"
"56:" // Height 4: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -1492,8 +1491,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
index 96550f4839..4a57f89880 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
@@ -70,7 +70,7 @@ public:
return false;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 4, 8, 8> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 8, 8> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
index b8e65e6999..0d5ea54cb7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
@@ -45,18 +45,18 @@ void sve_hybrid_s8qa_mmla_4x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -91,24 +91,24 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"3:" // Height 1: setup done
"mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -124,43 +124,43 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"ble 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z30.b }, p2/Z, [x28]\n"
- "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n"
- "add x24, x24, #0x10\n"
- "trn1 z0.d, z1.d, z31.d\n"
- ".inst 0x451e9810 // smmla z16.s, z0.b, z30.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199814 // smmla z20.s, z0.b, z25.b\n"
+ ".inst 0x45189811 // smmla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x451a9815 // smmla z21.s, z0.b, z26.b\n"
+ ".inst 0x45199812 // smmla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "trn2 z1.d, z1.d, z31.d\n"
- ".inst 0x451d9814 // smmla z20.s, z0.b, z29.b\n"
- ".inst 0x451c9811 // smmla z17.s, z0.b, z28.b\n"
- ".inst 0x451b9815 // smmla z21.s, z0.b, z27.b\n"
- ".inst 0x451a9812 // smmla z18.s, z0.b, z26.b\n"
- "ld1b { z31.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45199816 // smmla z22.s, z0.b, z25.b\n"
- ".inst 0x45189813 // smmla z19.s, z0.b, z24.b\n"
- "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x45089817 // smmla z23.s, z0.b, z8.b\n"
- "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45189816 // smmla z22.s, z0.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x451a9813 // smmla z19.s, z0.b, z26.b\n"
+ ".inst 0x45199817 // smmla z23.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x451a9834 // smmla z20.s, z1.b, z26.b\n"
"ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x451f9830 // smmla z16.s, z1.b, z31.b\n"
"ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x451e9834 // smmla z20.s, z1.b, z30.b\n"
- ".inst 0x451d9831 // smmla z17.s, z1.b, z29.b\n"
- ".inst 0x451c9835 // smmla z21.s, z1.b, z28.b\n"
".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n"
".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n"
".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
+ "add x24, x24, #0x10\n"
"tbnz %x[flags], #31, 8f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z11.s, z1.b, z15.b\n"
@@ -170,45 +170,45 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"bgt 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
"ld1b { z24.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x8\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x451a9814 // smmla z20.s, z0.b, z26.b\n"
"ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
"ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ ".inst 0x45199811 // smmla z17.s, z0.b, z25.b\n"
+ ".inst 0x45189815 // smmla z21.s, z0.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
- "trn1 z0.d, z1.d, z31.d\n"
- ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
"ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #8\n"
- "trn2 z1.d, z1.d, z31.d\n"
- ".inst 0x451e9814 // smmla z20.s, z0.b, z30.b\n"
- ".inst 0x451d9811 // smmla z17.s, z0.b, z29.b\n"
- ".inst 0x451c9815 // smmla z21.s, z0.b, z28.b\n"
".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n"
".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n"
".inst 0x45199813 // smmla z19.s, z0.b, z25.b\n"
".inst 0x45189817 // smmla z23.s, z0.b, z24.b\n"
+ "addvl x28, x28, #8\n"
"ble 10f\n"
"ld1b { z24.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45189834 // smmla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199832 // smmla z18.s, z1.b, z25.b\n"
+ ".inst 0x45189836 // smmla z22.s, z1.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x451e9834 // smmla z20.s, z1.b, z30.b\n"
- "addvl x28, x28, #8\n"
- ".inst 0x451d9831 // smmla z17.s, z1.b, z29.b\n"
- ".inst 0x451c9835 // smmla z21.s, z1.b, z28.b\n"
- ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n"
- ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n"
".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
+ "addvl x28, x28, #8\n"
"10:" // Height 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -225,32 +225,32 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"mov z23.d, z16.d\n"
"tbnz %x[flags], #31, 12f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
- "ld1rw { z9.s }, p2/Z, [x20]\n"
- "neg z9.s, p2/M, z9.s\n"
+ "neg z16.s, p2/M, z16.s\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z9.s\n"
+ "mul z11.s, p2/M, z11.s, z16.s\n"
"12:" // Height 1: skip row sum fixup
"add z23.s, z23.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
"ld1w { z22.s }, p2/Z, [x10]\n"
- "ld1w { z24.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x10, #3, MUL VL]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z23.s, z23.s, z22.s\n"
- "add z17.s, z17.s, z24.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z20.s\n"
+ "add z19.s, z19.s, z16.s\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- "addvl x10, x10, #4\n"
- "add z18.s, z18.s, z21.s\n"
- "add z19.s, z19.s, z20.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n"
+ "addvl x10, x10, #4\n"
".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
"tbz %x[flags], #5, 13f\n"
@@ -268,19 +268,19 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"sqadd z19.s, z19.s, z16.s\n"
"13:" // Height 1: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z16.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z21.s\n"
+ "add z19.s, z19.s, z16.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z21.s\n"
- "add z18.s, z18.s, z21.s\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z21.s\n"
"smin z23.s, p2/M, z23.s, z20.s\n"
"smin z17.s, p2/M, z17.s, z20.s\n"
"smin z18.s, p2/M, z18.s, z20.s\n"
@@ -288,8 +288,8 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"smax z23.s, p2/M, z23.s, z16.s\n"
"smax z17.s, p2/M, z17.s, z16.s\n"
"smax z18.s, p2/M, z18.s, z16.s\n"
- "smax z19.s, p2/M, z19.s, z16.s\n"
"uzp1 z23.h, z23.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z16.s\n"
"uzp1 z16.h, z18.h, z19.h\n"
"uzp1 z23.b, z23.b, z16.b\n"
"st1b { z23.b }, p1, [x27]\n"
@@ -307,24 +307,24 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"mov z15.b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"16:" // Height 2: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"17:" // Height 2: setup done
"mov x26, #0x0\n"
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -343,45 +343,45 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"ble 23f\n"
"21:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z31.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z25.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n"
- "trn1 z0.d, z1.d, z25.d\n"
- "trn2 z1.d, z1.d, z25.d\n"
- ".inst 0x451f9810 // smmla z16.s, z0.b, z31.b\n"
+ "ld1rqb { z26.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199814 // smmla z20.s, z0.b, z25.b\n"
+ ".inst 0x45189811 // smmla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x451a9815 // smmla z21.s, z0.b, z26.b\n"
+ ".inst 0x45199812 // smmla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x451e9814 // smmla z20.s, z0.b, z30.b\n"
- ".inst 0x451d9811 // smmla z17.s, z0.b, z29.b\n"
- ".inst 0x451c9815 // smmla z21.s, z0.b, z28.b\n"
- ".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n"
- ".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n"
- ".inst 0x45189813 // smmla z19.s, z0.b, z24.b\n"
+ ".inst 0x45189816 // smmla z22.s, z0.b, z24.b\n"
"ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x451a9813 // smmla z19.s, z0.b, z26.b\n"
".inst 0x45199817 // smmla z23.s, z0.b, z25.b\n"
- "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x451a9834 // smmla z20.s, z1.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
"ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x451e9834 // smmla z20.s, z1.b, z30.b\n"
- ".inst 0x451d9831 // smmla z17.s, z1.b, z29.b\n"
- ".inst 0x451c9835 // smmla z21.s, z1.b, z28.b\n"
".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n"
".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n"
".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"tbnz %x[flags], #31, 22f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z11.s, z1.b, z15.b\n"
@@ -391,46 +391,46 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"bgt 21b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z29.b }, p2/Z, [x28]\n"
- "ld1b { z28.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x8\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x451a9814 // smmla z20.s, z0.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199811 // smmla z17.s, z0.b, z25.b\n"
+ ".inst 0x45189815 // smmla z21.s, z0.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
- "trn1 z0.d, z1.d, z24.d\n"
- "trn2 z1.d, z1.d, z24.d\n"
- ".inst 0x451d9810 // smmla z16.s, z0.b, z29.b\n"
"ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #8\n"
- ".inst 0x451c9814 // smmla z20.s, z0.b, z28.b\n"
- ".inst 0x45049811 // smmla z17.s, z0.b, z4.b\n"
- ".inst 0x451b9815 // smmla z21.s, z0.b, z27.b\n"
- ".inst 0x451a9812 // smmla z18.s, z0.b, z26.b\n"
- ".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n"
+ ".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n"
+ ".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n"
".inst 0x45199813 // smmla z19.s, z0.b, z25.b\n"
".inst 0x45189817 // smmla z23.s, z0.b, z24.b\n"
+ "addvl x28, x28, #8\n"
"ble 24f\n"
"ld1b { z24.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45189834 // smmla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199832 // smmla z18.s, z1.b, z25.b\n"
+ ".inst 0x45189836 // smmla z22.s, z1.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x451e9834 // smmla z20.s, z1.b, z30.b\n"
- "addvl x28, x28, #8\n"
- ".inst 0x451d9831 // smmla z17.s, z1.b, z29.b\n"
- ".inst 0x451c9835 // smmla z21.s, z1.b, z28.b\n"
- ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n"
- ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n"
".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
+ "addvl x28, x28, #8\n"
"24:" // Height 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 25f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -443,18 +443,18 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"uzp1 z24.d, z16.d, z20.d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp2 z16.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "add x23, x27, x20\n"
"mov z23.d, z24.d\n"
"tbnz %x[flags], #31, 26f\n"
"add x20, %x[qp], %[b_offset]\n"
- ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
"ld1rw { z24.s }, p2/Z, [x20]\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
"neg z24.s, p2/M, z24.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
@@ -497,24 +497,24 @@ void sve_hybrid_s8qa_mmla_4x4VL (
".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
"tbz %x[flags], #5, 27f\n"
"and z24.d, z23.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z24.s\n"
"and z30.d, z20.d, z0.d\n"
"and z29.d, z21.d, z0.d\n"
"and z28.d, z22.d, z0.d\n"
"and z27.d, z16.d, z0.d\n"
"and z26.d, z17.d, z0.d\n"
- "asr z24.s, z24.s, #0x1f\n"
"and z25.d, z18.d, z0.d\n"
+ "and z24.d, z19.d, z0.d\n"
"asr z30.s, z30.s, #0x1f\n"
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
"asr z27.s, z27.s, #0x1f\n"
- "sqadd z23.s, z23.s, z24.s\n"
- "and z24.d, z19.d, z0.d\n"
"asr z26.s, z26.s, #0x1f\n"
"asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
"sqadd z20.s, z20.s, z30.s\n"
"sqadd z21.s, z21.s, z29.s\n"
- "asr z24.s, z24.s, #0x1f\n"
"sqadd z22.s, z22.s, z28.s\n"
"sqadd z16.s, z16.s, z27.s\n"
"sqadd z17.s, z17.s, z26.s\n"
@@ -522,27 +522,27 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"sqadd z19.s, z19.s, z24.s\n"
"27:" // Height 2: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "ld1rw { z26.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z24.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z24.s\n"
+ "add z21.s, z21.s, z24.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z24.s\n"
+ "add z16.s, z16.s, z24.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z23.s, z23.s, z26.s\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z20.s, z20.s, z26.s\n"
- "add z21.s, z21.s, z26.s\n"
"ld1rw { z25.s }, p2/Z, [x20]\n"
- "add z22.s, z22.s, z26.s\n"
- "add z16.s, z16.s, z26.s\n"
+ "add z19.s, z19.s, z24.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z26.s\n"
- "add z18.s, z18.s, z26.s\n"
"ld1rw { z24.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z26.s\n"
"smin z23.s, p2/M, z23.s, z25.s\n"
"smin z20.s, p2/M, z20.s, z25.s\n"
"smin z21.s, p2/M, z21.s, z25.s\n"
@@ -554,20 +554,20 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"smax z23.s, p2/M, z23.s, z24.s\n"
"smax z20.s, p2/M, z20.s, z24.s\n"
"smax z21.s, p2/M, z21.s, z24.s\n"
+ "uzp1 z23.h, z23.h, z20.h\n"
"smax z22.s, p2/M, z22.s, z24.s\n"
"smax z16.s, p2/M, z16.s, z24.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z23.b, z23.b, z20.b\n"
"smax z17.s, p2/M, z17.s, z24.s\n"
"smax z18.s, p2/M, z18.s, z24.s\n"
- "smax z19.s, p2/M, z19.s, z24.s\n"
- "uzp1 z23.h, z23.h, z20.h\n"
- "uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z23.b }, p1, [x27]\n"
+ "smax z19.s, p2/M, z19.s, z24.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
- "uzp1 z23.b, z23.b, z20.b\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z23.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
"st1b { z16.b }, p1, [x23]\n"
+ "addvl x27, x27, #1\n"
"28:" // Height 2: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -582,16 +582,16 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"mov z15.b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"30:" // Height 3: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
@@ -606,8 +606,8 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"mov x26, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -629,49 +629,49 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"ble 37f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x22, x22, #0x10\n"
"trn1 z0.d, z1.d, z2.d\n"
"trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z6.d\n"
- "trn2 z3.d, z3.d, z6.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45059814 // smmla z20.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4505985c // smmla z28.s, z2.b, z5.b\n"
+ ".inst 0x45049811 // smmla z17.s, z0.b, z4.b\n"
"ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
- ".inst 0x450a9814 // smmla z20.s, z0.b, z10.b\n"
- ".inst 0x45099811 // smmla z17.s, z0.b, z9.b\n"
- ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
- ".inst 0x45049812 // smmla z18.s, z0.b, z4.b\n"
- ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
"ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x450a985c // smmla z28.s, z2.b, z10.b\n"
- ".inst 0x45099859 // smmla z25.s, z2.b, z9.b\n"
- ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
- ".inst 0x4504985a // smmla z26.s, z2.b, z4.b\n"
- ".inst 0x45079816 // smmla z22.s, z0.b, z7.b\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
"ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x4507985e // smmla z30.s, z2.b, z7.b\n"
- ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n"
+ ".inst 0x45099815 // smmla z21.s, z0.b, z9.b\n"
+ ".inst 0x4509985d // smmla z29.s, z2.b, z9.b\n"
"ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n"
+ ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n"
"ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n"
- ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
"ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45079816 // smmla z22.s, z0.b, z7.b\n"
+ ".inst 0x4507985e // smmla z30.s, z2.b, z7.b\n"
"ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n"
+ ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n"
"ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
"ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
"ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n"
@@ -699,32 +699,32 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"bgt 35b\n"
"37:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "subs x25, x25, #0x8\n"
- "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
"trn1 z0.d, z1.d, z2.d\n"
"trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z5.d\n"
- "trn2 z3.d, z3.d, z5.d\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n"
+ ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n"
- ".inst 0x450a9814 // smmla z20.s, z0.b, z10.b\n"
- ".inst 0x45099811 // smmla z17.s, z0.b, z9.b\n"
- ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
- ".inst 0x45079812 // smmla z18.s, z0.b, z7.b\n"
- ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
"ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x450a985c // smmla z28.s, z2.b, z10.b\n"
- "addvl x28, x28, #8\n"
+ ".inst 0x45099811 // smmla z17.s, z0.b, z9.b\n"
".inst 0x45099859 // smmla z25.s, z2.b, z9.b\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45079812 // smmla z18.s, z0.b, z7.b\n"
".inst 0x4507985a // smmla z26.s, z2.b, z7.b\n"
".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n"
".inst 0x4506985e // smmla z30.s, z2.b, z6.b\n"
@@ -734,24 +734,24 @@ void sve_hybrid_s8qa_mmla_4x4VL (
".inst 0x4504985f // smmla z31.s, z2.b, z4.b\n"
"ble 38f\n"
"ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45059834 // smmla z20.s, z1.b, z5.b\n"
+ ".inst 0x4505987c // smmla z28.s, z3.b, z5.b\n"
"ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
"ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ ".inst 0x45049831 // smmla z17.s, z1.b, z4.b\n"
+ ".inst 0x45049879 // smmla z25.s, z3.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n"
- ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n"
- ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n"
- ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n"
- "addvl x28, x28, #8\n"
".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n"
+ "addvl x28, x28, #8\n"
".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n"
".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
@@ -771,15 +771,15 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
+ "add x22, x23, x20\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "add x23, x27, x20\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "add x22, x23, x20\n"
"uzp1 z24.d, z24.d, z28.d\n"
"uzp1 z25.d, z25.d, z29.d\n"
"uzp1 z26.d, z26.d, z30.d\n"
@@ -787,14 +787,14 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"mov z31.d, z0.d\n"
"tbnz %x[flags], #31, 40f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
- "ld1rw { z23.s }, p2/Z, [x20]\n"
"neg z23.s, p2/M, z23.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
- "mov z13.s, z13.s[0]\n"
"mul z11.s, p2/M, z11.s, z23.s\n"
+ "mov z13.s, z13.s[0]\n"
"mul z12.s, p2/M, z12.s, z23.s\n"
"mul z13.s, p2/M, z13.s, z23.s\n"
"40:" // Height 3: skip row sum fixup
@@ -850,18 +850,18 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"and z29.d, z21.d, z0.d\n"
"and z28.d, z22.d, z0.d\n"
"and z23.d, z16.d, z0.d\n"
- "and z3.d, z17.d, z0.d\n"
"asr z1.s, z1.s, #0x1f\n"
"asr z30.s, z30.s, #0x1f\n"
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
"asr z23.s, z23.s, #0x1f\n"
- "and z2.d, z18.d, z0.d\n"
"sqadd z31.s, z31.s, z1.s\n"
"sqadd z20.s, z20.s, z30.s\n"
"sqadd z21.s, z21.s, z29.s\n"
"sqadd z22.s, z22.s, z28.s\n"
"sqadd z16.s, z16.s, z23.s\n"
+ "and z3.d, z17.d, z0.d\n"
+ "and z2.d, z18.d, z0.d\n"
"and z1.d, z19.d, z0.d\n"
"and z30.d, z24.d, z0.d\n"
"and z29.d, z25.d, z0.d\n"
@@ -883,35 +883,35 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"sqadd z27.s, z27.s, z23.s\n"
"41:" // Height 3: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "ld1rw { z29.s }, p2/Z, [x20]\n"
+ "add z31.s, z31.s, z23.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z23.s\n"
+ "add z21.s, z21.s, z23.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z23.s\n"
+ "add z16.s, z16.s, z23.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z31.s, z31.s, z29.s\n"
+ "add z17.s, z17.s, z23.s\n"
+ "add z18.s, z18.s, z23.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z20.s, z20.s, z29.s\n"
- "add z21.s, z21.s, z29.s\n"
+ "add z19.s, z19.s, z23.s\n"
+ "add z24.s, z24.s, z23.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z22.s, z22.s, z29.s\n"
- "add z16.s, z16.s, z29.s\n"
+ "add z25.s, z25.s, z23.s\n"
+ "add z26.s, z26.s, z23.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z17.s, z17.s, z29.s\n"
- "add z18.s, z18.s, z29.s\n"
"ld1rw { z28.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z29.s\n"
- "add z24.s, z24.s, z29.s\n"
+ "add z27.s, z27.s, z23.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z29.s\n"
- "add z26.s, z26.s, z29.s\n"
"ld1rw { z23.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z29.s\n"
"smin z31.s, p2/M, z31.s, z28.s\n"
"smin z20.s, p2/M, z20.s, z28.s\n"
"smin z21.s, p2/M, z21.s, z28.s\n"
@@ -927,28 +927,28 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"smax z31.s, p2/M, z31.s, z23.s\n"
"smax z20.s, p2/M, z20.s, z23.s\n"
"smax z21.s, p2/M, z21.s, z23.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
"smax z22.s, p2/M, z22.s, z23.s\n"
"smax z16.s, p2/M, z16.s, z23.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
"smax z17.s, p2/M, z17.s, z23.s\n"
"smax z18.s, p2/M, z18.s, z23.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z31.b }, p1, [x27]\n"
"smax z19.s, p2/M, z19.s, z23.s\n"
- "uzp1 z31.h, z31.h, z20.h\n"
"smax z24.s, p2/M, z24.s, z23.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
"smax z25.s, p2/M, z25.s, z23.s\n"
- "uzp1 z20.h, z21.h, z22.h\n"
"smax z26.s, p2/M, z26.s, z23.s\n"
- "smax z27.s, p2/M, z27.s, z23.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z18.h, z18.h, z19.h\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z31.b, z31.b, z20.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "st1b { z31.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "uzp1 z24.b, z24.b, z17.b\n"
"st1b { z16.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z23.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"st1b { z24.b }, p1, [x22]\n"
+ "addvl x27, x27, #1\n"
"42:" // Height 3: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -956,28 +956,27 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"b 58f\n"
"43:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x4\n"
"mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov z13.s, #0x0\n"
"mov z14.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
"mov z15.b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"44:" // Height 4: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
@@ -992,8 +991,8 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"mov x26, #0x0\n"
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1018,56 +1017,56 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"ble 51f\n"
"49:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "ld1rqb { z6.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
"trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
"trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z6.d\n"
- "trn2 z3.d, z3.d, z6.d\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n"
+ ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n"
- ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
- ".inst 0x45099815 // smmla z21.s, z0.b, z9.b\n"
- ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n"
- ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
+ ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45059811 // smmla z17.s, z0.b, z5.b\n"
+ ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45049815 // smmla z21.s, z0.b, z4.b\n"
+ ".inst 0x4504985d // smmla z29.s, z2.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- ".inst 0x4509985d // smmla z29.s, z2.b, z9.b\n"
- ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n"
- ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
"ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n"
- ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n"
+ ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n"
+ ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n"
+ ".inst 0x45079816 // smmla z22.s, z0.b, z7.b\n"
"ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
"ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n"
- ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ ".inst 0x4507985e // smmla z30.s, z2.b, z7.b\n"
+ ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n"
"ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
"ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
"ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
"ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n"
+ "add x22, x22, #0x10\n"
".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n"
".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n"
+ "add x21, x21, #0x10\n"
".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n"
".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
@@ -1090,60 +1089,60 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"bgt 49b\n"
"51:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x8\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
"ld1rqb { z5.b }, p0/Z, [x21]\n"
- "trn1 z0.d, z1.d, z2.d\n"
"trn2 z1.d, z1.d, z2.d\n"
"trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
"trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n"
+ ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ ".inst 0x45059814 // smmla z20.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4505985c // smmla z28.s, z2.b, z5.b\n"
+ ".inst 0x45049811 // smmla z17.s, z0.b, z4.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45069810 // smmla z16.s, z0.b, z6.b\n"
- ".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n"
- ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
- ".inst 0x45099815 // smmla z21.s, z0.b, z9.b\n"
- ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n"
- ".inst 0x45069858 // smmla z24.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
+ ".inst 0x45079812 // smmla z18.s, z0.b, z7.b\n"
"addvl x28, x28, #8\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- ".inst 0x4509985d // smmla z29.s, z2.b, z9.b\n"
- ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n"
- ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
- ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n"
+ ".inst 0x4507985a // smmla z26.s, z2.b, z7.b\n"
+ ".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n"
+ ".inst 0x4506985e // smmla z30.s, z2.b, z6.b\n"
".inst 0x45059813 // smmla z19.s, z0.b, z5.b\n"
".inst 0x4505985b // smmla z27.s, z2.b, z5.b\n"
- ".inst 0x45069817 // smmla z23.s, z0.b, z6.b\n"
- ".inst 0x4506985f // smmla z31.s, z2.b, z6.b\n"
+ ".inst 0x45049817 // smmla z23.s, z0.b, z4.b\n"
+ ".inst 0x4504985f // smmla z31.s, z2.b, z4.b\n"
"ble 52f\n"
"ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45059834 // smmla z20.s, z1.b, z5.b\n"
+ ".inst 0x4505987c // smmla z28.s, z3.b, z5.b\n"
"ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
"ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ ".inst 0x45049831 // smmla z17.s, z1.b, z4.b\n"
+ ".inst 0x45049879 // smmla z25.s, z3.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n"
- ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n"
- ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n"
- ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n"
- "addvl x28, x28, #8\n"
".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n"
+ "addvl x28, x28, #8\n"
".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n"
".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
@@ -1163,16 +1162,16 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"bne 46b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
+ "add x21, x22, x20\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "add x23, x27, x20\n"
- "add x22, x23, x20\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "add x21, x22, x20\n"
"uzp1 z23.d, z24.d, z28.d\n"
"uzp2 z24.d, z24.d, z28.d\n"
"uzp1 z28.d, z25.d, z29.d\n"
@@ -1184,15 +1183,15 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"mov z31.d, z0.d\n"
"tbnz %x[flags], #31, 54f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
"neg z0.s, p2/M, z0.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
"mov z14.s, z13.s[3]\n"
"mov z13.s, z13.s[0]\n"
- "mul z11.s, p2/M, z11.s, z0.s\n"
"mul z12.s, p2/M, z12.s, z0.s\n"
"mul z13.s, p2/M, z13.s, z0.s\n"
"mul z14.s, p2/M, z14.s, z0.s\n"
@@ -1258,32 +1257,32 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"tbz %x[flags], #5, 55f\n"
"and z2.d, z31.d, z0.d\n"
"and z1.d, z20.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z2.s\n"
+ "sqadd z20.s, z20.s, z1.s\n"
"and z7.d, z21.d, z0.d\n"
"and z6.d, z22.d, z0.d\n"
"and z5.d, z16.d, z0.d\n"
"and z4.d, z17.d, z0.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
"and z3.d, z18.d, z0.d\n"
+ "and z2.d, z19.d, z0.d\n"
+ "and z1.d, z23.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z31.s, z31.s, z2.s\n"
- "sqadd z20.s, z20.s, z1.s\n"
- "and z2.d, z19.d, z0.d\n"
- "and z1.d, z23.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"asr z3.s, z3.s, #0x1f\n"
- "sqadd z21.s, z21.s, z7.s\n"
- "sqadd z22.s, z22.s, z6.s\n"
"asr z2.s, z2.s, #0x1f\n"
"asr z1.s, z1.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z7.s\n"
+ "sqadd z22.s, z22.s, z6.s\n"
"sqadd z16.s, z16.s, z5.s\n"
"sqadd z17.s, z17.s, z4.s\n"
"sqadd z18.s, z18.s, z3.s\n"
- "and z7.d, z28.d, z0.d\n"
"sqadd z19.s, z19.s, z2.s\n"
"sqadd z23.s, z23.s, z1.s\n"
+ "and z7.d, z28.d, z0.d\n"
"and z6.d, z29.d, z0.d\n"
"and z5.d, z30.d, z0.d\n"
"and z4.d, z24.d, z0.d\n"
@@ -1306,43 +1305,43 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"sqadd z27.s, z27.s, z1.s\n"
"55:" // Height 4: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"ld1rw { z2.s }, p2/Z, [x20]\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add z31.s, z31.s, z2.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z2.s\n"
+ "add z21.s, z21.s, z2.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z16.s, z16.s, z2.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z31.s, z31.s, z2.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z20.s, z20.s, z2.s\n"
- "add z21.s, z21.s, z2.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z23.s, z23.s, z2.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z16.s, z16.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "add z29.s, z29.s, z2.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z2.s\n"
- "add z18.s, z18.s, z2.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z19.s, z19.s, z2.s\n"
- "add z23.s, z23.s, z2.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z28.s, z28.s, z2.s\n"
- "add z29.s, z29.s, z2.s\n"
"ld1rw { z1.s }, p2/Z, [x20]\n"
- "add z30.s, z30.s, z2.s\n"
- "add z24.s, z24.s, z2.s\n"
+ "add z27.s, z27.s, z2.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z2.s\n"
- "add z26.s, z26.s, z2.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z2.s\n"
"smin z31.s, p2/M, z31.s, z1.s\n"
"smin z20.s, p2/M, z20.s, z1.s\n"
"smin z21.s, p2/M, z21.s, z1.s\n"
@@ -1362,36 +1361,36 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"smax z31.s, p2/M, z31.s, z0.s\n"
"smax z20.s, p2/M, z20.s, z0.s\n"
"smax z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
"smax z22.s, p2/M, z22.s, z0.s\n"
"smax z16.s, p2/M, z16.s, z0.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
"smax z17.s, p2/M, z17.s, z0.s\n"
"smax z18.s, p2/M, z18.s, z0.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z31.b }, p1, [x27]\n"
"smax z19.s, p2/M, z19.s, z0.s\n"
- "uzp1 z31.h, z31.h, z20.h\n"
"smax z23.s, p2/M, z23.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
"smax z28.s, p2/M, z28.s, z0.s\n"
- "uzp1 z20.h, z21.h, z22.h\n"
"smax z29.s, p2/M, z29.s, z0.s\n"
+ "uzp1 z23.h, z23.h, z28.h\n"
+ "st1b { z16.b }, p1, [x23]\n"
"smax z30.s, p2/M, z30.s, z0.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
"smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z29.h, z30.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
"smax z25.s, p2/M, z25.s, z0.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
"smax z26.s, p2/M, z26.s, z0.s\n"
- "smax z27.s, p2/M, z27.s, z0.s\n"
- "uzp1 z23.h, z23.h, z28.h\n"
- "uzp1 z31.b, z31.b, z20.b\n"
- "uzp1 z18.h, z29.h, z30.h\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "st1b { z31.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "uzp1 z23.b, z23.b, z18.b\n"
- "uzp1 z24.b, z24.b, z17.b\n"
- "st1b { z16.b }, p1, [x23]\n"
"st1b { z23.b }, p1, [x22]\n"
+ "smax z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"st1b { z24.b }, p1, [x21]\n"
+ "addvl x27, x27, #1\n"
"56:" // Height 4: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -1408,8 +1407,8 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
index bd34f29894..6116e0cefb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
@@ -70,7 +70,7 @@ public:
return false;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
index 49930b57f7..b3fb963111 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -47,18 +47,18 @@ void sve_hybrid_s8qs_dot_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -98,22 +98,22 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"3:" // Height 1: setup done
"mov x28, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -129,103 +129,103 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ble 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "sdot z8.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[0]\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
"sdot z11.s, z16.b, z0.b[0]\n"
- "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n"
- "sdot z8.s, z17.b, z0.b[1]\n"
- "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z8.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[1]\n"
- "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- "sdot z10.s, z17.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z11.s, z16.b, z0.b[1]\n"
- "ld1b { z17.b }, p2/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z16.b }, p2/Z, [x10, #-7, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p2/Z, [x10, #-6, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[2]\n"
- "ld1b { z16.b }, p2/Z, [x10, #-5, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p2/Z, [x10, #-4, MUL VL]\n"
"sdot z11.s, z16.b, z0.b[2]\n"
- "ld1b { z16.b }, p2/Z, [x10, #-3, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[3]\n"
- "ld1b { z17.b }, p2/Z, [x10, #-2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[3]\n"
- "ld1b { z16.b }, p2/Z, [x10, #-1, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
"sdot z10.s, z17.b, z0.b[3]\n"
"sdot z11.s, z16.b, z0.b[3]\n"
+ "add x26, x26, #0x10\n"
"bgt 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "sdot z8.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[0]\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z17.b, z0.b[0]\n"
"sdot z11.s, z16.b, z0.b[0]\n"
+ "addvl x9, x9, #4\n"
"ble 9f\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[1]\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[1]\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z10.s, z17.b, z0.b[1]\n"
"sdot z11.s, z16.b, z0.b[1]\n"
+ "addvl x9, x9, #4\n"
"ble 9f\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[2]\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z10.s, z17.b, z0.b[2]\n"
"sdot z11.s, z16.b, z0.b[2]\n"
+ "addvl x9, x9, #4\n"
"ble 9f\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[3]\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[3]\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z17.b, z0.b[3]\n"
"sdot z11.s, z16.b, z0.b[3]\n"
+ "addvl x9, x9, #4\n"
"9:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 4b\n"
- "ld1w { z19.s }, p2/Z, [x14]\n"
- "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x14]\n"
+ "ld1w { z16.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "add z8.s, z8.s, z17.s\n"
+ "add z9.s, z9.s, z16.s\n"
"ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
"ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
- "add z8.s, z8.s, z19.s\n"
- "add z9.s, z9.s, z18.s\n"
"add z10.s, z10.s, z17.s\n"
"add z11.s, z11.s, z16.s\n"
+ "addvl x14, x14, #4\n"
"tbz %x[flags], #4, 10f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -269,19 +269,19 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z11.s, z11.s, z16.s\n"
"12:" // Height 1: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "ld1rw { z18.s }, p2/Z, [x20]\n"
+ "add z8.s, z8.s, z16.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z16.s\n"
+ "add z10.s, z10.s, z16.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
"ld1rw { z17.s }, p2/Z, [x20]\n"
- "add z8.s, z8.s, z18.s\n"
+ "add z11.s, z11.s, z16.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z9.s, z9.s, z18.s\n"
- "add z10.s, z10.s, z18.s\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
- "add z11.s, z11.s, z18.s\n"
"smin z8.s, p2/M, z8.s, z17.s\n"
"smin z9.s, p2/M, z9.s, z17.s\n"
"smin z10.s, p2/M, z10.s, z17.s\n"
@@ -289,41 +289,41 @@ void sve_hybrid_s8qs_dot_6x4VL (
"smax z8.s, p2/M, z8.s, z16.s\n"
"smax z9.s, p2/M, z9.s, z16.s\n"
"smax z10.s, p2/M, z10.s, z16.s\n"
- "smax z11.s, p2/M, z11.s, z16.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
+ "smax z11.s, p2/M, z11.s, z16.s\n"
"uzp1 z16.h, z10.h, z11.h\n"
"uzp1 z8.b, z8.b, z16.b\n"
- "st1b { z8.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
"13:" // Height 1: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"15:" // Height 2: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"16:" // Height 2: setup done
"mov x28, #0x0\n"
"17:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 18f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -342,57 +342,57 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ble 21f\n"
"20:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z17.b, z1.b[0]\n"
"sdot z12.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z1.b[0]\n"
"sdot z13.s, z16.b, z0.b[0]\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z17.b, z1.b[0]\n"
"sdot z14.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "cmp x27, #0x10\n"
"sdot z11.s, z16.b, z1.b[0]\n"
"sdot z15.s, z16.b, z0.b[0]\n"
- "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
"sdot z8.s, z17.b, z1.b[1]\n"
"sdot z12.s, z17.b, z0.b[1]\n"
- "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z16.b, z1.b[1]\n"
"sdot z13.s, z16.b, z0.b[1]\n"
- "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z17.b, z1.b[1]\n"
"sdot z14.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
"sdot z11.s, z16.b, z1.b[1]\n"
"sdot z15.s, z16.b, z0.b[1]\n"
- "ld1b { z17.b }, p2/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z16.b }, p2/Z, [x10, #-7, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z17.b, z1.b[2]\n"
"sdot z12.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p2/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
"sdot z9.s, z16.b, z1.b[2]\n"
"sdot z13.s, z16.b, z0.b[2]\n"
- "ld1b { z16.b }, p2/Z, [x10, #-5, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z17.b, z1.b[2]\n"
"sdot z14.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p2/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
"sdot z11.s, z16.b, z1.b[2]\n"
"sdot z15.s, z16.b, z0.b[2]\n"
- "ld1b { z16.b }, p2/Z, [x10, #-3, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z17.b, z1.b[3]\n"
"sdot z12.s, z17.b, z0.b[3]\n"
- "ld1b { z17.b }, p2/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
"sdot z9.s, z16.b, z1.b[3]\n"
"sdot z13.s, z16.b, z0.b[3]\n"
- "ld1b { z16.b }, p2/Z, [x10, #-1, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z17.b, z1.b[3]\n"
"sdot z14.s, z17.b, z0.b[3]\n"
"sdot z11.s, z16.b, z1.b[3]\n"
@@ -400,64 +400,64 @@ void sve_hybrid_s8qs_dot_6x4VL (
"bgt 20b\n"
"21:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[0]\n"
"sdot z12.s, z17.b, z1.b[0]\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[0]\n"
"sdot z13.s, z16.b, z1.b[0]\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z17.b, z0.b[0]\n"
"sdot z14.s, z17.b, z1.b[0]\n"
+ "addvl x9, x9, #4\n"
"sdot z11.s, z16.b, z0.b[0]\n"
"sdot z15.s, z16.b, z1.b[0]\n"
"ble 22f\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[1]\n"
"sdot z12.s, z17.b, z1.b[1]\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[1]\n"
"sdot z13.s, z16.b, z1.b[1]\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z10.s, z17.b, z0.b[1]\n"
"sdot z14.s, z17.b, z1.b[1]\n"
+ "addvl x9, x9, #4\n"
"sdot z11.s, z16.b, z0.b[1]\n"
"sdot z15.s, z16.b, z1.b[1]\n"
"ble 22f\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[2]\n"
"sdot z12.s, z17.b, z1.b[2]\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[2]\n"
"sdot z13.s, z16.b, z1.b[2]\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z10.s, z17.b, z0.b[2]\n"
"sdot z14.s, z17.b, z1.b[2]\n"
+ "addvl x9, x9, #4\n"
"sdot z11.s, z16.b, z0.b[2]\n"
"sdot z15.s, z16.b, z1.b[2]\n"
"ble 22f\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[3]\n"
"sdot z12.s, z17.b, z1.b[3]\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[3]\n"
"sdot z13.s, z16.b, z1.b[3]\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z17.b, z0.b[3]\n"
"sdot z14.s, z17.b, z1.b[3]\n"
+ "addvl x9, x9, #4\n"
"sdot z11.s, z16.b, z0.b[3]\n"
"sdot z15.s, z16.b, z1.b[3]\n"
"22:" // Height 2: Multiply loop: multiply skip
@@ -467,17 +467,17 @@ void sve_hybrid_s8qs_dot_6x4VL (
"bne 17b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"ld1w { z19.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
+ "add z8.s, z8.s, z19.s\n"
"ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
"ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
- "add x26, x9, x20\n"
- "add z8.s, z8.s, z19.s\n"
- "add z12.s, z12.s, z19.s\n"
"add z9.s, z9.s, z18.s\n"
"add z10.s, z10.s, z17.s\n"
- "add z13.s, z13.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
"add z11.s, z11.s, z16.s\n"
+ "add z12.s, z12.s, z19.s\n"
+ "addvl x14, x14, #4\n"
+ "add z13.s, z13.s, z18.s\n"
"add z14.s, z14.s, z17.s\n"
"add z15.s, z15.s, z16.s\n"
"tbz %x[flags], #4, 23f\n"
@@ -522,11 +522,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
"sqadd z8.s, z8.s, z19.s\n"
- "and z19.d, z12.d, z0.d\n"
"sqadd z9.s, z9.s, z18.s\n"
- "and z18.d, z13.d, z1.d\n"
"sqadd z10.s, z10.s, z17.s\n"
"sqadd z11.s, z11.s, z16.s\n"
+ "and z19.d, z12.d, z0.d\n"
+ "and z18.d, z13.d, z1.d\n"
"and z17.d, z14.d, z2.d\n"
"and z16.d, z15.d, z3.d\n"
"asr z19.s, z19.s, #0x1f\n"
@@ -539,73 +539,73 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z15.s, z15.s, z16.s\n"
"25:" // Height 2: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "ld1rw { z18.s }, p2/Z, [x20]\n"
+ "add z8.s, z8.s, z17.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z17.s\n"
+ "add z10.s, z10.s, z17.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z11.s, z11.s, z17.s\n"
+ "add z12.s, z12.s, z17.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
- "add z8.s, z8.s, z18.s\n"
+ "add z13.s, z13.s, z17.s\n"
+ "add z14.s, z14.s, z17.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z9.s, z9.s, z18.s\n"
- "add z10.s, z10.s, z18.s\n"
- "ld1rw { z17.s }, p2/Z, [x20]\n"
- "add z11.s, z11.s, z18.s\n"
- "add z12.s, z12.s, z18.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z13.s, z13.s, z18.s\n"
- "add z14.s, z14.s, z18.s\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
- "add z15.s, z15.s, z18.s\n"
- "smin z8.s, p2/M, z8.s, z17.s\n"
- "smin z9.s, p2/M, z9.s, z17.s\n"
- "smin z10.s, p2/M, z10.s, z17.s\n"
- "smin z11.s, p2/M, z11.s, z17.s\n"
- "smin z12.s, p2/M, z12.s, z17.s\n"
- "smin z13.s, p2/M, z13.s, z17.s\n"
- "smin z14.s, p2/M, z14.s, z17.s\n"
- "smin z15.s, p2/M, z15.s, z17.s\n"
- "smax z8.s, p2/M, z8.s, z16.s\n"
- "smax z9.s, p2/M, z9.s, z16.s\n"
- "smax z10.s, p2/M, z10.s, z16.s\n"
- "smax z11.s, p2/M, z11.s, z16.s\n"
- "smax z12.s, p2/M, z12.s, z16.s\n"
- "smax z13.s, p2/M, z13.s, z16.s\n"
- "smax z14.s, p2/M, z14.s, z16.s\n"
- "smax z15.s, p2/M, z15.s, z16.s\n"
+ "add z15.s, z15.s, z17.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z16.s\n"
+ "smin z9.s, p2/M, z9.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z16.s\n"
+ "smin z11.s, p2/M, z11.s, z16.s\n"
+ "smin z12.s, p2/M, z12.s, z16.s\n"
+ "smin z13.s, p2/M, z13.s, z16.s\n"
+ "smin z14.s, p2/M, z14.s, z16.s\n"
+ "smin z15.s, p2/M, z15.s, z16.s\n"
+ "smax z8.s, p2/M, z8.s, z17.s\n"
+ "smax z9.s, p2/M, z9.s, z17.s\n"
+ "smax z10.s, p2/M, z10.s, z17.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "uzp1 z17.h, z10.h, z11.h\n"
+ "smax z11.s, p2/M, z11.s, z17.s\n"
+ "smax z12.s, p2/M, z12.s, z17.s\n"
+ "uzp1 z16.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z16.b\n"
+ "smax z13.s, p2/M, z13.s, z17.s\n"
+ "smax z14.s, p2/M, z14.s, z17.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "smax z15.s, p2/M, z15.s, z17.s\n"
"uzp1 z16.h, z14.h, z15.h\n"
- "uzp1 z8.b, z8.b, z17.b\n"
"uzp1 z12.b, z12.b, z16.b\n"
- "st1b { z8.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
"st1b { z12.b }, p1, [x26]\n"
+ "addvl x11, x11, #1\n"
"26:" // Height 2: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"28:" // Height 3: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"mov z16.s, #0x0\n"
@@ -616,8 +616,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov x28, #0x0\n"
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -639,73 +639,73 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ble 34f\n"
"33:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z21.b }, p2/Z, [x10]\n"
- "ld1b { z20.b }, p2/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
"sdot z8.s, z21.b, z2.b[0]\n"
"sdot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z0.b[0]\n"
"sdot z9.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
"sdot z13.s, z20.b, z1.b[0]\n"
- "sdot z16.s, z21.b, z0.b[0]\n"
- "ld1b { z21.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z17.s, z20.b, z0.b[0]\n"
- "ld1b { z20.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "cmp x27, #0x10\n"
"sdot z10.s, z21.b, z2.b[0]\n"
"sdot z14.s, z21.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z18.s, z21.b, z0.b[0]\n"
- "ld1b { z21.b }, p2/Z, [x10, #4, MUL VL]\n"
"sdot z11.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
"sdot z15.s, z20.b, z1.b[0]\n"
"sdot z19.s, z20.b, z0.b[0]\n"
- "ld1b { z20.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z21.b, z2.b[1]\n"
"sdot z12.s, z21.b, z1.b[1]\n"
"sdot z16.s, z21.b, z0.b[1]\n"
- "ld1b { z21.b }, p2/Z, [x10, #6, MUL VL]\n"
"sdot z9.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #6, MUL VL]\n"
"sdot z13.s, z20.b, z1.b[1]\n"
"sdot z17.s, z20.b, z0.b[1]\n"
- "ld1b { z20.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
+ "ld1b { z20.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z21.b, z2.b[1]\n"
"sdot z14.s, z21.b, z1.b[1]\n"
"sdot z18.s, z21.b, z0.b[1]\n"
"sdot z11.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-8, MUL VL]\n"
"sdot z15.s, z20.b, z1.b[1]\n"
"sdot z19.s, z20.b, z0.b[1]\n"
- "ld1b { z21.b }, p2/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z20.b }, p2/Z, [x10, #-7, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z21.b, z2.b[2]\n"
"sdot z12.s, z21.b, z1.b[2]\n"
"sdot z16.s, z21.b, z0.b[2]\n"
- "ld1b { z21.b }, p2/Z, [x10, #-6, MUL VL]\n"
"sdot z9.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-6, MUL VL]\n"
"sdot z13.s, z20.b, z1.b[2]\n"
"sdot z17.s, z20.b, z0.b[2]\n"
- "ld1b { z20.b }, p2/Z, [x10, #-5, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z21.b, z2.b[2]\n"
"sdot z14.s, z21.b, z1.b[2]\n"
"sdot z18.s, z21.b, z0.b[2]\n"
- "ld1b { z21.b }, p2/Z, [x10, #-4, MUL VL]\n"
"sdot z11.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-4, MUL VL]\n"
"sdot z15.s, z20.b, z1.b[2]\n"
"sdot z19.s, z20.b, z0.b[2]\n"
- "ld1b { z20.b }, p2/Z, [x10, #-3, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z21.b, z2.b[3]\n"
"sdot z12.s, z21.b, z1.b[3]\n"
"sdot z16.s, z21.b, z0.b[3]\n"
- "ld1b { z21.b }, p2/Z, [x10, #-2, MUL VL]\n"
"sdot z9.s, z20.b, z2.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-2, MUL VL]\n"
"sdot z13.s, z20.b, z1.b[3]\n"
"sdot z17.s, z20.b, z0.b[3]\n"
- "ld1b { z20.b }, p2/Z, [x10, #-1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z21.b, z2.b[3]\n"
"sdot z14.s, z21.b, z1.b[3]\n"
"sdot z18.s, z21.b, z0.b[3]\n"
@@ -715,21 +715,21 @@ void sve_hybrid_s8qs_dot_6x4VL (
"bgt 33b\n"
"34:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z21.b }, p2/Z, [x10]\n"
- "ld1b { z20.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
"sdot z8.s, z21.b, z0.b[0]\n"
"sdot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z2.b[0]\n"
"sdot z9.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
"sdot z13.s, z20.b, z1.b[0]\n"
- "sdot z16.s, z21.b, z2.b[0]\n"
- "ld1b { z21.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z17.s, z20.b, z2.b[0]\n"
- "ld1b { z20.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z21.b, z0.b[0]\n"
"sdot z14.s, z21.b, z1.b[0]\n"
"sdot z18.s, z21.b, z2.b[0]\n"
@@ -737,18 +737,18 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z15.s, z20.b, z1.b[0]\n"
"sdot z19.s, z20.b, z2.b[0]\n"
"ble 35f\n"
- "ld1b { z21.b }, p2/Z, [x10]\n"
- "ld1b { z20.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z21.b, z0.b[1]\n"
"sdot z12.s, z21.b, z1.b[1]\n"
"sdot z16.s, z21.b, z2.b[1]\n"
- "ld1b { z21.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z13.s, z20.b, z1.b[1]\n"
"sdot z17.s, z20.b, z2.b[1]\n"
- "ld1b { z20.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z21.b, z0.b[1]\n"
"sdot z14.s, z21.b, z1.b[1]\n"
"sdot z18.s, z21.b, z2.b[1]\n"
@@ -756,18 +756,18 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z15.s, z20.b, z1.b[1]\n"
"sdot z19.s, z20.b, z2.b[1]\n"
"ble 35f\n"
- "ld1b { z21.b }, p2/Z, [x10]\n"
- "ld1b { z20.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z21.b, z0.b[2]\n"
"sdot z12.s, z21.b, z1.b[2]\n"
"sdot z16.s, z21.b, z2.b[2]\n"
- "ld1b { z21.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z13.s, z20.b, z1.b[2]\n"
"sdot z17.s, z20.b, z2.b[2]\n"
- "ld1b { z20.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z21.b, z0.b[2]\n"
"sdot z14.s, z21.b, z1.b[2]\n"
"sdot z18.s, z21.b, z2.b[2]\n"
@@ -775,17 +775,17 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z15.s, z20.b, z1.b[2]\n"
"sdot z19.s, z20.b, z2.b[2]\n"
"ble 35f\n"
- "ld1b { z21.b }, p2/Z, [x10]\n"
- "ld1b { z20.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z21.b, z0.b[3]\n"
"sdot z12.s, z21.b, z1.b[3]\n"
"sdot z16.s, z21.b, z2.b[3]\n"
- "ld1b { z21.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
"sdot z13.s, z20.b, z1.b[3]\n"
"sdot z17.s, z20.b, z2.b[3]\n"
- "ld1b { z20.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z21.b, z0.b[3]\n"
"sdot z14.s, z21.b, z1.b[3]\n"
"sdot z18.s, z21.b, z2.b[3]\n"
@@ -799,17 +799,17 @@ void sve_hybrid_s8qs_dot_6x4VL (
"bne 30b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"ld1w { z23.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
+ "add x25, x26, x20\n"
"ld1w { z22.s }, p2/Z, [x14, #1, MUL VL]\n"
"ld1w { z21.s }, p2/Z, [x14, #2, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
- "add x26, x9, x20\n"
- "add x25, x26, x20\n"
"add z8.s, z8.s, z23.s\n"
- "add z12.s, z12.s, z23.s\n"
"add z9.s, z9.s, z22.s\n"
+ "ld1w { z20.s }, p2/Z, [x14, #3, MUL VL]\n"
"add z10.s, z10.s, z21.s\n"
"add z11.s, z11.s, z20.s\n"
+ "addvl x14, x14, #4\n"
+ "add z12.s, z12.s, z23.s\n"
"add z13.s, z13.s, z22.s\n"
"add z14.s, z14.s, z21.s\n"
"add z15.s, z15.s, z20.s\n"
@@ -863,11 +863,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z21.s, z21.s, #0x1f\n"
"asr z20.s, z20.s, #0x1f\n"
"sqadd z8.s, z8.s, z23.s\n"
- "and z23.d, z12.d, z0.d\n"
"sqadd z9.s, z9.s, z22.s\n"
- "and z22.d, z13.d, z1.d\n"
"sqadd z10.s, z10.s, z21.s\n"
"sqadd z11.s, z11.s, z20.s\n"
+ "and z23.d, z12.d, z0.d\n"
+ "and z22.d, z13.d, z1.d\n"
"and z21.d, z14.d, z2.d\n"
"and z20.d, z15.d, z3.d\n"
"asr z23.s, z23.s, #0x1f\n"
@@ -875,11 +875,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z21.s, z21.s, #0x1f\n"
"asr z20.s, z20.s, #0x1f\n"
"sqadd z12.s, z12.s, z23.s\n"
- "and z23.d, z16.d, z0.d\n"
"sqadd z13.s, z13.s, z22.s\n"
- "and z22.d, z17.d, z1.d\n"
"sqadd z14.s, z14.s, z21.s\n"
"sqadd z15.s, z15.s, z20.s\n"
+ "and z23.d, z16.d, z0.d\n"
+ "and z22.d, z17.d, z1.d\n"
"and z21.d, z18.d, z2.d\n"
"and z20.d, z19.d, z3.d\n"
"asr z23.s, z23.s, #0x1f\n"
@@ -892,93 +892,93 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z19.s, z19.s, z20.s\n"
"38:" // Height 3: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "ld1rw { z22.s }, p2/Z, [x20]\n"
+ "add z8.s, z8.s, z21.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z21.s\n"
+ "add z10.s, z10.s, z21.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z11.s, z11.s, z21.s\n"
+ "add z12.s, z12.s, z21.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
- "add z8.s, z8.s, z22.s\n"
+ "add z13.s, z13.s, z21.s\n"
+ "add z14.s, z14.s, z21.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z9.s, z9.s, z22.s\n"
- "add z10.s, z10.s, z22.s\n"
+ "add z15.s, z15.s, z21.s\n"
+ "add z16.s, z16.s, z21.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z11.s, z11.s, z22.s\n"
- "add z12.s, z12.s, z22.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z21.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z13.s, z13.s, z22.s\n"
- "add z14.s, z14.s, z22.s\n"
- "ld1rw { z21.s }, p2/Z, [x20]\n"
- "add z15.s, z15.s, z22.s\n"
- "add z16.s, z16.s, z22.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z22.s\n"
- "add z18.s, z18.s, z22.s\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z22.s\n"
- "smin z8.s, p2/M, z8.s, z21.s\n"
- "smin z9.s, p2/M, z9.s, z21.s\n"
- "smin z10.s, p2/M, z10.s, z21.s\n"
- "smin z11.s, p2/M, z11.s, z21.s\n"
- "smin z12.s, p2/M, z12.s, z21.s\n"
- "smin z13.s, p2/M, z13.s, z21.s\n"
- "smin z14.s, p2/M, z14.s, z21.s\n"
- "smin z15.s, p2/M, z15.s, z21.s\n"
- "smin z16.s, p2/M, z16.s, z21.s\n"
- "smin z17.s, p2/M, z17.s, z21.s\n"
- "smin z18.s, p2/M, z18.s, z21.s\n"
- "smin z19.s, p2/M, z19.s, z21.s\n"
- "smax z8.s, p2/M, z8.s, z20.s\n"
- "smax z9.s, p2/M, z9.s, z20.s\n"
- "smax z10.s, p2/M, z10.s, z20.s\n"
- "smax z11.s, p2/M, z11.s, z20.s\n"
- "smax z12.s, p2/M, z12.s, z20.s\n"
- "smax z13.s, p2/M, z13.s, z20.s\n"
- "smax z14.s, p2/M, z14.s, z20.s\n"
- "smax z15.s, p2/M, z15.s, z20.s\n"
+ "add z19.s, z19.s, z21.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z20.s\n"
+ "smin z9.s, p2/M, z9.s, z20.s\n"
+ "smin z10.s, p2/M, z10.s, z20.s\n"
+ "smin z11.s, p2/M, z11.s, z20.s\n"
+ "smin z12.s, p2/M, z12.s, z20.s\n"
+ "smin z13.s, p2/M, z13.s, z20.s\n"
+ "smin z14.s, p2/M, z14.s, z20.s\n"
+ "smin z15.s, p2/M, z15.s, z20.s\n"
+ "smin z16.s, p2/M, z16.s, z20.s\n"
+ "smin z17.s, p2/M, z17.s, z20.s\n"
+ "smin z18.s, p2/M, z18.s, z20.s\n"
+ "smin z19.s, p2/M, z19.s, z20.s\n"
+ "smax z8.s, p2/M, z8.s, z21.s\n"
+ "smax z9.s, p2/M, z9.s, z21.s\n"
+ "smax z10.s, p2/M, z10.s, z21.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z16.s, p2/M, z16.s, z20.s\n"
- "smax z17.s, p2/M, z17.s, z20.s\n"
- "uzp1 z21.h, z10.h, z11.h\n"
- "smax z18.s, p2/M, z18.s, z20.s\n"
- "smax z19.s, p2/M, z19.s, z20.s\n"
+ "smax z11.s, p2/M, z11.s, z21.s\n"
+ "smax z12.s, p2/M, z12.s, z21.s\n"
+ "uzp1 z20.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z20.b\n"
+ "smax z13.s, p2/M, z13.s, z21.s\n"
+ "smax z14.s, p2/M, z14.s, z21.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "smax z15.s, p2/M, z15.s, z21.s\n"
+ "smax z16.s, p2/M, z16.s, z21.s\n"
"uzp1 z20.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z21.s\n"
+ "smax z18.s, p2/M, z18.s, z21.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z8.b, z8.b, z21.b\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z21.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
- "uzp1 z12.b, z12.b, z20.b\n"
- "st1b { z8.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z12.b }, p1, [x26]\n"
"st1b { z16.b }, p1, [x25]\n"
+ "addvl x11, x11, #1\n"
"39:" // Height 3: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"41:" // Height 4: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"mov z16.s, #0x0\n"
@@ -993,8 +993,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov x28, #0x0\n"
"43:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 44f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1019,89 +1019,89 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ble 47f\n"
"46:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p2/Z, [x10]\n"
- "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z25.b, z3.b[0]\n"
"sdot z12.s, z25.b, z2.b[0]\n"
- "sdot z9.s, z24.b, z3.b[0]\n"
- "sdot z13.s, z24.b, z2.b[0]\n"
"sdot z16.s, z25.b, z1.b[0]\n"
"sdot z20.s, z25.b, z0.b[0]\n"
- "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ "sdot z9.s, z24.b, z3.b[0]\n"
+ "sdot z13.s, z24.b, z2.b[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"sdot z17.s, z24.b, z1.b[0]\n"
"sdot z21.s, z24.b, z0.b[0]\n"
- "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z25.b, z3.b[0]\n"
"sdot z14.s, z25.b, z2.b[0]\n"
"sdot z18.s, z25.b, z1.b[0]\n"
"sdot z22.s, z25.b, z0.b[0]\n"
- "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
"sdot z11.s, z24.b, z3.b[0]\n"
"sdot z15.s, z24.b, z2.b[0]\n"
"sdot z19.s, z24.b, z1.b[0]\n"
"sdot z23.s, z24.b, z0.b[0]\n"
- "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z25.b, z3.b[1]\n"
"sdot z12.s, z25.b, z2.b[1]\n"
"sdot z16.s, z25.b, z1.b[1]\n"
"sdot z20.s, z25.b, z0.b[1]\n"
- "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
"sdot z9.s, z24.b, z3.b[1]\n"
"sdot z13.s, z24.b, z2.b[1]\n"
"sdot z17.s, z24.b, z1.b[1]\n"
"sdot z21.s, z24.b, z0.b[1]\n"
- "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z25.b, z3.b[1]\n"
"sdot z14.s, z25.b, z2.b[1]\n"
"sdot z18.s, z25.b, z1.b[1]\n"
"sdot z22.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
"sdot z11.s, z24.b, z3.b[1]\n"
"sdot z15.s, z24.b, z2.b[1]\n"
- "ld1b { z25.b }, p2/Z, [x10, #-8, MUL VL]\n"
"sdot z19.s, z24.b, z1.b[1]\n"
"sdot z23.s, z24.b, z0.b[1]\n"
- "ld1b { z24.b }, p2/Z, [x10, #-7, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z25.b, z3.b[2]\n"
"sdot z12.s, z25.b, z2.b[2]\n"
"sdot z16.s, z25.b, z1.b[2]\n"
"sdot z20.s, z25.b, z0.b[2]\n"
- "ld1b { z25.b }, p2/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
"sdot z9.s, z24.b, z3.b[2]\n"
"sdot z13.s, z24.b, z2.b[2]\n"
"sdot z17.s, z24.b, z1.b[2]\n"
"sdot z21.s, z24.b, z0.b[2]\n"
- "ld1b { z24.b }, p2/Z, [x10, #-5, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z25.b, z3.b[2]\n"
"sdot z14.s, z25.b, z2.b[2]\n"
"sdot z18.s, z25.b, z1.b[2]\n"
"sdot z22.s, z25.b, z0.b[2]\n"
- "ld1b { z25.b }, p2/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
"sdot z11.s, z24.b, z3.b[2]\n"
"sdot z15.s, z24.b, z2.b[2]\n"
"sdot z19.s, z24.b, z1.b[2]\n"
"sdot z23.s, z24.b, z0.b[2]\n"
- "ld1b { z24.b }, p2/Z, [x10, #-3, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z25.b, z3.b[3]\n"
"sdot z12.s, z25.b, z2.b[3]\n"
"sdot z16.s, z25.b, z1.b[3]\n"
"sdot z20.s, z25.b, z0.b[3]\n"
- "ld1b { z25.b }, p2/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
"sdot z9.s, z24.b, z3.b[3]\n"
"sdot z13.s, z24.b, z2.b[3]\n"
"sdot z17.s, z24.b, z1.b[3]\n"
"sdot z21.s, z24.b, z0.b[3]\n"
- "ld1b { z24.b }, p2/Z, [x10, #-1, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z25.b, z3.b[3]\n"
"sdot z14.s, z25.b, z2.b[3]\n"
"sdot z18.s, z25.b, z1.b[3]\n"
@@ -1113,24 +1113,24 @@ void sve_hybrid_s8qs_dot_6x4VL (
"bgt 46b\n"
"47:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p2/Z, [x10]\n"
- "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z25.b, z0.b[0]\n"
"sdot z12.s, z25.b, z1.b[0]\n"
- "sdot z9.s, z24.b, z0.b[0]\n"
- "sdot z13.s, z24.b, z1.b[0]\n"
"sdot z16.s, z25.b, z2.b[0]\n"
"sdot z20.s, z25.b, z3.b[0]\n"
- "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[0]\n"
+ "sdot z13.s, z24.b, z1.b[0]\n"
"sdot z17.s, z24.b, z2.b[0]\n"
"sdot z21.s, z24.b, z3.b[0]\n"
- "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z25.b, z0.b[0]\n"
"sdot z14.s, z25.b, z1.b[0]\n"
"sdot z18.s, z25.b, z2.b[0]\n"
@@ -1140,20 +1140,20 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z19.s, z24.b, z2.b[0]\n"
"sdot z23.s, z24.b, z3.b[0]\n"
"ble 48f\n"
- "ld1b { z25.b }, p2/Z, [x10]\n"
- "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z25.b, z0.b[1]\n"
"sdot z12.s, z25.b, z1.b[1]\n"
"sdot z16.s, z25.b, z2.b[1]\n"
"sdot z20.s, z25.b, z3.b[1]\n"
- "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z9.s, z24.b, z0.b[1]\n"
"sdot z13.s, z24.b, z1.b[1]\n"
"sdot z17.s, z24.b, z2.b[1]\n"
"sdot z21.s, z24.b, z3.b[1]\n"
- "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z25.b, z0.b[1]\n"
"sdot z14.s, z25.b, z1.b[1]\n"
"sdot z18.s, z25.b, z2.b[1]\n"
@@ -1163,20 +1163,20 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z19.s, z24.b, z2.b[1]\n"
"sdot z23.s, z24.b, z3.b[1]\n"
"ble 48f\n"
- "ld1b { z25.b }, p2/Z, [x10]\n"
- "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z25.b, z0.b[2]\n"
"sdot z12.s, z25.b, z1.b[2]\n"
"sdot z16.s, z25.b, z2.b[2]\n"
"sdot z20.s, z25.b, z3.b[2]\n"
- "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z9.s, z24.b, z0.b[2]\n"
"sdot z13.s, z24.b, z1.b[2]\n"
"sdot z17.s, z24.b, z2.b[2]\n"
"sdot z21.s, z24.b, z3.b[2]\n"
- "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z25.b, z0.b[2]\n"
"sdot z14.s, z25.b, z1.b[2]\n"
"sdot z18.s, z25.b, z2.b[2]\n"
@@ -1186,19 +1186,19 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z19.s, z24.b, z2.b[2]\n"
"sdot z23.s, z24.b, z3.b[2]\n"
"ble 48f\n"
- "ld1b { z25.b }, p2/Z, [x10]\n"
- "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z25.b, z0.b[3]\n"
"sdot z12.s, z25.b, z1.b[3]\n"
"sdot z16.s, z25.b, z2.b[3]\n"
"sdot z20.s, z25.b, z3.b[3]\n"
- "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z24.b, z0.b[3]\n"
"sdot z13.s, z24.b, z1.b[3]\n"
"sdot z17.s, z24.b, z2.b[3]\n"
"sdot z21.s, z24.b, z3.b[3]\n"
- "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z25.b, z0.b[3]\n"
"sdot z14.s, z25.b, z1.b[3]\n"
"sdot z18.s, z25.b, z2.b[3]\n"
@@ -1214,18 +1214,18 @@ void sve_hybrid_s8qs_dot_6x4VL (
"bne 43b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"ld1w { z27.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
+ "add x25, x26, x20\n"
"ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n"
"ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n"
- "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
- "add x26, x9, x20\n"
- "add x25, x26, x20\n"
- "add z8.s, z8.s, z27.s\n"
- "add z12.s, z12.s, z27.s\n"
"add x24, x25, x20\n"
+ "add z8.s, z8.s, z27.s\n"
+ "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
"add z9.s, z9.s, z26.s\n"
"add z10.s, z10.s, z25.s\n"
+ "addvl x14, x14, #4\n"
"add z11.s, z11.s, z24.s\n"
+ "add z12.s, z12.s, z27.s\n"
"add z13.s, z13.s, z26.s\n"
"add z14.s, z14.s, z25.s\n"
"add z15.s, z15.s, z24.s\n"
@@ -1287,11 +1287,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z25.s, z25.s, #0x1f\n"
"asr z24.s, z24.s, #0x1f\n"
"sqadd z8.s, z8.s, z27.s\n"
- "and z27.d, z12.d, z0.d\n"
"sqadd z9.s, z9.s, z26.s\n"
- "and z26.d, z13.d, z1.d\n"
"sqadd z10.s, z10.s, z25.s\n"
"sqadd z11.s, z11.s, z24.s\n"
+ "and z27.d, z12.d, z0.d\n"
+ "and z26.d, z13.d, z1.d\n"
"and z25.d, z14.d, z2.d\n"
"and z24.d, z15.d, z3.d\n"
"asr z27.s, z27.s, #0x1f\n"
@@ -1299,11 +1299,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z25.s, z25.s, #0x1f\n"
"asr z24.s, z24.s, #0x1f\n"
"sqadd z12.s, z12.s, z27.s\n"
- "and z27.d, z16.d, z0.d\n"
"sqadd z13.s, z13.s, z26.s\n"
- "and z26.d, z17.d, z1.d\n"
"sqadd z14.s, z14.s, z25.s\n"
"sqadd z15.s, z15.s, z24.s\n"
+ "and z27.d, z16.d, z0.d\n"
+ "and z26.d, z17.d, z1.d\n"
"and z25.d, z18.d, z2.d\n"
"and z24.d, z19.d, z3.d\n"
"asr z27.s, z27.s, #0x1f\n"
@@ -1311,11 +1311,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z25.s, z25.s, #0x1f\n"
"asr z24.s, z24.s, #0x1f\n"
"sqadd z16.s, z16.s, z27.s\n"
- "and z27.d, z20.d, z0.d\n"
"sqadd z17.s, z17.s, z26.s\n"
- "and z26.d, z21.d, z1.d\n"
"sqadd z18.s, z18.s, z25.s\n"
"sqadd z19.s, z19.s, z24.s\n"
+ "and z27.d, z20.d, z0.d\n"
+ "and z26.d, z21.d, z1.d\n"
"and z25.d, z22.d, z2.d\n"
"and z24.d, z23.d, z3.d\n"
"asr z27.s, z27.s, #0x1f\n"
@@ -1328,43 +1328,43 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z23.s, z23.s, z24.s\n"
"51:" // Height 4: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
"ld1rw { z25.s }, p2/Z, [x20]\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z8.s, z8.s, z25.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z25.s\n"
+ "add z10.s, z10.s, z25.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z11.s, z11.s, z25.s\n"
+ "add z12.s, z12.s, z25.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
- "add z8.s, z8.s, z25.s\n"
+ "add z13.s, z13.s, z25.s\n"
+ "add z14.s, z14.s, z25.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z9.s, z9.s, z25.s\n"
- "add z10.s, z10.s, z25.s\n"
+ "add z15.s, z15.s, z25.s\n"
+ "add z16.s, z16.s, z25.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z11.s, z11.s, z25.s\n"
- "add z12.s, z12.s, z25.s\n"
+ "add z17.s, z17.s, z25.s\n"
+ "add z18.s, z18.s, z25.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z13.s, z13.s, z25.s\n"
- "add z14.s, z14.s, z25.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ "add z20.s, z20.s, z25.s\n"
".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
- "add z15.s, z15.s, z25.s\n"
- "add z16.s, z16.s, z25.s\n"
+ "add z21.s, z21.s, z25.s\n"
+ "add z22.s, z22.s, z25.s\n"
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z17.s, z17.s, z25.s\n"
- "add z18.s, z18.s, z25.s\n"
"ld1rw { z24.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z25.s\n"
- "add z20.s, z20.s, z25.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z21.s, z21.s, z25.s\n"
- "add z22.s, z22.s, z25.s\n"
- "ld1rw { z26.s }, p2/Z, [x20]\n"
"add z23.s, z23.s, z25.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
"smin z8.s, p2/M, z8.s, z24.s\n"
"smin z9.s, p2/M, z9.s, z24.s\n"
"smin z10.s, p2/M, z10.s, z24.s\n"
@@ -1381,60 +1381,60 @@ void sve_hybrid_s8qs_dot_6x4VL (
"smin z21.s, p2/M, z21.s, z24.s\n"
"smin z22.s, p2/M, z22.s, z24.s\n"
"smin z23.s, p2/M, z23.s, z24.s\n"
- "smax z8.s, p2/M, z8.s, z26.s\n"
- "smax z9.s, p2/M, z9.s, z26.s\n"
- "smax z10.s, p2/M, z10.s, z26.s\n"
- "smax z11.s, p2/M, z11.s, z26.s\n"
- "smax z12.s, p2/M, z12.s, z26.s\n"
- "smax z13.s, p2/M, z13.s, z26.s\n"
- "smax z14.s, p2/M, z14.s, z26.s\n"
- "smax z15.s, p2/M, z15.s, z26.s\n"
+ "smax z8.s, p2/M, z8.s, z25.s\n"
+ "smax z9.s, p2/M, z9.s, z25.s\n"
+ "smax z10.s, p2/M, z10.s, z25.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z16.s, p2/M, z16.s, z26.s\n"
- "smax z17.s, p2/M, z17.s, z26.s\n"
- "uzp1 z25.h, z10.h, z11.h\n"
- "smax z18.s, p2/M, z18.s, z26.s\n"
- "smax z19.s, p2/M, z19.s, z26.s\n"
+ "smax z11.s, p2/M, z11.s, z25.s\n"
+ "smax z12.s, p2/M, z12.s, z25.s\n"
+ "uzp1 z24.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z24.b\n"
+ "smax z13.s, p2/M, z13.s, z25.s\n"
+ "smax z14.s, p2/M, z14.s, z25.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
- "smax z20.s, p2/M, z20.s, z26.s\n"
- "smax z21.s, p2/M, z21.s, z26.s\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "smax z15.s, p2/M, z15.s, z25.s\n"
+ "smax z16.s, p2/M, z16.s, z25.s\n"
"uzp1 z24.h, z14.h, z15.h\n"
- "smax z22.s, p2/M, z22.s, z26.s\n"
- "smax z23.s, p2/M, z23.s, z26.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z8.b, z8.b, z25.b\n"
- "uzp1 z18.h, z18.h, z19.h\n"
- "uzp1 z20.h, z20.h, z21.h\n"
"uzp1 z12.b, z12.b, z24.b\n"
- "uzp1 z17.h, z22.h, z23.h\n"
- "st1b { z8.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "uzp1 z20.b, z20.b, z17.b\n"
+ "smax z17.s, p2/M, z17.s, z25.s\n"
+ "smax z18.s, p2/M, z18.s, z25.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
"st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z25.s\n"
+ "smax z20.s, p2/M, z20.s, z25.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z25.s\n"
+ "smax z22.s, p2/M, z22.s, z25.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
"st1b { z16.b }, p1, [x25]\n"
+ "smax z23.s, p2/M, z23.s, z25.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
"st1b { z20.b }, p1, [x24]\n"
+ "addvl x11, x11, #1\n"
"52:" // Height 4: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"54:" // Height 5: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"mov z16.s, #0x0\n"
@@ -1453,8 +1453,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov x28, #0x0\n"
"56:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 57f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1482,105 +1482,105 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ble 60f\n"
"59:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z29.b }, p2/Z, [x10]\n"
- "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z4.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
"sdot z8.s, z29.b, z4.b[0]\n"
"sdot z12.s, z29.b, z3.b[0]\n"
- "sdot z9.s, z28.b, z4.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z16.s, z29.b, z2.b[0]\n"
"sdot z20.s, z29.b, z1.b[0]\n"
+ "add x25, x25, #0x10\n"
"sdot z24.s, z29.b, z0.b[0]\n"
+ "sdot z9.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
"sdot z13.s, z28.b, z3.b[0]\n"
- "ld1b { z29.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z17.s, z28.b, z2.b[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"sdot z21.s, z28.b, z1.b[0]\n"
"sdot z25.s, z28.b, z0.b[0]\n"
- "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z29.b, z4.b[0]\n"
"sdot z14.s, z29.b, z3.b[0]\n"
"sdot z18.s, z29.b, z2.b[0]\n"
"sdot z22.s, z29.b, z1.b[0]\n"
"sdot z26.s, z29.b, z0.b[0]\n"
- "ld1b { z29.b }, p2/Z, [x10, #4, MUL VL]\n"
"sdot z11.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #4, MUL VL]\n"
"sdot z15.s, z28.b, z3.b[0]\n"
"sdot z19.s, z28.b, z2.b[0]\n"
"sdot z23.s, z28.b, z1.b[0]\n"
"sdot z27.s, z28.b, z0.b[0]\n"
- "ld1b { z28.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z29.b, z4.b[1]\n"
"sdot z12.s, z29.b, z3.b[1]\n"
"sdot z16.s, z29.b, z2.b[1]\n"
"sdot z20.s, z29.b, z1.b[1]\n"
"sdot z24.s, z29.b, z0.b[1]\n"
- "ld1b { z29.b }, p2/Z, [x10, #6, MUL VL]\n"
"sdot z9.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #6, MUL VL]\n"
"sdot z13.s, z28.b, z3.b[1]\n"
"sdot z17.s, z28.b, z2.b[1]\n"
"sdot z21.s, z28.b, z1.b[1]\n"
"sdot z25.s, z28.b, z0.b[1]\n"
- "ld1b { z28.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
+ "ld1b { z28.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z29.b, z4.b[1]\n"
"sdot z14.s, z29.b, z3.b[1]\n"
"sdot z18.s, z29.b, z2.b[1]\n"
"sdot z22.s, z29.b, z1.b[1]\n"
"sdot z26.s, z29.b, z0.b[1]\n"
"sdot z11.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-8, MUL VL]\n"
"sdot z15.s, z28.b, z3.b[1]\n"
- "ld1b { z29.b }, p2/Z, [x10, #-8, MUL VL]\n"
"sdot z19.s, z28.b, z2.b[1]\n"
"sdot z23.s, z28.b, z1.b[1]\n"
"sdot z27.s, z28.b, z0.b[1]\n"
- "ld1b { z28.b }, p2/Z, [x10, #-7, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z29.b, z4.b[2]\n"
"sdot z12.s, z29.b, z3.b[2]\n"
"sdot z16.s, z29.b, z2.b[2]\n"
"sdot z20.s, z29.b, z1.b[2]\n"
"sdot z24.s, z29.b, z0.b[2]\n"
- "ld1b { z29.b }, p2/Z, [x10, #-6, MUL VL]\n"
"sdot z9.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-6, MUL VL]\n"
"sdot z13.s, z28.b, z3.b[2]\n"
"sdot z17.s, z28.b, z2.b[2]\n"
"sdot z21.s, z28.b, z1.b[2]\n"
"sdot z25.s, z28.b, z0.b[2]\n"
- "ld1b { z28.b }, p2/Z, [x10, #-5, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z29.b, z4.b[2]\n"
"sdot z14.s, z29.b, z3.b[2]\n"
"sdot z18.s, z29.b, z2.b[2]\n"
"sdot z22.s, z29.b, z1.b[2]\n"
"sdot z26.s, z29.b, z0.b[2]\n"
- "ld1b { z29.b }, p2/Z, [x10, #-4, MUL VL]\n"
"sdot z11.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-4, MUL VL]\n"
"sdot z15.s, z28.b, z3.b[2]\n"
"sdot z19.s, z28.b, z2.b[2]\n"
"sdot z23.s, z28.b, z1.b[2]\n"
"sdot z27.s, z28.b, z0.b[2]\n"
- "ld1b { z28.b }, p2/Z, [x10, #-3, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z29.b, z4.b[3]\n"
"sdot z12.s, z29.b, z3.b[3]\n"
"sdot z16.s, z29.b, z2.b[3]\n"
"sdot z20.s, z29.b, z1.b[3]\n"
"sdot z24.s, z29.b, z0.b[3]\n"
- "ld1b { z29.b }, p2/Z, [x10, #-2, MUL VL]\n"
"sdot z9.s, z28.b, z4.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-2, MUL VL]\n"
"sdot z13.s, z28.b, z3.b[3]\n"
"sdot z17.s, z28.b, z2.b[3]\n"
"sdot z21.s, z28.b, z1.b[3]\n"
"sdot z25.s, z28.b, z0.b[3]\n"
- "ld1b { z28.b }, p2/Z, [x10, #-1, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z29.b, z4.b[3]\n"
"sdot z14.s, z29.b, z3.b[3]\n"
"sdot z18.s, z29.b, z2.b[3]\n"
@@ -1594,27 +1594,27 @@ void sve_hybrid_s8qs_dot_6x4VL (
"bgt 59b\n"
"60:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z29.b }, p2/Z, [x10]\n"
- "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
"sdot z8.s, z29.b, z0.b[0]\n"
"sdot z12.s, z29.b, z1.b[0]\n"
- "sdot z9.s, z28.b, z0.b[0]\n"
- "sdot z13.s, z28.b, z1.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z16.s, z29.b, z2.b[0]\n"
"sdot z20.s, z29.b, z3.b[0]\n"
"sdot z24.s, z29.b, z4.b[0]\n"
+ "sdot z9.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[0]\n"
"sdot z17.s, z28.b, z2.b[0]\n"
- "ld1b { z29.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z21.s, z28.b, z3.b[0]\n"
"sdot z25.s, z28.b, z4.b[0]\n"
- "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z29.b, z0.b[0]\n"
"sdot z14.s, z29.b, z1.b[0]\n"
"sdot z18.s, z29.b, z2.b[0]\n"
@@ -1626,23 +1626,23 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z23.s, z28.b, z3.b[0]\n"
"sdot z27.s, z28.b, z4.b[0]\n"
"ble 61f\n"
- "ld1b { z29.b }, p2/Z, [x10]\n"
- "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z29.b, z0.b[1]\n"
"sdot z12.s, z29.b, z1.b[1]\n"
"sdot z16.s, z29.b, z2.b[1]\n"
"sdot z20.s, z29.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
"sdot z24.s, z29.b, z4.b[1]\n"
- "ld1b { z29.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z28.b, z0.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
"sdot z13.s, z28.b, z1.b[1]\n"
"sdot z17.s, z28.b, z2.b[1]\n"
"sdot z21.s, z28.b, z3.b[1]\n"
"sdot z25.s, z28.b, z4.b[1]\n"
- "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z29.b, z0.b[1]\n"
- "addvl x10, x10, #4\n"
"sdot z14.s, z29.b, z1.b[1]\n"
"sdot z18.s, z29.b, z2.b[1]\n"
"sdot z22.s, z29.b, z3.b[1]\n"
@@ -1653,23 +1653,23 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z23.s, z28.b, z3.b[1]\n"
"sdot z27.s, z28.b, z4.b[1]\n"
"ble 61f\n"
- "ld1b { z29.b }, p2/Z, [x10]\n"
- "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z29.b, z0.b[2]\n"
"sdot z12.s, z29.b, z1.b[2]\n"
"sdot z16.s, z29.b, z2.b[2]\n"
"sdot z20.s, z29.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
"sdot z24.s, z29.b, z4.b[2]\n"
- "ld1b { z29.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z28.b, z0.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
"sdot z13.s, z28.b, z1.b[2]\n"
"sdot z17.s, z28.b, z2.b[2]\n"
"sdot z21.s, z28.b, z3.b[2]\n"
"sdot z25.s, z28.b, z4.b[2]\n"
- "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z29.b, z0.b[2]\n"
- "addvl x10, x10, #4\n"
"sdot z14.s, z29.b, z1.b[2]\n"
"sdot z18.s, z29.b, z2.b[2]\n"
"sdot z22.s, z29.b, z3.b[2]\n"
@@ -1680,21 +1680,21 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z23.s, z28.b, z3.b[2]\n"
"sdot z27.s, z28.b, z4.b[2]\n"
"ble 61f\n"
- "ld1b { z29.b }, p2/Z, [x10]\n"
- "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z29.b, z0.b[3]\n"
"sdot z12.s, z29.b, z1.b[3]\n"
"sdot z16.s, z29.b, z2.b[3]\n"
"sdot z20.s, z29.b, z3.b[3]\n"
"sdot z24.s, z29.b, z4.b[3]\n"
- "ld1b { z29.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z28.b, z0.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
"sdot z13.s, z28.b, z1.b[3]\n"
"sdot z17.s, z28.b, z2.b[3]\n"
"sdot z21.s, z28.b, z3.b[3]\n"
"sdot z25.s, z28.b, z4.b[3]\n"
- "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z29.b, z0.b[3]\n"
"sdot z14.s, z29.b, z1.b[3]\n"
"sdot z18.s, z29.b, z2.b[3]\n"
@@ -1711,20 +1711,20 @@ void sve_hybrid_s8qs_dot_6x4VL (
"cmp x28, x20\n"
"bne 56b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x11, x20\n"
"ld1w { z31.s }, p2/Z, [x14]\n"
+ "add x25, x26, x20\n"
"ld1w { z30.s }, p2/Z, [x14, #1, MUL VL]\n"
"ld1w { z29.s }, p2/Z, [x14, #2, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
- "add x26, x9, x20\n"
- "add x25, x26, x20\n"
- "add z8.s, z8.s, z31.s\n"
- "add z12.s, z12.s, z31.s\n"
"add x24, x25, x20\n"
"add x23, x24, x20\n"
+ "ld1w { z28.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z8.s, z8.s, z31.s\n"
"add z9.s, z9.s, z30.s\n"
+ "addvl x14, x14, #4\n"
"add z10.s, z10.s, z29.s\n"
"add z11.s, z11.s, z28.s\n"
+ "add z12.s, z12.s, z31.s\n"
"add z13.s, z13.s, z30.s\n"
"add z14.s, z14.s, z29.s\n"
"add z15.s, z15.s, z28.s\n"
@@ -1794,11 +1794,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
"sqadd z8.s, z8.s, z31.s\n"
- "and z31.d, z12.d, z0.d\n"
"sqadd z9.s, z9.s, z30.s\n"
- "and z30.d, z13.d, z1.d\n"
"sqadd z10.s, z10.s, z29.s\n"
"sqadd z11.s, z11.s, z28.s\n"
+ "and z31.d, z12.d, z0.d\n"
+ "and z30.d, z13.d, z1.d\n"
"and z29.d, z14.d, z2.d\n"
"and z28.d, z15.d, z3.d\n"
"asr z31.s, z31.s, #0x1f\n"
@@ -1806,11 +1806,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
"sqadd z12.s, z12.s, z31.s\n"
- "and z31.d, z16.d, z0.d\n"
"sqadd z13.s, z13.s, z30.s\n"
- "and z30.d, z17.d, z1.d\n"
"sqadd z14.s, z14.s, z29.s\n"
"sqadd z15.s, z15.s, z28.s\n"
+ "and z31.d, z16.d, z0.d\n"
+ "and z30.d, z17.d, z1.d\n"
"and z29.d, z18.d, z2.d\n"
"and z28.d, z19.d, z3.d\n"
"asr z31.s, z31.s, #0x1f\n"
@@ -1818,11 +1818,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
"sqadd z16.s, z16.s, z31.s\n"
- "and z31.d, z20.d, z0.d\n"
"sqadd z17.s, z17.s, z30.s\n"
- "and z30.d, z21.d, z1.d\n"
"sqadd z18.s, z18.s, z29.s\n"
"sqadd z19.s, z19.s, z28.s\n"
+ "and z31.d, z20.d, z0.d\n"
+ "and z30.d, z21.d, z1.d\n"
"and z29.d, z22.d, z2.d\n"
"and z28.d, z23.d, z3.d\n"
"asr z31.s, z31.s, #0x1f\n"
@@ -1830,11 +1830,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
"sqadd z20.s, z20.s, z31.s\n"
- "and z31.d, z24.d, z0.d\n"
"sqadd z21.s, z21.s, z30.s\n"
- "and z30.d, z25.d, z1.d\n"
"sqadd z22.s, z22.s, z29.s\n"
"sqadd z23.s, z23.s, z28.s\n"
+ "and z31.d, z24.d, z0.d\n"
+ "and z30.d, z25.d, z1.d\n"
"and z29.d, z26.d, z2.d\n"
"and z28.d, z27.d, z3.d\n"
"asr z31.s, z31.s, #0x1f\n"
@@ -1847,51 +1847,51 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z27.s, z27.s, z28.s\n"
"64:" // Height 5: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
"ld1rw { z29.s }, p2/Z, [x20]\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z8.s, z8.s, z29.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z29.s\n"
+ "add z10.s, z10.s, z29.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z11.s, z11.s, z29.s\n"
+ "add z12.s, z12.s, z29.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
- "add z8.s, z8.s, z29.s\n"
+ "add z13.s, z13.s, z29.s\n"
+ "add z14.s, z14.s, z29.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z9.s, z9.s, z29.s\n"
- "add z10.s, z10.s, z29.s\n"
+ "add z15.s, z15.s, z29.s\n"
+ "add z16.s, z16.s, z29.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z11.s, z11.s, z29.s\n"
- "add z12.s, z12.s, z29.s\n"
+ "add z17.s, z17.s, z29.s\n"
+ "add z18.s, z18.s, z29.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z13.s, z13.s, z29.s\n"
- "add z14.s, z14.s, z29.s\n"
+ "add z19.s, z19.s, z29.s\n"
+ "add z20.s, z20.s, z29.s\n"
".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
- "add z15.s, z15.s, z29.s\n"
- "add z16.s, z16.s, z29.s\n"
+ "add z21.s, z21.s, z29.s\n"
+ "add z22.s, z22.s, z29.s\n"
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z29.s\n"
- "add z18.s, z18.s, z29.s\n"
+ "add z23.s, z23.s, z29.s\n"
+ "add z24.s, z24.s, z29.s\n"
".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
- "add z19.s, z19.s, z29.s\n"
- "add z20.s, z20.s, z29.s\n"
+ "add z25.s, z25.s, z29.s\n"
+ "add z26.s, z26.s, z29.s\n"
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z21.s, z21.s, z29.s\n"
- "add z22.s, z22.s, z29.s\n"
"ld1rw { z28.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z29.s\n"
- "add z24.s, z24.s, z29.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z29.s\n"
- "add z26.s, z26.s, z29.s\n"
- "ld1rw { z30.s }, p2/Z, [x20]\n"
"add z27.s, z27.s, z29.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
"smin z8.s, p2/M, z8.s, z28.s\n"
"smin z9.s, p2/M, z9.s, z28.s\n"
"smin z10.s, p2/M, z10.s, z28.s\n"
@@ -1912,72 +1912,71 @@ void sve_hybrid_s8qs_dot_6x4VL (
"smin z25.s, p2/M, z25.s, z28.s\n"
"smin z26.s, p2/M, z26.s, z28.s\n"
"smin z27.s, p2/M, z27.s, z28.s\n"
- "smax z8.s, p2/M, z8.s, z30.s\n"
- "smax z9.s, p2/M, z9.s, z30.s\n"
- "smax z10.s, p2/M, z10.s, z30.s\n"
- "smax z11.s, p2/M, z11.s, z30.s\n"
- "smax z12.s, p2/M, z12.s, z30.s\n"
- "smax z13.s, p2/M, z13.s, z30.s\n"
- "smax z14.s, p2/M, z14.s, z30.s\n"
- "smax z15.s, p2/M, z15.s, z30.s\n"
+ "smax z8.s, p2/M, z8.s, z29.s\n"
+ "smax z9.s, p2/M, z9.s, z29.s\n"
+ "smax z10.s, p2/M, z10.s, z29.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z16.s, p2/M, z16.s, z30.s\n"
- "smax z17.s, p2/M, z17.s, z30.s\n"
- "uzp1 z29.h, z10.h, z11.h\n"
- "smax z18.s, p2/M, z18.s, z30.s\n"
- "smax z19.s, p2/M, z19.s, z30.s\n"
+ "smax z11.s, p2/M, z11.s, z29.s\n"
+ "smax z12.s, p2/M, z12.s, z29.s\n"
+ "uzp1 z28.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z28.b\n"
+ "smax z13.s, p2/M, z13.s, z29.s\n"
+ "smax z14.s, p2/M, z14.s, z29.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
- "smax z20.s, p2/M, z20.s, z30.s\n"
- "smax z21.s, p2/M, z21.s, z30.s\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "smax z15.s, p2/M, z15.s, z29.s\n"
+ "smax z16.s, p2/M, z16.s, z29.s\n"
"uzp1 z28.h, z14.h, z15.h\n"
- "smax z22.s, p2/M, z22.s, z30.s\n"
- "smax z23.s, p2/M, z23.s, z30.s\n"
+ "uzp1 z12.b, z12.b, z28.b\n"
+ "smax z17.s, p2/M, z17.s, z29.s\n"
+ "smax z18.s, p2/M, z18.s, z29.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z8.b, z8.b, z29.b\n"
- "smax z24.s, p2/M, z24.s, z30.s\n"
- "smax z25.s, p2/M, z25.s, z30.s\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z29.s\n"
+ "smax z20.s, p2/M, z20.s, z29.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
- "smax z26.s, p2/M, z26.s, z30.s\n"
- "smax z27.s, p2/M, z27.s, z30.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z12.b, z12.b, z28.b\n"
- "uzp1 z18.h, z22.h, z23.h\n"
- "st1b { z8.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "uzp1 z24.h, z24.h, z25.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "st1b { z12.b }, p1, [x26]\n"
- "uzp1 z20.b, z20.b, z18.b\n"
- "uzp1 z24.b, z24.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z29.s\n"
+ "smax z22.s, p2/M, z22.s, z29.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
"st1b { z16.b }, p1, [x25]\n"
+ "smax z23.s, p2/M, z23.s, z29.s\n"
+ "smax z24.s, p2/M, z24.s, z29.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z29.s\n"
+ "smax z26.s, p2/M, z26.s, z29.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
"st1b { z20.b }, p1, [x24]\n"
+ "smax z27.s, p2/M, z27.s, z29.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"st1b { z24.b }, p1, [x23]\n"
+ "addvl x11, x11, #1\n"
"65:" // Height 5: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 54b\n"
"b 80f\n"
"66:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x6\n"
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"67:" // Height 6: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"mov z16.s, #0x0\n"
@@ -2000,8 +1999,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov x28, #0x0\n"
"69:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 70f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -2032,121 +2031,121 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ble 73f\n"
"72:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p2/Z, [x10]\n"
- "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z7.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z6.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z5.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
"ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1b { z1.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z1.b, z7.b[0]\n"
"sdot z12.s, z1.b, z6.b[0]\n"
- "add x21, x21, #0x10\n"
"sdot z16.s, z1.b, z5.b[0]\n"
"sdot z20.s, z1.b, z4.b[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"sdot z24.s, z1.b, z3.b[0]\n"
"sdot z28.s, z1.b, z2.b[0]\n"
- "ld1b { z1.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
"sdot z9.s, z0.b, z7.b[0]\n"
"sdot z13.s, z0.b, z6.b[0]\n"
"sdot z17.s, z0.b, z5.b[0]\n"
"sdot z21.s, z0.b, z4.b[0]\n"
"sdot z25.s, z0.b, z3.b[0]\n"
"sdot z29.s, z0.b, z2.b[0]\n"
- "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z1.b, z7.b[0]\n"
"sdot z14.s, z1.b, z6.b[0]\n"
"sdot z18.s, z1.b, z5.b[0]\n"
"sdot z22.s, z1.b, z4.b[0]\n"
"sdot z26.s, z1.b, z3.b[0]\n"
"sdot z30.s, z1.b, z2.b[0]\n"
- "ld1b { z1.b }, p2/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
"sdot z11.s, z0.b, z7.b[0]\n"
"sdot z15.s, z0.b, z6.b[0]\n"
"sdot z19.s, z0.b, z5.b[0]\n"
"sdot z23.s, z0.b, z4.b[0]\n"
"sdot z27.s, z0.b, z3.b[0]\n"
"sdot z31.s, z0.b, z2.b[0]\n"
- "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z1.b, z7.b[1]\n"
"sdot z12.s, z1.b, z6.b[1]\n"
"sdot z16.s, z1.b, z5.b[1]\n"
"sdot z20.s, z1.b, z4.b[1]\n"
"sdot z24.s, z1.b, z3.b[1]\n"
"sdot z28.s, z1.b, z2.b[1]\n"
- "ld1b { z1.b }, p2/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
"sdot z9.s, z0.b, z7.b[1]\n"
"sdot z13.s, z0.b, z6.b[1]\n"
"sdot z17.s, z0.b, z5.b[1]\n"
"sdot z21.s, z0.b, z4.b[1]\n"
"sdot z25.s, z0.b, z3.b[1]\n"
"sdot z29.s, z0.b, z2.b[1]\n"
- "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z1.b, z7.b[1]\n"
"sdot z14.s, z1.b, z6.b[1]\n"
"sdot z18.s, z1.b, z5.b[1]\n"
"sdot z22.s, z1.b, z4.b[1]\n"
"sdot z26.s, z1.b, z3.b[1]\n"
"sdot z30.s, z1.b, z2.b[1]\n"
- "ld1b { z1.b }, p2/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
"sdot z11.s, z0.b, z7.b[1]\n"
"sdot z15.s, z0.b, z6.b[1]\n"
"sdot z19.s, z0.b, z5.b[1]\n"
"sdot z23.s, z0.b, z4.b[1]\n"
"sdot z27.s, z0.b, z3.b[1]\n"
"sdot z31.s, z0.b, z2.b[1]\n"
- "ld1b { z0.b }, p2/Z, [x10, #-7, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z1.b, z7.b[2]\n"
"sdot z12.s, z1.b, z6.b[2]\n"
"sdot z16.s, z1.b, z5.b[2]\n"
"sdot z20.s, z1.b, z4.b[2]\n"
"sdot z24.s, z1.b, z3.b[2]\n"
"sdot z28.s, z1.b, z2.b[2]\n"
- "ld1b { z1.b }, p2/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
"sdot z9.s, z0.b, z7.b[2]\n"
"sdot z13.s, z0.b, z6.b[2]\n"
"sdot z17.s, z0.b, z5.b[2]\n"
"sdot z21.s, z0.b, z4.b[2]\n"
"sdot z25.s, z0.b, z3.b[2]\n"
"sdot z29.s, z0.b, z2.b[2]\n"
- "ld1b { z0.b }, p2/Z, [x10, #-5, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z1.b, z7.b[2]\n"
"sdot z14.s, z1.b, z6.b[2]\n"
"sdot z18.s, z1.b, z5.b[2]\n"
"sdot z22.s, z1.b, z4.b[2]\n"
"sdot z26.s, z1.b, z3.b[2]\n"
"sdot z30.s, z1.b, z2.b[2]\n"
- "ld1b { z1.b }, p2/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
"sdot z11.s, z0.b, z7.b[2]\n"
"sdot z15.s, z0.b, z6.b[2]\n"
"sdot z19.s, z0.b, z5.b[2]\n"
"sdot z23.s, z0.b, z4.b[2]\n"
"sdot z27.s, z0.b, z3.b[2]\n"
"sdot z31.s, z0.b, z2.b[2]\n"
- "ld1b { z0.b }, p2/Z, [x10, #-3, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z1.b, z7.b[3]\n"
"sdot z12.s, z1.b, z6.b[3]\n"
"sdot z16.s, z1.b, z5.b[3]\n"
"sdot z20.s, z1.b, z4.b[3]\n"
"sdot z24.s, z1.b, z3.b[3]\n"
"sdot z28.s, z1.b, z2.b[3]\n"
- "ld1b { z1.b }, p2/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
"sdot z9.s, z0.b, z7.b[3]\n"
"sdot z13.s, z0.b, z6.b[3]\n"
"sdot z17.s, z0.b, z5.b[3]\n"
"sdot z21.s, z0.b, z4.b[3]\n"
"sdot z25.s, z0.b, z3.b[3]\n"
"sdot z29.s, z0.b, z2.b[3]\n"
- "ld1b { z0.b }, p2/Z, [x10, #-1, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z1.b, z7.b[3]\n"
"sdot z14.s, z1.b, z6.b[3]\n"
"sdot z18.s, z1.b, z5.b[3]\n"
@@ -2162,30 +2161,30 @@ void sve_hybrid_s8qs_dot_6x4VL (
"bgt 72b\n"
"73:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p2/Z, [x10]\n"
- "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
"ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z7.b, z0.b[0]\n"
"sdot z12.s, z7.b, z1.b[0]\n"
- "sdot z9.s, z6.b, z0.b[0]\n"
- "sdot z13.s, z6.b, z1.b[0]\n"
"sdot z16.s, z7.b, z2.b[0]\n"
"sdot z20.s, z7.b, z3.b[0]\n"
"sdot z24.s, z7.b, z4.b[0]\n"
"sdot z28.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[0]\n"
+ "sdot z13.s, z6.b, z1.b[0]\n"
"sdot z17.s, z6.b, z2.b[0]\n"
"sdot z21.s, z6.b, z3.b[0]\n"
"sdot z25.s, z6.b, z4.b[0]\n"
"sdot z29.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z7.b, z0.b[0]\n"
"sdot z14.s, z7.b, z1.b[0]\n"
"sdot z18.s, z7.b, z2.b[0]\n"
@@ -2199,25 +2198,25 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z27.s, z6.b, z4.b[0]\n"
"sdot z31.s, z6.b, z5.b[0]\n"
"ble 74f\n"
- "ld1b { z7.b }, p2/Z, [x10]\n"
- "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z7.b, z0.b[1]\n"
"sdot z12.s, z7.b, z1.b[1]\n"
"sdot z16.s, z7.b, z2.b[1]\n"
"sdot z20.s, z7.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
"sdot z24.s, z7.b, z4.b[1]\n"
"sdot z28.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z13.s, z6.b, z1.b[1]\n"
"sdot z17.s, z6.b, z2.b[1]\n"
"sdot z21.s, z6.b, z3.b[1]\n"
"sdot z25.s, z6.b, z4.b[1]\n"
"sdot z29.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z7.b, z0.b[1]\n"
- "addvl x10, x10, #4\n"
"sdot z14.s, z7.b, z1.b[1]\n"
"sdot z18.s, z7.b, z2.b[1]\n"
"sdot z22.s, z7.b, z3.b[1]\n"
@@ -2230,25 +2229,25 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z27.s, z6.b, z4.b[1]\n"
"sdot z31.s, z6.b, z5.b[1]\n"
"ble 74f\n"
- "ld1b { z7.b }, p2/Z, [x10]\n"
- "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z7.b, z0.b[2]\n"
"sdot z12.s, z7.b, z1.b[2]\n"
"sdot z16.s, z7.b, z2.b[2]\n"
"sdot z20.s, z7.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
"sdot z24.s, z7.b, z4.b[2]\n"
"sdot z28.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n"
"sdot z13.s, z6.b, z1.b[2]\n"
"sdot z17.s, z6.b, z2.b[2]\n"
"sdot z21.s, z6.b, z3.b[2]\n"
"sdot z25.s, z6.b, z4.b[2]\n"
"sdot z29.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z7.b, z0.b[2]\n"
- "addvl x10, x10, #4\n"
"sdot z14.s, z7.b, z1.b[2]\n"
"sdot z18.s, z7.b, z2.b[2]\n"
"sdot z22.s, z7.b, z3.b[2]\n"
@@ -2261,23 +2260,23 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z27.s, z6.b, z4.b[2]\n"
"sdot z31.s, z6.b, z5.b[2]\n"
"ble 74f\n"
- "ld1b { z7.b }, p2/Z, [x10]\n"
- "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
"sdot z8.s, z7.b, z0.b[3]\n"
"sdot z12.s, z7.b, z1.b[3]\n"
"sdot z16.s, z7.b, z2.b[3]\n"
"sdot z20.s, z7.b, z3.b[3]\n"
"sdot z24.s, z7.b, z4.b[3]\n"
"sdot z28.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z6.b, z0.b[3]\n"
"sdot z13.s, z6.b, z1.b[3]\n"
"sdot z17.s, z6.b, z2.b[3]\n"
"sdot z21.s, z6.b, z3.b[3]\n"
"sdot z25.s, z6.b, z4.b[3]\n"
"sdot z29.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z7.b, z0.b[3]\n"
"sdot z14.s, z7.b, z1.b[3]\n"
"sdot z18.s, z7.b, z2.b[3]\n"
@@ -2296,21 +2295,21 @@ void sve_hybrid_s8qs_dot_6x4VL (
"cmp x28, x20\n"
"bne 69b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x11, x20\n"
+ "add x25, x26, x20\n"
"ld1w { z3.s }, p2/Z, [x14]\n"
"ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
"ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
- "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
- "add x26, x9, x20\n"
- "add x25, x26, x20\n"
- "add z8.s, z8.s, z3.s\n"
- "add z12.s, z12.s, z3.s\n"
"add x24, x25, x20\n"
"add x23, x24, x20\n"
+ "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x22, x23, x20\n"
+ "add z8.s, z8.s, z3.s\n"
"add z9.s, z9.s, z2.s\n"
"add z10.s, z10.s, z1.s\n"
- "add x22, x23, x20\n"
"add z11.s, z11.s, z0.s\n"
+ "addvl x14, x14, #4\n"
+ "add z12.s, z12.s, z3.s\n"
"add z13.s, z13.s, z2.s\n"
"add z14.s, z14.s, z1.s\n"
"add z15.s, z15.s, z0.s\n"
@@ -2388,11 +2387,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z8.s, z8.s, z7.s\n"
- "and z7.d, z12.d, z0.d\n"
"sqadd z9.s, z9.s, z6.s\n"
- "and z6.d, z13.d, z1.d\n"
"sqadd z10.s, z10.s, z5.s\n"
"sqadd z11.s, z11.s, z4.s\n"
+ "and z7.d, z12.d, z0.d\n"
+ "and z6.d, z13.d, z1.d\n"
"and z5.d, z14.d, z2.d\n"
"and z4.d, z15.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
@@ -2400,11 +2399,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z12.s, z12.s, z7.s\n"
- "and z7.d, z16.d, z0.d\n"
"sqadd z13.s, z13.s, z6.s\n"
- "and z6.d, z17.d, z1.d\n"
"sqadd z14.s, z14.s, z5.s\n"
"sqadd z15.s, z15.s, z4.s\n"
+ "and z7.d, z16.d, z0.d\n"
+ "and z6.d, z17.d, z1.d\n"
"and z5.d, z18.d, z2.d\n"
"and z4.d, z19.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
@@ -2412,11 +2411,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z16.s, z16.s, z7.s\n"
- "and z7.d, z20.d, z0.d\n"
"sqadd z17.s, z17.s, z6.s\n"
- "and z6.d, z21.d, z1.d\n"
"sqadd z18.s, z18.s, z5.s\n"
"sqadd z19.s, z19.s, z4.s\n"
+ "and z7.d, z20.d, z0.d\n"
+ "and z6.d, z21.d, z1.d\n"
"and z5.d, z22.d, z2.d\n"
"and z4.d, z23.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
@@ -2424,11 +2423,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z20.s, z20.s, z7.s\n"
- "and z7.d, z24.d, z0.d\n"
"sqadd z21.s, z21.s, z6.s\n"
- "and z6.d, z25.d, z1.d\n"
"sqadd z22.s, z22.s, z5.s\n"
"sqadd z23.s, z23.s, z4.s\n"
+ "and z7.d, z24.d, z0.d\n"
+ "and z6.d, z25.d, z1.d\n"
"and z5.d, z26.d, z2.d\n"
"and z4.d, z27.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
@@ -2436,11 +2435,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z24.s, z24.s, z7.s\n"
- "and z7.d, z28.d, z0.d\n"
"sqadd z25.s, z25.s, z6.s\n"
- "and z6.d, z29.d, z1.d\n"
"sqadd z26.s, z26.s, z5.s\n"
"sqadd z27.s, z27.s, z4.s\n"
+ "and z7.d, z28.d, z0.d\n"
+ "and z6.d, z29.d, z1.d\n"
"and z5.d, z30.d, z2.d\n"
"and z4.d, z31.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
@@ -2453,59 +2452,59 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z31.s, z31.s, z4.s\n"
"77:" // Height 6: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
"ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z13.s, z13.s, z4.s\n"
+ "add z14.s, z14.s, z4.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z12.s, z12.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z13.s, z13.s, z4.s\n"
- "add z14.s, z14.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
- "add z15.s, z15.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "add z26.s, z26.s, z4.s\n"
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
- "add z21.s, z21.s, z4.s\n"
- "add z22.s, z22.s, z4.s\n"
+ "add z27.s, z27.s, z4.s\n"
+ "add z28.s, z28.s, z4.s\n"
".inst 0x4482883d // srshl z29.s, p2/M, z29.s, z1.s\n"
".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z29.s, z29.s, z4.s\n"
+ "add z30.s, z30.s, z4.s\n"
".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z4.s\n"
- "add z28.s, z28.s, z4.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z29.s, z29.s, z4.s\n"
- "add z30.s, z30.s, z4.s\n"
- "ld1rw { z2.s }, p2/Z, [x20]\n"
"add z31.s, z31.s, z4.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
"smin z8.s, p2/M, z8.s, z0.s\n"
"smin z9.s, p2/M, z9.s, z0.s\n"
"smin z10.s, p2/M, z10.s, z0.s\n"
@@ -2530,58 +2529,58 @@ void sve_hybrid_s8qs_dot_6x4VL (
"smin z29.s, p2/M, z29.s, z0.s\n"
"smin z30.s, p2/M, z30.s, z0.s\n"
"smin z31.s, p2/M, z31.s, z0.s\n"
- "smax z8.s, p2/M, z8.s, z2.s\n"
- "smax z9.s, p2/M, z9.s, z2.s\n"
- "smax z10.s, p2/M, z10.s, z2.s\n"
- "smax z11.s, p2/M, z11.s, z2.s\n"
- "smax z12.s, p2/M, z12.s, z2.s\n"
- "smax z13.s, p2/M, z13.s, z2.s\n"
- "smax z14.s, p2/M, z14.s, z2.s\n"
- "smax z15.s, p2/M, z15.s, z2.s\n"
+ "smax z8.s, p2/M, z8.s, z1.s\n"
+ "smax z9.s, p2/M, z9.s, z1.s\n"
+ "smax z10.s, p2/M, z10.s, z1.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z16.s, p2/M, z16.s, z2.s\n"
- "smax z17.s, p2/M, z17.s, z2.s\n"
- "uzp1 z1.h, z10.h, z11.h\n"
- "smax z18.s, p2/M, z18.s, z2.s\n"
- "smax z19.s, p2/M, z19.s, z2.s\n"
+ "smax z11.s, p2/M, z11.s, z1.s\n"
+ "smax z12.s, p2/M, z12.s, z1.s\n"
+ "uzp1 z0.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z0.b\n"
+ "smax z13.s, p2/M, z13.s, z1.s\n"
+ "smax z14.s, p2/M, z14.s, z1.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
- "smax z20.s, p2/M, z20.s, z2.s\n"
- "smax z21.s, p2/M, z21.s, z2.s\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "smax z15.s, p2/M, z15.s, z1.s\n"
+ "smax z16.s, p2/M, z16.s, z1.s\n"
"uzp1 z0.h, z14.h, z15.h\n"
- "smax z22.s, p2/M, z22.s, z2.s\n"
- "smax z23.s, p2/M, z23.s, z2.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z8.b, z8.b, z1.b\n"
- "smax z24.s, p2/M, z24.s, z2.s\n"
- "smax z25.s, p2/M, z25.s, z2.s\n"
- "uzp1 z18.h, z18.h, z19.h\n"
- "smax z26.s, p2/M, z26.s, z2.s\n"
- "smax z27.s, p2/M, z27.s, z2.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
"uzp1 z12.b, z12.b, z0.b\n"
- "smax z28.s, p2/M, z28.s, z2.s\n"
- "smax z29.s, p2/M, z29.s, z2.s\n"
- "uzp1 z17.h, z22.h, z23.h\n"
- "st1b { z8.b }, p1, [x9]\n"
- "smax z30.s, p2/M, z30.s, z2.s\n"
- "smax z31.s, p2/M, z31.s, z2.s\n"
- "uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "uzp1 z18.h, z26.h, z27.h\n"
+ "smax z17.s, p2/M, z17.s, z1.s\n"
+ "smax z18.s, p2/M, z18.s, z1.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
"st1b { z12.b }, p1, [x26]\n"
- "addvl x9, x9, #1\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z20.b, z20.b, z17.b\n"
- "uzp1 z17.h, z30.h, z31.h\n"
+ "smax z19.s, p2/M, z19.s, z1.s\n"
+ "smax z20.s, p2/M, z20.s, z1.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z1.s\n"
+ "smax z22.s, p2/M, z22.s, z1.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
"st1b { z16.b }, p1, [x25]\n"
- "uzp1 z24.b, z24.b, z18.b\n"
- "uzp1 z28.b, z28.b, z17.b\n"
+ "smax z23.s, p2/M, z23.s, z1.s\n"
+ "smax z24.s, p2/M, z24.s, z1.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z1.s\n"
+ "smax z26.s, p2/M, z26.s, z1.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
"st1b { z20.b }, p1, [x24]\n"
+ "smax z27.s, p2/M, z27.s, z1.s\n"
+ "smax z28.s, p2/M, z28.s, z1.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "smax z29.s, p2/M, z29.s, z1.s\n"
+ "smax z30.s, p2/M, z30.s, z1.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
"st1b { z24.b }, p1, [x23]\n"
+ "smax z31.s, p2/M, z31.s, z1.s\n"
+ "uzp1 z16.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z16.b\n"
"st1b { z28.b }, p1, [x22]\n"
+ "addvl x11, x11, #1\n"
"78:" // Height 6: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 67b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 80f\n"
@@ -2595,8 +2594,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
index 73e4bd32b9..11ff5b2f15 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
@@ -70,7 +70,7 @@ public:
return false;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 8, 8> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 8> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
index 759e3e2f3d..1a1201310e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
@@ -47,18 +47,18 @@ void sve_hybrid_s8qs_mmla_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -98,26 +98,26 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"3:" // Height 1: setup done
"mov x28, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -133,87 +133,87 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"ble 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z19.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "trn1 z18.d, z19.d, z22.d\n"
- "trn2 z19.d, z19.d, z22.d\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n"
".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n"
".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z16.b }, p2/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45119a68 // smmla z8.s, z19.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45109a6c // smmla z12.s, z19.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45119a69 // smmla z9.s, z19.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45109a6d // smmla z13.s, z19.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45119a6a // smmla z10.s, z19.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45109a6e // smmla z14.s, z19.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45119a6b // smmla z11.s, z19.b, z17.b\n"
- ".inst 0x45109a6f // smmla z15.s, z19.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n"
+ ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
"bgt 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
"trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
- "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n"
".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z19.d\n"
- ".inst 0x45079a49 // smmla z9.s, z18.b, z7.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
- "ld1b { z26.b }, p2/Z, [x10, #6, MUL VL]\n"
".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x451a9a4b // smmla z11.s, z18.b, z26.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "addvl x9, x9, #8\n"
"ble 9f\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n"
".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n"
".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
+ "addvl x9, x9, #8\n"
"9:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -227,10 +227,10 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"uzp1 z11.d, z11.d, z15.d\n"
"ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
"ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
"mov z15.d, z8.d\n"
- "add z9.s, z9.s, z18.s\n"
"add z15.s, z15.s, z19.s\n"
+ "addvl x14, x14, #4\n"
+ "add z9.s, z9.s, z18.s\n"
"add z10.s, z10.s, z17.s\n"
"add z11.s, z11.s, z16.s\n"
"tbz %x[flags], #4, 10f\n"
@@ -276,61 +276,61 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"sqadd z11.s, z11.s, z16.s\n"
"12:" // Height 1: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
"ld1rw { z17.s }, p2/Z, [x20]\n"
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "add z15.s, z15.s, z17.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z17.s\n"
+ "add z10.s, z10.s, z17.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
- "add z15.s, z15.s, z17.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z9.s, z9.s, z17.s\n"
- "add z10.s, z10.s, z17.s\n"
- "ld1rw { z28.s }, p2/Z, [x20]\n"
"add z11.s, z11.s, z17.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z31.s }, p2/Z, [x20]\n"
"smin z15.s, p2/M, z15.s, z16.s\n"
"smin z9.s, p2/M, z9.s, z16.s\n"
"smin z10.s, p2/M, z10.s, z16.s\n"
"smin z11.s, p2/M, z11.s, z16.s\n"
- "smax z15.s, p2/M, z15.s, z28.s\n"
- "smax z9.s, p2/M, z9.s, z28.s\n"
- "smax z10.s, p2/M, z10.s, z28.s\n"
- "smax z11.s, p2/M, z11.s, z28.s\n"
+ "smax z15.s, p2/M, z15.s, z31.s\n"
+ "smax z9.s, p2/M, z9.s, z31.s\n"
+ "smax z10.s, p2/M, z10.s, z31.s\n"
"uzp1 z15.h, z15.h, z9.h\n"
+ "smax z11.s, p2/M, z11.s, z31.s\n"
"uzp1 z16.h, z10.h, z11.h\n"
"uzp1 z15.b, z15.b, z16.b\n"
- "st1b { z15.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
+ "st1b { z15.b }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
"13:" // Height 1: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"15:" // Height 2: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"16:" // Height 2: setup done
"mov x28, #0x0\n"
"17:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 18f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -349,90 +349,90 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"ble 21f\n"
"20:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z19.b }, p2/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z18.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z16.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "trn1 z2.d, z18.d, z16.d\n"
- "trn2 z18.d, z18.d, z16.d\n"
- ".inst 0x45119848 // smmla z8.s, z2.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4513984c // smmla z12.s, z2.b, z19.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45119849 // smmla z9.s, z2.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4510984d // smmla z13.s, z2.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4511984a // smmla z10.s, z2.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4510984e // smmla z14.s, z2.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x4511984b // smmla z11.s, z2.b, z17.b\n"
- ".inst 0x4510984f // smmla z15.s, z2.b, z16.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z16.b }, p2/Z, [x10, #-7, MUL VL]\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #-6, MUL VL]\n"
".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #-5, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #-4, MUL VL]\n"
".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #-3, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #-2, MUL VL]\n"
".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #-1, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n"
+ ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"bgt 20b\n"
"21:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
"ld1rqb { z19.b }, p0/Z, [x25]\n"
"trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n"
".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n"
".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "addvl x9, x9, #8\n"
"ble 22f\n"
- "ld1b { z17.b }, p2/Z, [x10]\n"
- "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n"
".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n"
".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n"
".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
+ "addvl x9, x9, #8\n"
"22:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -449,16 +449,16 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
"ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "add x26, x11, x20\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x26, x9, x20\n"
+ "addvl x14, x14, #4\n"
"mov z15.d, z20.d\n"
+ "add z15.s, z15.s, z19.s\n"
"add z12.s, z12.s, z18.s\n"
"add z13.s, z13.s, z17.s\n"
- "add z8.s, z8.s, z19.s\n"
- "add z15.s, z15.s, z19.s\n"
"add z14.s, z14.s, z16.s\n"
+ "add z8.s, z8.s, z19.s\n"
"add z9.s, z9.s, z18.s\n"
"add z10.s, z10.s, z17.s\n"
"add z11.s, z11.s, z16.s\n"
@@ -495,99 +495,99 @@ void sve_hybrid_s8qs_mmla_6x4VL (
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
"tbz %x[flags], #5, 25f\n"
- "and z18.d, z15.d, z0.d\n"
- "and z19.d, z12.d, z1.d\n"
+ "and z19.d, z15.d, z0.d\n"
+ "and z18.d, z12.d, z1.d\n"
"and z17.d, z13.d, z2.d\n"
"and z16.d, z14.d, z3.d\n"
- "asr z18.s, z18.s, #0x1f\n"
"asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z15.s, z15.s, z18.s\n"
- "and z18.d, z8.d, z0.d\n"
- "sqadd z12.s, z12.s, z19.s\n"
- "and z19.d, z9.d, z1.d\n"
+ "sqadd z15.s, z15.s, z19.s\n"
+ "sqadd z12.s, z12.s, z18.s\n"
"sqadd z13.s, z13.s, z17.s\n"
"sqadd z14.s, z14.s, z16.s\n"
+ "and z18.d, z8.d, z0.d\n"
+ "and z24.d, z9.d, z1.d\n"
"and z17.d, z10.d, z2.d\n"
"and z16.d, z11.d, z3.d\n"
"asr z18.s, z18.s, #0x1f\n"
- "asr z19.s, z19.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
"sqadd z8.s, z8.s, z18.s\n"
- "sqadd z9.s, z9.s, z19.s\n"
+ "sqadd z9.s, z9.s, z24.s\n"
"sqadd z10.s, z10.s, z17.s\n"
"sqadd z11.s, z11.s, z16.s\n"
"25:" // Height 2: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
- "ld1rw { z18.s }, p2/Z, [x20]\n"
+ "add z15.s, z15.s, z17.s\n"
".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "add z12.s, z12.s, z17.s\n"
+ "add z13.s, z13.s, z17.s\n"
".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z14.s, z14.s, z17.s\n"
+ "add z8.s, z8.s, z17.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z15.s, z15.s, z18.s\n"
+ "add z9.s, z9.s, z17.s\n"
+ "add z10.s, z10.s, z17.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z12.s, z12.s, z18.s\n"
- "add z13.s, z13.s, z18.s\n"
- "ld1rw { z17.s }, p2/Z, [x20]\n"
- "add z14.s, z14.s, z18.s\n"
- "add z8.s, z8.s, z18.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z9.s, z9.s, z18.s\n"
- "add z10.s, z10.s, z18.s\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
- "add z11.s, z11.s, z18.s\n"
- "smin z15.s, p2/M, z15.s, z17.s\n"
- "smin z12.s, p2/M, z12.s, z17.s\n"
- "smin z13.s, p2/M, z13.s, z17.s\n"
- "smin z14.s, p2/M, z14.s, z17.s\n"
- "smin z8.s, p2/M, z8.s, z17.s\n"
- "smin z9.s, p2/M, z9.s, z17.s\n"
- "smin z10.s, p2/M, z10.s, z17.s\n"
- "smin z11.s, p2/M, z11.s, z17.s\n"
- "smax z15.s, p2/M, z15.s, z16.s\n"
- "smax z12.s, p2/M, z12.s, z16.s\n"
- "smax z13.s, p2/M, z13.s, z16.s\n"
- "smax z14.s, p2/M, z14.s, z16.s\n"
- "smax z8.s, p2/M, z8.s, z16.s\n"
- "smax z9.s, p2/M, z9.s, z16.s\n"
- "smax z10.s, p2/M, z10.s, z16.s\n"
- "smax z11.s, p2/M, z11.s, z16.s\n"
+ "add z11.s, z11.s, z17.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "smin z15.s, p2/M, z15.s, z16.s\n"
+ "smin z12.s, p2/M, z12.s, z16.s\n"
+ "smin z13.s, p2/M, z13.s, z16.s\n"
+ "smin z14.s, p2/M, z14.s, z16.s\n"
+ "smin z8.s, p2/M, z8.s, z16.s\n"
+ "smin z9.s, p2/M, z9.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z16.s\n"
+ "smin z11.s, p2/M, z11.s, z16.s\n"
+ "smax z15.s, p2/M, z15.s, z17.s\n"
+ "smax z12.s, p2/M, z12.s, z17.s\n"
+ "smax z13.s, p2/M, z13.s, z17.s\n"
"uzp1 z15.h, z15.h, z12.h\n"
- "uzp1 z17.h, z13.h, z14.h\n"
+ "smax z14.s, p2/M, z14.s, z17.s\n"
+ "smax z8.s, p2/M, z8.s, z17.s\n"
+ "uzp1 z16.h, z13.h, z14.h\n"
+ "uzp1 z15.b, z15.b, z16.b\n"
+ "smax z9.s, p2/M, z9.s, z17.s\n"
+ "smax z10.s, p2/M, z10.s, z17.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
+ "st1b { z15.b }, p1, [x11]\n"
+ "smax z11.s, p2/M, z11.s, z17.s\n"
"uzp1 z16.h, z10.h, z11.h\n"
- "uzp1 z15.b, z15.b, z17.b\n"
"uzp1 z8.b, z8.b, z16.b\n"
- "st1b { z15.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
"st1b { z8.b }, p1, [x26]\n"
+ "addvl x11, x11, #1\n"
"26:" // Height 2: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"28:" // Height 3: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"mov z16.s, #0x0\n"
@@ -602,8 +602,8 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"mov x28, #0x0\n"
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -625,125 +625,125 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"ble 34f\n"
"33:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z28.b }, p2/Z, [x10]\n"
- "ld1b { z25.b }, p2/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z30.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z24.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqb { z29.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
"trn1 z27.d, z30.d, z24.d\n"
"trn2 z30.d, z30.d, z24.d\n"
- "trn1 z26.d, z29.d, z31.d\n"
- "trn2 z29.d, z29.d, z31.d\n"
- ".inst 0x451c9b68 // smmla z8.s, z27.b, z28.b\n"
- ".inst 0x45199b6c // smmla z12.s, z27.b, z25.b\n"
- ".inst 0x451c9b50 // smmla z16.s, z26.b, z28.b\n"
- "ld1b { z4.b }, p2/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45199b54 // smmla z20.s, z26.b, z25.b\n"
- "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45049b69 // smmla z9.s, z27.b, z4.b\n"
- ".inst 0x45049b51 // smmla z17.s, z26.b, z4.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n"
- ".inst 0x451c9b6d // smmla z13.s, z27.b, z28.b\n"
- ".inst 0x451c9b55 // smmla z21.s, z26.b, z28.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "cmp x27, #0x10\n"
".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x10, #-7, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
- ".inst 0x45199bb0 // smmla z16.s, z29.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
- ".inst 0x45189bb4 // smmla z20.s, z29.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
- ".inst 0x45199bb1 // smmla z17.s, z29.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
- ".inst 0x45189bb5 // smmla z21.s, z29.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
- ".inst 0x45199bb2 // smmla z18.s, z29.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
- ".inst 0x45189bb6 // smmla z22.s, z29.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
- ".inst 0x45199bb3 // smmla z19.s, z29.b, z25.b\n"
+ ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
- ".inst 0x45189bb7 // smmla z23.s, z29.b, z24.b\n"
+ ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
"bgt 33b\n"
"34:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p2/Z, [x10]\n"
- "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
"ld1rqb { z24.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
"trn1 z27.d, z1.d, z24.d\n"
"trn2 z1.d, z1.d, z24.d\n"
- "trn1 z26.d, z3.d, z29.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
- ".inst 0x451c9b6c // smmla z12.s, z27.b, z28.b\n"
- "trn2 z3.d, z3.d, z29.d\n"
".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n"
- ".inst 0x451c9b54 // smmla z20.s, z26.b, z28.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
"ble 35f\n"
- "ld1b { z25.b }, p2/Z, [x10]\n"
- "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n"
".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n"
".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n"
".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n"
".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n"
".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n"
".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n"
@@ -764,20 +764,20 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
"ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "add x26, x11, x20\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x26, x9, x20\n"
+ "add x25, x26, x20\n"
+ "addvl x14, x14, #4\n"
"uzp1 z16.d, z16.d, z20.d\n"
"uzp1 z17.d, z17.d, z21.d\n"
- "add x25, x26, x20\n"
"uzp1 z18.d, z18.d, z22.d\n"
"uzp1 z19.d, z19.d, z23.d\n"
"mov z23.d, z28.d\n"
+ "add z23.s, z23.s, z27.s\n"
"add z12.s, z12.s, z26.s\n"
"add z13.s, z13.s, z25.s\n"
"add z14.s, z14.s, z24.s\n"
- "add z23.s, z23.s, z27.s\n"
"add z8.s, z8.s, z27.s\n"
"add z9.s, z9.s, z26.s\n"
"add z10.s, z10.s, z25.s\n"
@@ -832,11 +832,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z21.s, z21.s, #0x1f\n"
"asr z20.s, z20.s, #0x1f\n"
"sqadd z23.s, z23.s, z24.s\n"
- "and z24.d, z8.d, z0.d\n"
"sqadd z12.s, z12.s, z22.s\n"
- "and z22.d, z9.d, z1.d\n"
"sqadd z13.s, z13.s, z21.s\n"
"sqadd z14.s, z14.s, z20.s\n"
+ "and z24.d, z8.d, z0.d\n"
+ "and z22.d, z9.d, z1.d\n"
"and z21.d, z10.d, z2.d\n"
"and z20.d, z11.d, z3.d\n"
"asr z24.s, z24.s, #0x1f\n"
@@ -844,11 +844,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z21.s, z21.s, #0x1f\n"
"asr z20.s, z20.s, #0x1f\n"
"sqadd z8.s, z8.s, z24.s\n"
- "and z24.d, z16.d, z0.d\n"
"sqadd z9.s, z9.s, z22.s\n"
- "and z22.d, z17.d, z1.d\n"
"sqadd z10.s, z10.s, z21.s\n"
"sqadd z11.s, z11.s, z20.s\n"
+ "and z24.d, z16.d, z0.d\n"
+ "and z22.d, z17.d, z1.d\n"
"and z21.d, z18.d, z2.d\n"
"and z20.d, z19.d, z3.d\n"
"asr z24.s, z24.s, #0x1f\n"
@@ -861,93 +861,93 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"sqadd z19.s, z19.s, z20.s\n"
"38:" // Height 3: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "ld1rw { z22.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z21.s\n"
".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "add z12.s, z12.s, z21.s\n"
+ "add z13.s, z13.s, z21.s\n"
".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z14.s, z14.s, z21.s\n"
+ "add z8.s, z8.s, z21.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z23.s, z23.s, z22.s\n"
+ "add z9.s, z9.s, z21.s\n"
+ "add z10.s, z10.s, z21.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z12.s, z12.s, z22.s\n"
- "add z13.s, z13.s, z22.s\n"
+ "add z11.s, z11.s, z21.s\n"
+ "add z16.s, z16.s, z21.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z14.s, z14.s, z22.s\n"
- "add z8.s, z8.s, z22.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z21.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z9.s, z9.s, z22.s\n"
- "add z10.s, z10.s, z22.s\n"
- "ld1rw { z21.s }, p2/Z, [x20]\n"
- "add z11.s, z11.s, z22.s\n"
- "add z16.s, z16.s, z22.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z22.s\n"
- "add z18.s, z18.s, z22.s\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z22.s\n"
- "smin z23.s, p2/M, z23.s, z21.s\n"
- "smin z12.s, p2/M, z12.s, z21.s\n"
- "smin z13.s, p2/M, z13.s, z21.s\n"
- "smin z14.s, p2/M, z14.s, z21.s\n"
- "smin z8.s, p2/M, z8.s, z21.s\n"
- "smin z9.s, p2/M, z9.s, z21.s\n"
- "smin z10.s, p2/M, z10.s, z21.s\n"
- "smin z11.s, p2/M, z11.s, z21.s\n"
- "smin z16.s, p2/M, z16.s, z21.s\n"
- "smin z17.s, p2/M, z17.s, z21.s\n"
- "smin z18.s, p2/M, z18.s, z21.s\n"
- "smin z19.s, p2/M, z19.s, z21.s\n"
- "smax z23.s, p2/M, z23.s, z20.s\n"
- "smax z12.s, p2/M, z12.s, z20.s\n"
- "smax z13.s, p2/M, z13.s, z20.s\n"
- "smax z14.s, p2/M, z14.s, z20.s\n"
- "smax z8.s, p2/M, z8.s, z20.s\n"
- "smax z9.s, p2/M, z9.s, z20.s\n"
- "smax z10.s, p2/M, z10.s, z20.s\n"
- "smax z11.s, p2/M, z11.s, z20.s\n"
+ "add z19.s, z19.s, z21.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z20.s\n"
+ "smin z12.s, p2/M, z12.s, z20.s\n"
+ "smin z13.s, p2/M, z13.s, z20.s\n"
+ "smin z14.s, p2/M, z14.s, z20.s\n"
+ "smin z8.s, p2/M, z8.s, z20.s\n"
+ "smin z9.s, p2/M, z9.s, z20.s\n"
+ "smin z10.s, p2/M, z10.s, z20.s\n"
+ "smin z11.s, p2/M, z11.s, z20.s\n"
+ "smin z16.s, p2/M, z16.s, z20.s\n"
+ "smin z17.s, p2/M, z17.s, z20.s\n"
+ "smin z18.s, p2/M, z18.s, z20.s\n"
+ "smin z19.s, p2/M, z19.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z21.s\n"
+ "smax z12.s, p2/M, z12.s, z21.s\n"
+ "smax z13.s, p2/M, z13.s, z21.s\n"
"uzp1 z23.h, z23.h, z12.h\n"
- "smax z16.s, p2/M, z16.s, z20.s\n"
- "smax z17.s, p2/M, z17.s, z20.s\n"
- "uzp1 z21.h, z13.h, z14.h\n"
- "smax z18.s, p2/M, z18.s, z20.s\n"
- "smax z19.s, p2/M, z19.s, z20.s\n"
+ "smax z14.s, p2/M, z14.s, z21.s\n"
+ "smax z8.s, p2/M, z8.s, z21.s\n"
+ "uzp1 z20.h, z13.h, z14.h\n"
+ "uzp1 z23.b, z23.b, z20.b\n"
+ "smax z9.s, p2/M, z9.s, z21.s\n"
+ "smax z10.s, p2/M, z10.s, z21.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
+ "st1b { z23.b }, p1, [x11]\n"
+ "smax z11.s, p2/M, z11.s, z21.s\n"
+ "smax z16.s, p2/M, z16.s, z21.s\n"
"uzp1 z20.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z21.s\n"
+ "smax z18.s, p2/M, z18.s, z21.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z23.b, z23.b, z21.b\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z21.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
- "uzp1 z8.b, z8.b, z20.b\n"
- "st1b { z23.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z8.b }, p1, [x26]\n"
"st1b { z16.b }, p1, [x25]\n"
+ "addvl x11, x11, #1\n"
"39:" // Height 3: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"41:" // Height 4: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"mov z16.s, #0x0\n"
@@ -962,8 +962,8 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"mov x28, #0x0\n"
"43:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 44f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -988,128 +988,128 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"ble 47f\n"
"46:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z31.b }, p2/Z, [x10]\n"
- "ld1b { z30.b }, p2/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z29.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
"ld1rqb { z28.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- "trn1 z27.d, z29.d, z25.d\n"
- "trn2 z29.d, z29.d, z25.d\n"
- "trn1 z26.d, z28.d, z24.d\n"
- "trn2 z28.d, z28.d, z24.d\n"
- ".inst 0x451f9b68 // smmla z8.s, z27.b, z31.b\n"
- ".inst 0x451e9b6c // smmla z12.s, z27.b, z30.b\n"
- ".inst 0x451f9b50 // smmla z16.s, z26.b, z31.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n"
- ".inst 0x451e9b54 // smmla z20.s, z26.b, z30.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199ba8 // smmla z8.s, z29.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n"
".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n"
".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n"
".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n"
".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n"
".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
- ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n"
".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45199ba8 // smmla z8.s, z29.b, z25.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
- ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
"bgt 46b\n"
"47:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z29.b }, p2/Z, [x10]\n"
- "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
- "trn1 z27.d, z1.d, z25.d\n"
- "trn2 z1.d, z1.d, z25.d\n"
- "trn1 z26.d, z3.d, z24.d\n"
- ".inst 0x451d9b68 // smmla z8.s, z27.b, z29.b\n"
- ".inst 0x451c9b6c // smmla z12.s, z27.b, z28.b\n"
- "trn2 z3.d, z3.d, z24.d\n"
- ".inst 0x451d9b50 // smmla z16.s, z26.b, z29.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n"
- ".inst 0x451c9b54 // smmla z20.s, z26.b, z28.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199b88 // smmla z8.s, z28.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b8c // smmla z12.s, z28.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199b89 // smmla z9.s, z28.b, z25.b\n"
".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45189b8d // smmla z13.s, z28.b, z24.b\n"
".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x45199b8a // smmla z10.s, z28.b, z25.b\n"
".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45189b8e // smmla z14.s, z28.b, z24.b\n"
".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45199b8b // smmla z11.s, z28.b, z25.b\n"
".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
- ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ ".inst 0x45189b8f // smmla z15.s, z28.b, z24.b\n"
".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
"ble 48f\n"
- "ld1b { z25.b }, p2/Z, [x10]\n"
- "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n"
".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n"
".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n"
".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n"
".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n"
".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n"
".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n"
@@ -1130,25 +1130,25 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
"ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "add x26, x11, x20\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x26, x9, x20\n"
+ "add x25, x26, x20\n"
+ "add x24, x25, x20\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x25, x26, x20\n"
+ "addvl x14, x14, #4\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "add x24, x25, x20\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
"mov z23.d, z28.d\n"
+ "add z23.s, z23.s, z27.s\n"
"add z12.s, z12.s, z26.s\n"
"add z13.s, z13.s, z25.s\n"
"add z14.s, z14.s, z24.s\n"
- "add z23.s, z23.s, z27.s\n"
"add z8.s, z8.s, z27.s\n"
"add z9.s, z9.s, z26.s\n"
"add z10.s, z10.s, z25.s\n"
@@ -1211,11 +1211,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z25.s, z25.s, #0x1f\n"
"asr z24.s, z24.s, #0x1f\n"
"sqadd z23.s, z23.s, z27.s\n"
- "and z27.d, z8.d, z0.d\n"
"sqadd z12.s, z12.s, z26.s\n"
- "and z26.d, z9.d, z1.d\n"
"sqadd z13.s, z13.s, z25.s\n"
"sqadd z14.s, z14.s, z24.s\n"
+ "and z27.d, z8.d, z0.d\n"
+ "and z26.d, z9.d, z1.d\n"
"and z25.d, z10.d, z2.d\n"
"and z24.d, z11.d, z3.d\n"
"asr z27.s, z27.s, #0x1f\n"
@@ -1223,11 +1223,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z25.s, z25.s, #0x1f\n"
"asr z24.s, z24.s, #0x1f\n"
"sqadd z8.s, z8.s, z27.s\n"
- "and z27.d, z15.d, z0.d\n"
"sqadd z9.s, z9.s, z26.s\n"
- "and z26.d, z20.d, z1.d\n"
"sqadd z10.s, z10.s, z25.s\n"
"sqadd z11.s, z11.s, z24.s\n"
+ "and z27.d, z15.d, z0.d\n"
+ "and z26.d, z20.d, z1.d\n"
"and z25.d, z21.d, z2.d\n"
"and z24.d, z22.d, z3.d\n"
"asr z27.s, z27.s, #0x1f\n"
@@ -1235,11 +1235,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z25.s, z25.s, #0x1f\n"
"asr z24.s, z24.s, #0x1f\n"
"sqadd z15.s, z15.s, z27.s\n"
- "and z27.d, z16.d, z0.d\n"
"sqadd z20.s, z20.s, z26.s\n"
- "and z26.d, z17.d, z1.d\n"
"sqadd z21.s, z21.s, z25.s\n"
"sqadd z22.s, z22.s, z24.s\n"
+ "and z27.d, z16.d, z0.d\n"
+ "and z26.d, z17.d, z1.d\n"
"and z25.d, z18.d, z2.d\n"
"and z24.d, z19.d, z3.d\n"
"asr z27.s, z27.s, #0x1f\n"
@@ -1252,43 +1252,43 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"sqadd z19.s, z19.s, z24.s\n"
"51:" // Height 4: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
"ld1rw { z25.s }, p2/Z, [x20]\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add z23.s, z23.s, z25.s\n"
".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "add z12.s, z12.s, z25.s\n"
+ "add z13.s, z13.s, z25.s\n"
".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z14.s, z14.s, z25.s\n"
+ "add z8.s, z8.s, z25.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z23.s, z23.s, z25.s\n"
+ "add z9.s, z9.s, z25.s\n"
+ "add z10.s, z10.s, z25.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
- "add z12.s, z12.s, z25.s\n"
- "add z13.s, z13.s, z25.s\n"
+ "add z11.s, z11.s, z25.s\n"
+ "add z15.s, z15.s, z25.s\n"
".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
- "add z14.s, z14.s, z25.s\n"
- "add z8.s, z8.s, z25.s\n"
+ "add z20.s, z20.s, z25.s\n"
+ "add z21.s, z21.s, z25.s\n"
".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z9.s, z9.s, z25.s\n"
- "add z10.s, z10.s, z25.s\n"
+ "add z22.s, z22.s, z25.s\n"
+ "add z16.s, z16.s, z25.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z11.s, z11.s, z25.s\n"
- "add z15.s, z15.s, z25.s\n"
+ "add z17.s, z17.s, z25.s\n"
+ "add z18.s, z18.s, z25.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z20.s, z20.s, z25.s\n"
- "add z21.s, z21.s, z25.s\n"
"ld1rw { z24.s }, p2/Z, [x20]\n"
- "add z22.s, z22.s, z25.s\n"
- "add z16.s, z16.s, z25.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z25.s\n"
- "add z18.s, z18.s, z25.s\n"
- "ld1rw { z26.s }, p2/Z, [x20]\n"
"add z19.s, z19.s, z25.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
"smin z23.s, p2/M, z23.s, z24.s\n"
"smin z12.s, p2/M, z12.s, z24.s\n"
"smin z13.s, p2/M, z13.s, z24.s\n"
@@ -1305,60 +1305,60 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"smin z17.s, p2/M, z17.s, z24.s\n"
"smin z18.s, p2/M, z18.s, z24.s\n"
"smin z19.s, p2/M, z19.s, z24.s\n"
- "smax z23.s, p2/M, z23.s, z26.s\n"
- "smax z12.s, p2/M, z12.s, z26.s\n"
- "smax z13.s, p2/M, z13.s, z26.s\n"
- "smax z14.s, p2/M, z14.s, z26.s\n"
- "smax z8.s, p2/M, z8.s, z26.s\n"
- "smax z9.s, p2/M, z9.s, z26.s\n"
- "smax z10.s, p2/M, z10.s, z26.s\n"
- "smax z11.s, p2/M, z11.s, z26.s\n"
+ "smax z23.s, p2/M, z23.s, z25.s\n"
+ "smax z12.s, p2/M, z12.s, z25.s\n"
+ "smax z13.s, p2/M, z13.s, z25.s\n"
"uzp1 z23.h, z23.h, z12.h\n"
- "smax z15.s, p2/M, z15.s, z26.s\n"
- "smax z20.s, p2/M, z20.s, z26.s\n"
- "uzp1 z25.h, z13.h, z14.h\n"
- "smax z21.s, p2/M, z21.s, z26.s\n"
- "smax z22.s, p2/M, z22.s, z26.s\n"
+ "smax z14.s, p2/M, z14.s, z25.s\n"
+ "smax z8.s, p2/M, z8.s, z25.s\n"
+ "uzp1 z24.h, z13.h, z14.h\n"
+ "uzp1 z23.b, z23.b, z24.b\n"
+ "smax z9.s, p2/M, z9.s, z25.s\n"
+ "smax z10.s, p2/M, z10.s, z25.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z16.s, p2/M, z16.s, z26.s\n"
- "smax z17.s, p2/M, z17.s, z26.s\n"
- "uzp1 z24.h, z10.h, z11.h\n"
- "smax z18.s, p2/M, z18.s, z26.s\n"
- "smax z19.s, p2/M, z19.s, z26.s\n"
+ "st1b { z23.b }, p1, [x11]\n"
+ "smax z11.s, p2/M, z11.s, z25.s\n"
+ "smax z15.s, p2/M, z15.s, z25.s\n"
+ "uzp1 z23.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z23.b\n"
+ "smax z20.s, p2/M, z20.s, z25.s\n"
+ "smax z21.s, p2/M, z21.s, z25.s\n"
"uzp1 z15.h, z15.h, z20.h\n"
- "uzp1 z23.b, z23.b, z25.b\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z22.s, p2/M, z22.s, z25.s\n"
+ "smax z16.s, p2/M, z16.s, z25.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z15.b, z15.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z25.s\n"
+ "smax z18.s, p2/M, z18.s, z25.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z8.b, z8.b, z24.b\n"
+ "st1b { z15.b }, p1, [x25]\n"
+ "smax z19.s, p2/M, z19.s, z25.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
- "st1b { z23.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "uzp1 z15.b, z15.b, z20.b\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z8.b }, p1, [x26]\n"
- "st1b { z15.b }, p1, [x25]\n"
"st1b { z16.b }, p1, [x24]\n"
+ "addvl x11, x11, #1\n"
"52:" // Height 4: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"54:" // Height 5: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"mov z16.s, #0x0\n"
@@ -1381,8 +1381,8 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"mov x28, #0x0\n"
"56:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 57f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1410,165 +1410,165 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"ble 60f\n"
"59:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p2/Z, [x10]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z6.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
"ld1rqb { z7.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
"trn1 z3.d, z7.d, z2.d\n"
"trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
- "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45019888 // smmla z8.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x450198a8 // smmla z8.s, z5.b, z1.b\n"
".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n"
".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4500988c // smmla z12.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x450098ac // smmla z12.s, z5.b, z0.b\n"
".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45019889 // smmla z9.s, z4.b, z1.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x450198a9 // smmla z9.s, z5.b, z1.b\n"
+ "add x25, x25, #0x10\n"
".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n"
".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4500988d // smmla z13.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x450098ad // smmla z13.s, z5.b, z0.b\n"
".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4501988a // smmla z10.s, z4.b, z1.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x450198aa // smmla z10.s, z5.b, z1.b\n"
".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n"
".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4500988e // smmla z14.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x450098ae // smmla z14.s, z5.b, z0.b\n"
".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x4501988b // smmla z11.s, z4.b, z1.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x450198ab // smmla z11.s, z5.b, z1.b\n"
".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n"
".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
- ".inst 0x4500988f // smmla z15.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x450098af // smmla z15.s, z5.b, z0.b\n"
".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z0.b }, p2/Z, [x10, #-7, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n"
- ".inst 0x450198b8 // smmla z24.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x45019898 // smmla z24.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n"
- ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n"
- ".inst 0x450198b9 // smmla z25.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45019899 // smmla z25.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n"
- ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n"
- ".inst 0x450198ba // smmla z26.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n"
- ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n"
- ".inst 0x450198bb // smmla z27.s, z5.b, z1.b\n"
+ ".inst 0x4501989b // smmla z27.s, z4.b, z1.b\n"
".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n"
- ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"bgt 59b\n"
"60:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z2.b }, p2/Z, [x10]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z6.b }, p0/Z, [x25]\n"
+ "ld1rqb { z4.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
"trn1 z4.d, z5.d, z0.d\n"
"trn2 z5.d, z5.d, z0.d\n"
- "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n"
".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n"
".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n"
".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n"
".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
+ "addvl x9, x9, #8\n"
".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n"
".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"ble 61f\n"
- "ld1b { z2.b }, p2/Z, [x10]\n"
- "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #2, MUL VL]\n"
".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #4, MUL VL]\n"
".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #6, MUL VL]\n"
".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n"
".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n"
".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n"
@@ -1583,27 +1583,27 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z4.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "ld1w { z3.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
+ "ld1w { z3.s }, p2/Z, [x14]\n"
"ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
+ "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
"ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x26, x9, x20\n"
+ "add x25, x26, x20\n"
+ "add x24, x25, x20\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x25, x26, x20\n"
+ "add x23, x24, x20\n"
+ "addvl x14, x14, #4\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "add x24, x25, x20\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "add x23, x24, x20\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
"uzp1 z24.d, z24.d, z28.d\n"
@@ -1611,10 +1611,10 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"uzp1 z26.d, z26.d, z30.d\n"
"uzp1 z27.d, z27.d, z31.d\n"
"mov z31.d, z4.d\n"
+ "add z31.s, z31.s, z3.s\n"
"add z12.s, z12.s, z2.s\n"
"add z13.s, z13.s, z1.s\n"
"add z14.s, z14.s, z0.s\n"
- "add z31.s, z31.s, z3.s\n"
"add z8.s, z8.s, z3.s\n"
"add z9.s, z9.s, z2.s\n"
"add z10.s, z10.s, z1.s\n"
@@ -1685,11 +1685,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z28.s, z28.s, #0x1f\n"
"asr z23.s, z23.s, #0x1f\n"
"sqadd z31.s, z31.s, z30.s\n"
- "and z30.d, z8.d, z0.d\n"
"sqadd z12.s, z12.s, z29.s\n"
- "and z29.d, z9.d, z1.d\n"
"sqadd z13.s, z13.s, z28.s\n"
"sqadd z14.s, z14.s, z23.s\n"
+ "and z30.d, z8.d, z0.d\n"
+ "and z29.d, z9.d, z1.d\n"
"and z28.d, z10.d, z2.d\n"
"and z23.d, z11.d, z3.d\n"
"asr z30.s, z30.s, #0x1f\n"
@@ -1697,11 +1697,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z28.s, z28.s, #0x1f\n"
"asr z23.s, z23.s, #0x1f\n"
"sqadd z8.s, z8.s, z30.s\n"
- "and z30.d, z15.d, z0.d\n"
"sqadd z9.s, z9.s, z29.s\n"
- "and z29.d, z20.d, z1.d\n"
"sqadd z10.s, z10.s, z28.s\n"
"sqadd z11.s, z11.s, z23.s\n"
+ "and z30.d, z15.d, z0.d\n"
+ "and z29.d, z20.d, z1.d\n"
"and z28.d, z21.d, z2.d\n"
"and z23.d, z22.d, z3.d\n"
"asr z30.s, z30.s, #0x1f\n"
@@ -1709,11 +1709,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z28.s, z28.s, #0x1f\n"
"asr z23.s, z23.s, #0x1f\n"
"sqadd z15.s, z15.s, z30.s\n"
- "and z30.d, z16.d, z0.d\n"
"sqadd z20.s, z20.s, z29.s\n"
- "and z29.d, z17.d, z1.d\n"
"sqadd z21.s, z21.s, z28.s\n"
"sqadd z22.s, z22.s, z23.s\n"
+ "and z30.d, z16.d, z0.d\n"
+ "and z29.d, z17.d, z1.d\n"
"and z28.d, z18.d, z2.d\n"
"and z23.d, z19.d, z3.d\n"
"asr z30.s, z30.s, #0x1f\n"
@@ -1721,11 +1721,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z28.s, z28.s, #0x1f\n"
"asr z23.s, z23.s, #0x1f\n"
"sqadd z16.s, z16.s, z30.s\n"
- "and z30.d, z24.d, z0.d\n"
"sqadd z17.s, z17.s, z29.s\n"
- "and z29.d, z25.d, z1.d\n"
"sqadd z18.s, z18.s, z28.s\n"
"sqadd z19.s, z19.s, z23.s\n"
+ "and z30.d, z24.d, z0.d\n"
+ "and z29.d, z25.d, z1.d\n"
"and z28.d, z26.d, z2.d\n"
"and z23.d, z27.d, z3.d\n"
"asr z30.s, z30.s, #0x1f\n"
@@ -1738,51 +1738,51 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"sqadd z27.s, z27.s, z23.s\n"
"64:" // Height 5: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"ld1rw { z28.s }, p2/Z, [x20]\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add z31.s, z31.s, z28.s\n"
".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "add z12.s, z12.s, z28.s\n"
+ "add z13.s, z13.s, z28.s\n"
".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z14.s, z14.s, z28.s\n"
+ "add z8.s, z8.s, z28.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z31.s, z31.s, z28.s\n"
+ "add z9.s, z9.s, z28.s\n"
+ "add z10.s, z10.s, z28.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
- "add z12.s, z12.s, z28.s\n"
- "add z13.s, z13.s, z28.s\n"
+ "add z11.s, z11.s, z28.s\n"
+ "add z15.s, z15.s, z28.s\n"
".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
- "add z14.s, z14.s, z28.s\n"
- "add z8.s, z8.s, z28.s\n"
+ "add z20.s, z20.s, z28.s\n"
+ "add z21.s, z21.s, z28.s\n"
".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z9.s, z9.s, z28.s\n"
- "add z10.s, z10.s, z28.s\n"
+ "add z22.s, z22.s, z28.s\n"
+ "add z16.s, z16.s, z28.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z11.s, z11.s, z28.s\n"
- "add z15.s, z15.s, z28.s\n"
+ "add z17.s, z17.s, z28.s\n"
+ "add z18.s, z18.s, z28.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z20.s, z20.s, z28.s\n"
- "add z21.s, z21.s, z28.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z24.s, z24.s, z28.s\n"
".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
- "add z22.s, z22.s, z28.s\n"
- "add z16.s, z16.s, z28.s\n"
+ "add z25.s, z25.s, z28.s\n"
+ "add z26.s, z26.s, z28.s\n"
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z17.s, z17.s, z28.s\n"
- "add z18.s, z18.s, z28.s\n"
"ld1rw { z23.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z28.s\n"
- "add z24.s, z24.s, z28.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z28.s\n"
- "add z26.s, z26.s, z28.s\n"
- "ld1rw { z29.s }, p2/Z, [x20]\n"
"add z27.s, z27.s, z28.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
"smin z31.s, p2/M, z31.s, z23.s\n"
"smin z12.s, p2/M, z12.s, z23.s\n"
"smin z13.s, p2/M, z13.s, z23.s\n"
@@ -1803,72 +1803,71 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"smin z25.s, p2/M, z25.s, z23.s\n"
"smin z26.s, p2/M, z26.s, z23.s\n"
"smin z27.s, p2/M, z27.s, z23.s\n"
- "smax z31.s, p2/M, z31.s, z29.s\n"
- "smax z12.s, p2/M, z12.s, z29.s\n"
- "smax z13.s, p2/M, z13.s, z29.s\n"
- "smax z14.s, p2/M, z14.s, z29.s\n"
- "smax z8.s, p2/M, z8.s, z29.s\n"
- "smax z9.s, p2/M, z9.s, z29.s\n"
- "smax z10.s, p2/M, z10.s, z29.s\n"
- "smax z11.s, p2/M, z11.s, z29.s\n"
+ "smax z31.s, p2/M, z31.s, z28.s\n"
+ "smax z12.s, p2/M, z12.s, z28.s\n"
+ "smax z13.s, p2/M, z13.s, z28.s\n"
"uzp1 z31.h, z31.h, z12.h\n"
- "smax z15.s, p2/M, z15.s, z29.s\n"
- "smax z20.s, p2/M, z20.s, z29.s\n"
- "uzp1 z28.h, z13.h, z14.h\n"
- "smax z21.s, p2/M, z21.s, z29.s\n"
- "smax z22.s, p2/M, z22.s, z29.s\n"
+ "smax z14.s, p2/M, z14.s, z28.s\n"
+ "smax z8.s, p2/M, z8.s, z28.s\n"
+ "uzp1 z23.h, z13.h, z14.h\n"
+ "uzp1 z31.b, z31.b, z23.b\n"
+ "smax z9.s, p2/M, z9.s, z28.s\n"
+ "smax z10.s, p2/M, z10.s, z28.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z16.s, p2/M, z16.s, z29.s\n"
- "smax z17.s, p2/M, z17.s, z29.s\n"
+ "st1b { z31.b }, p1, [x11]\n"
+ "smax z11.s, p2/M, z11.s, z28.s\n"
+ "smax z15.s, p2/M, z15.s, z28.s\n"
"uzp1 z23.h, z10.h, z11.h\n"
- "smax z18.s, p2/M, z18.s, z29.s\n"
- "smax z19.s, p2/M, z19.s, z29.s\n"
+ "uzp1 z8.b, z8.b, z23.b\n"
+ "smax z20.s, p2/M, z20.s, z28.s\n"
+ "smax z21.s, p2/M, z21.s, z28.s\n"
"uzp1 z15.h, z15.h, z20.h\n"
- "uzp1 z31.b, z31.b, z28.b\n"
- "smax z24.s, p2/M, z24.s, z29.s\n"
- "smax z25.s, p2/M, z25.s, z29.s\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z22.s, p2/M, z22.s, z28.s\n"
+ "smax z16.s, p2/M, z16.s, z28.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
- "smax z26.s, p2/M, z26.s, z29.s\n"
- "smax z27.s, p2/M, z27.s, z29.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z8.b, z8.b, z23.b\n"
- "uzp1 z18.h, z18.h, z19.h\n"
- "st1b { z31.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "uzp1 z24.h, z24.h, z25.h\n"
"uzp1 z15.b, z15.b, z20.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "st1b { z8.b }, p1, [x26]\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "uzp1 z24.b, z24.b, z17.b\n"
+ "smax z17.s, p2/M, z17.s, z28.s\n"
+ "smax z18.s, p2/M, z18.s, z28.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
"st1b { z15.b }, p1, [x25]\n"
+ "smax z19.s, p2/M, z19.s, z28.s\n"
+ "smax z24.s, p2/M, z24.s, z28.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z25.s, p2/M, z25.s, z28.s\n"
+ "smax z26.s, p2/M, z26.s, z28.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
"st1b { z16.b }, p1, [x24]\n"
+ "smax z27.s, p2/M, z27.s, z28.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"st1b { z24.b }, p1, [x23]\n"
+ "addvl x11, x11, #1\n"
"65:" // Height 5: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 54b\n"
"b 80f\n"
"66:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x6\n"
"mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"67:" // Height 6: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
- "whilelt p1.b, x20, x11\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"mov z16.s, #0x0\n"
@@ -1891,8 +1890,8 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"mov x28, #0x0\n"
"69:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 70f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1923,168 +1922,168 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"ble 73f\n"
"72:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p2/Z, [x10]\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1b { z1.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
+ ".inst 0x45019890 // smmla z16.s, z4.b, z1.b\n"
+ ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
"sub x27, x27, #0x10\n"
+ ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
+ ".inst 0x45009894 // smmla z20.s, z4.b, z0.b\n"
"cmp x27, #0x10\n"
- "ld1rqb { z6.b }, p0/Z, [x26]\n"
"add x26, x26, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x25]\n"
+ ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
"add x25, x25, #0x10\n"
- "ld1rqb { z7.b }, p0/Z, [x24]\n"
+ ".inst 0x45019891 // smmla z17.s, z4.b, z1.b\n"
+ ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
+ ".inst 0x45009895 // smmla z21.s, z4.b, z0.b\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
- "add x21, x21, #0x10\n"
- "trn1 z3.d, z7.d, z2.d\n"
- "trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
- "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45019888 // smmla z8.s, z4.b, z1.b\n"
- ".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n"
- ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4500988c // smmla z12.s, z4.b, z0.b\n"
- ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
- ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45019889 // smmla z9.s, z4.b, z1.b\n"
- ".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n"
- ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4500988d // smmla z13.s, z4.b, z0.b\n"
- ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4501988a // smmla z10.s, z4.b, z1.b\n"
- ".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n"
- ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4500988e // smmla z14.s, z4.b, z0.b\n"
- ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
- ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x4501988b // smmla z11.s, z4.b, z1.b\n"
- ".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n"
- ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
- ".inst 0x4500988f // smmla z15.s, z4.b, z0.b\n"
- ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
- ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z0.b }, p2/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
- ".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n"
- ".inst 0x450198b8 // smmla z24.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
- ".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n"
- ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
- ".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n"
- ".inst 0x450198b9 // smmla z25.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
- ".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n"
- ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #-3, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
- ".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n"
- ".inst 0x450198ba // smmla z26.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [x10, #-2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45019892 // smmla z18.s, z4.b, z1.b\n"
+ ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
- ".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n"
- ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45009896 // smmla z22.s, z4.b, z0.b\n"
+ ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
- ".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n"
- ".inst 0x450198bb // smmla z27.s, z5.b, z1.b\n"
+ ".inst 0x45019893 // smmla z19.s, z4.b, z1.b\n"
+ ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
- ".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n"
- ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
+ ".inst 0x45009897 // smmla z23.s, z4.b, z0.b\n"
+ ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n"
+ ".inst 0x450198b0 // smmla z16.s, z5.b, z1.b\n"
+ ".inst 0x45019878 // smmla z24.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098b4 // smmla z20.s, z5.b, z0.b\n"
+ ".inst 0x4500987c // smmla z28.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x450198e9 // smmla z9.s, z7.b, z1.b\n"
+ ".inst 0x450198b1 // smmla z17.s, z5.b, z1.b\n"
+ ".inst 0x45019879 // smmla z25.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098b5 // smmla z21.s, z5.b, z0.b\n"
+ ".inst 0x4500987d // smmla z29.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n"
+ ".inst 0x450198b2 // smmla z18.s, z5.b, z1.b\n"
+ ".inst 0x4501987a // smmla z26.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098b6 // smmla z22.s, z5.b, z0.b\n"
+ ".inst 0x4500987e // smmla z30.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x450198eb // smmla z11.s, z7.b, z1.b\n"
+ ".inst 0x450198b3 // smmla z19.s, z5.b, z1.b\n"
+ ".inst 0x4501987b // smmla z27.s, z3.b, z1.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n"
+ ".inst 0x4500987f // smmla z31.s, z3.b, z0.b\n"
"bgt 72b\n"
"73:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z2.b }, p2/Z, [x10]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z6.b }, p0/Z, [x25]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
"ld1rqb { z0.b }, p0/Z, [x21]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
"trn1 z4.d, z5.d, z0.d\n"
"trn2 z5.d, z5.d, z0.d\n"
- "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n"
".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n"
".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n"
".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n"
".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
+ "addvl x9, x9, #8\n"
".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n"
".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"ble 74f\n"
- "ld1b { z2.b }, p2/Z, [x10]\n"
- "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #2, MUL VL]\n"
".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #4, MUL VL]\n"
".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x10, #6, MUL VL]\n"
".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n"
".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n"
".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n"
@@ -2098,31 +2097,31 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"bne 69b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z4.d, z8.d, z12.d\n"
+ "add x26, x11, x20\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "ld1w { z3.s }, p2/Z, [x14]\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x25, x26, x20\n"
+ "ld1w { z3.s }, p2/Z, [x14]\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
- "add x26, x9, x20\n"
+ "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
+ "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x24, x25, x20\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x25, x26, x20\n"
+ "add x23, x24, x20\n"
+ "add x22, x23, x20\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "add x24, x25, x20\n"
+ "addvl x14, x14, #4\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "add x23, x24, x20\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "add x22, x23, x20\n"
"uzp1 z23.d, z24.d, z28.d\n"
"uzp2 z24.d, z24.d, z28.d\n"
"uzp1 z28.d, z25.d, z29.d\n"
@@ -2132,10 +2131,10 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"uzp1 z30.d, z27.d, z31.d\n"
"uzp2 z27.d, z27.d, z31.d\n"
"mov z31.d, z4.d\n"
+ "add z31.s, z31.s, z3.s\n"
"add z12.s, z12.s, z2.s\n"
"add z13.s, z13.s, z1.s\n"
"add z14.s, z14.s, z0.s\n"
- "add z31.s, z31.s, z3.s\n"
"add z8.s, z8.s, z3.s\n"
"add z9.s, z9.s, z2.s\n"
"add z10.s, z10.s, z1.s\n"
@@ -2214,11 +2213,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z31.s, z31.s, z7.s\n"
- "and z7.d, z8.d, z0.d\n"
"sqadd z12.s, z12.s, z6.s\n"
- "and z6.d, z9.d, z1.d\n"
"sqadd z13.s, z13.s, z5.s\n"
"sqadd z14.s, z14.s, z4.s\n"
+ "and z7.d, z8.d, z0.d\n"
+ "and z6.d, z9.d, z1.d\n"
"and z5.d, z10.d, z2.d\n"
"and z4.d, z11.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
@@ -2226,11 +2225,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z8.s, z8.s, z7.s\n"
- "and z7.d, z15.d, z0.d\n"
"sqadd z9.s, z9.s, z6.s\n"
- "and z6.d, z20.d, z1.d\n"
"sqadd z10.s, z10.s, z5.s\n"
"sqadd z11.s, z11.s, z4.s\n"
+ "and z7.d, z15.d, z0.d\n"
+ "and z6.d, z20.d, z1.d\n"
"and z5.d, z21.d, z2.d\n"
"and z4.d, z22.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
@@ -2238,11 +2237,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z15.s, z15.s, z7.s\n"
- "and z7.d, z16.d, z0.d\n"
"sqadd z20.s, z20.s, z6.s\n"
- "and z6.d, z17.d, z1.d\n"
"sqadd z21.s, z21.s, z5.s\n"
"sqadd z22.s, z22.s, z4.s\n"
+ "and z7.d, z16.d, z0.d\n"
+ "and z6.d, z17.d, z1.d\n"
"and z5.d, z18.d, z2.d\n"
"and z4.d, z19.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
@@ -2250,11 +2249,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z16.s, z16.s, z7.s\n"
- "and z7.d, z23.d, z0.d\n"
"sqadd z17.s, z17.s, z6.s\n"
- "and z6.d, z28.d, z1.d\n"
"sqadd z18.s, z18.s, z5.s\n"
"sqadd z19.s, z19.s, z4.s\n"
+ "and z7.d, z23.d, z0.d\n"
+ "and z6.d, z28.d, z1.d\n"
"and z5.d, z29.d, z2.d\n"
"and z4.d, z30.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
@@ -2262,11 +2261,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z23.s, z23.s, z7.s\n"
- "and z7.d, z24.d, z0.d\n"
"sqadd z28.s, z28.s, z6.s\n"
- "and z6.d, z25.d, z1.d\n"
"sqadd z29.s, z29.s, z5.s\n"
"sqadd z30.s, z30.s, z4.s\n"
+ "and z7.d, z24.d, z0.d\n"
+ "and z6.d, z25.d, z1.d\n"
"and z5.d, z26.d, z2.d\n"
"and z4.d, z27.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
@@ -2279,59 +2278,59 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"sqadd z27.s, z27.s, z4.s\n"
"77:" // Height 6: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add z31.s, z31.s, z4.s\n"
".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "add z13.s, z13.s, z4.s\n"
".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z8.s, z8.s, z4.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z31.s, z31.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
- "add z12.s, z12.s, z4.s\n"
- "add z13.s, z13.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
- "add z14.s, z14.s, z4.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z4.s\n"
".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z15.s, z15.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z21.s, z21.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
".inst 0x4482883c // srshl z28.s, p2/M, z28.s, z1.s\n"
".inst 0x4482885d // srshl z29.s, p2/M, z29.s, z2.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "add z29.s, z29.s, z4.s\n"
".inst 0x4482887e // srshl z30.s, p2/M, z30.s, z3.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z30.s, z30.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z23.s, z23.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "add z26.s, z26.s, z4.s\n"
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z28.s, z28.s, z4.s\n"
- "add z29.s, z29.s, z4.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z30.s, z30.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
- "ld1rw { z2.s }, p2/Z, [x20]\n"
"add z27.s, z27.s, z4.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
"smin z31.s, p2/M, z31.s, z0.s\n"
"smin z12.s, p2/M, z12.s, z0.s\n"
"smin z13.s, p2/M, z13.s, z0.s\n"
@@ -2356,58 +2355,58 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"smin z25.s, p2/M, z25.s, z0.s\n"
"smin z26.s, p2/M, z26.s, z0.s\n"
"smin z27.s, p2/M, z27.s, z0.s\n"
- "smax z31.s, p2/M, z31.s, z2.s\n"
- "smax z12.s, p2/M, z12.s, z2.s\n"
- "smax z13.s, p2/M, z13.s, z2.s\n"
- "smax z14.s, p2/M, z14.s, z2.s\n"
- "smax z8.s, p2/M, z8.s, z2.s\n"
- "smax z9.s, p2/M, z9.s, z2.s\n"
- "smax z10.s, p2/M, z10.s, z2.s\n"
- "smax z11.s, p2/M, z11.s, z2.s\n"
+ "smax z31.s, p2/M, z31.s, z1.s\n"
+ "smax z12.s, p2/M, z12.s, z1.s\n"
+ "smax z13.s, p2/M, z13.s, z1.s\n"
"uzp1 z31.h, z31.h, z12.h\n"
- "smax z15.s, p2/M, z15.s, z2.s\n"
- "smax z20.s, p2/M, z20.s, z2.s\n"
- "uzp1 z1.h, z13.h, z14.h\n"
- "smax z21.s, p2/M, z21.s, z2.s\n"
- "smax z22.s, p2/M, z22.s, z2.s\n"
+ "smax z14.s, p2/M, z14.s, z1.s\n"
+ "smax z8.s, p2/M, z8.s, z1.s\n"
+ "uzp1 z0.h, z13.h, z14.h\n"
+ "uzp1 z31.b, z31.b, z0.b\n"
+ "smax z9.s, p2/M, z9.s, z1.s\n"
+ "smax z10.s, p2/M, z10.s, z1.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z16.s, p2/M, z16.s, z2.s\n"
- "smax z17.s, p2/M, z17.s, z2.s\n"
- "uzp1 z0.h, z10.h, z11.h\n"
- "smax z18.s, p2/M, z18.s, z2.s\n"
- "smax z19.s, p2/M, z19.s, z2.s\n"
+ "st1b { z31.b }, p1, [x11]\n"
+ "smax z11.s, p2/M, z11.s, z1.s\n"
+ "smax z15.s, p2/M, z15.s, z1.s\n"
+ "uzp1 z31.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z31.b\n"
+ "smax z20.s, p2/M, z20.s, z1.s\n"
+ "smax z21.s, p2/M, z21.s, z1.s\n"
"uzp1 z15.h, z15.h, z20.h\n"
- "uzp1 z31.b, z31.b, z1.b\n"
- "smax z23.s, p2/M, z23.s, z2.s\n"
- "smax z28.s, p2/M, z28.s, z2.s\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z22.s, p2/M, z22.s, z1.s\n"
+ "smax z16.s, p2/M, z16.s, z1.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
- "smax z29.s, p2/M, z29.s, z2.s\n"
- "smax z30.s, p2/M, z30.s, z2.s\n"
+ "uzp1 z15.b, z15.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z1.s\n"
+ "smax z18.s, p2/M, z18.s, z1.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z8.b, z8.b, z0.b\n"
- "smax z24.s, p2/M, z24.s, z2.s\n"
- "smax z25.s, p2/M, z25.s, z2.s\n"
+ "st1b { z15.b }, p1, [x25]\n"
+ "smax z19.s, p2/M, z19.s, z1.s\n"
+ "smax z23.s, p2/M, z23.s, z1.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
- "st1b { z31.b }, p1, [x9]\n"
- "smax z26.s, p2/M, z26.s, z2.s\n"
- "smax z27.s, p2/M, z27.s, z2.s\n"
- "uzp1 z23.h, z23.h, z28.h\n"
- "uzp1 z15.b, z15.b, z20.b\n"
- "uzp1 z18.h, z29.h, z30.h\n"
- "st1b { z8.b }, p1, [x26]\n"
- "addvl x9, x9, #1\n"
- "uzp1 z24.h, z24.h, z25.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "st1b { z15.b }, p1, [x25]\n"
- "uzp1 z23.b, z23.b, z18.b\n"
- "uzp1 z24.b, z24.b, z17.b\n"
+ "smax z28.s, p2/M, z28.s, z1.s\n"
+ "smax z29.s, p2/M, z29.s, z1.s\n"
+ "uzp1 z23.h, z23.h, z28.h\n"
"st1b { z16.b }, p1, [x24]\n"
+ "smax z30.s, p2/M, z30.s, z1.s\n"
+ "smax z24.s, p2/M, z24.s, z1.s\n"
+ "uzp1 z16.h, z29.h, z30.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z1.s\n"
+ "smax z26.s, p2/M, z26.s, z1.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
"st1b { z23.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z1.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"st1b { z24.b }, p1, [x22]\n"
+ "addvl x11, x11, #1\n"
"78:" // Height 6: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
"bgt 67b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 80f\n"
@@ -2421,8 +2420,8 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
index dc377f56e8..a6abb8d354 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
index f94f76564b..92a350c8a8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
@@ -44,18 +44,18 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -89,7 +89,7 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"beq 11f\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -114,8 +114,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -134,14 +134,14 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"ble 9f\n"
"8:" // Height 1: Multiply loop: Main loop
"sdot z8.s, z6.b, z0.b\n"
- "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x26, x26, #0x4\n"
- "subs x27, x27, #0x4\n"
"sdot z10.s, z17.b, z0.b\n"
"sdot z11.s, z16.b, z0.b\n"
+ "subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -149,14 +149,14 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"9:" // Height 1: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"sdot z8.s, z6.b, z0.b\n"
- "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
- "addvl x10, x10, #4\n"
"cmp x28, x20\n"
"sdot z10.s, z17.b, z0.b\n"
"sdot z11.s, z16.b, z0.b\n"
+ "addvl x10, x10, #4\n"
"bne 5b\n"
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -171,7 +171,7 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"11:" // Height 2
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"12:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -183,11 +183,11 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 13f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x20]\n"
"ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
@@ -206,8 +206,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"15:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -242,8 +242,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z10.s, z17.b, z0.b\n"
"sdot z14.s, z17.b, z1.b\n"
"sdot z11.s, z16.b, z0.b\n"
- "ld1rw { z0.s }, p4/Z, [x26]\n"
"sdot z15.s, z16.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -257,18 +257,18 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z13.s, z7.b, z1.b\n"
"ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
- "addvl x10, x10, #4\n"
"cmp x28, x20\n"
"sdot z10.s, z17.b, z0.b\n"
"sdot z14.s, z17.b, z1.b\n"
+ "addvl x10, x10, #4\n"
"sdot z11.s, z16.b, z0.b\n"
"sdot z15.s, z16.b, z1.b\n"
"bne 15b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p3, [x20]\n"
@@ -283,7 +283,7 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"21:" // Height 3
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"22:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -295,12 +295,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 23f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x21]\n"
"ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
@@ -327,8 +327,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"25:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 26f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -359,8 +359,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"add x26, x26, #0x4\n"
"subs x27, x27, #0x4\n"
"sdot z16.s, z6.b, z2.b\n"
- "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x4\n"
"sdot z13.s, z7.b, z1.b\n"
"sdot z17.s, z7.b, z2.b\n"
@@ -372,11 +372,11 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z18.s, z21.b, z2.b\n"
"sdot z11.s, z20.b, z0.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
"sdot z15.s, z20.b, z1.b\n"
- "ld1rw { z1.s }, p4/Z, [x25]\n"
"sdot z19.s, z20.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
- "ld1b { z6.b }, p4/Z, [x10]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 28b\n"
"29:" // Height 3: Multiply loop: Main loop skip
@@ -385,13 +385,13 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z12.s, z6.b, z1.b\n"
"add x28, x28, #0x1\n"
"sdot z16.s, z6.b, z2.b\n"
- "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
"sdot z13.s, z7.b, z1.b\n"
"sdot z17.s, z7.b, z2.b\n"
"ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "cmp x28, x20\n"
"sdot z10.s, z21.b, z0.b\n"
"sdot z14.s, z21.b, z1.b\n"
"sdot z18.s, z21.b, z2.b\n"
@@ -400,11 +400,11 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z19.s, z20.b, z2.b\n"
"bne 25b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p3, [x21]\n"
@@ -423,7 +423,7 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"31:" // Height 4
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"32:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -435,13 +435,13 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x22]\n"
"ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
@@ -476,8 +476,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"35:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -527,6 +527,7 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z14.s, z25.b, z1.b\n"
"sdot z18.s, z25.b, z2.b\n"
"sdot z22.s, z25.b, z3.b\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
"sdot z11.s, z24.b, z0.b\n"
"sdot z15.s, z24.b, z1.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -535,7 +536,6 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z23.s, z24.b, z3.b\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
- "ld1b { z6.b }, p4/Z, [x10]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 38b\n"
"39:" // Height 4: Multiply loop: Main loop skip
@@ -546,15 +546,15 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z16.s, z6.b, z2.b\n"
"sdot z20.s, z6.b, z3.b\n"
"ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
"sdot z9.s, z7.b, z0.b\n"
"sdot z13.s, z7.b, z1.b\n"
"sdot z17.s, z7.b, z2.b\n"
"sdot z21.s, z7.b, z3.b\n"
"ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
- "cmp x28, x20\n"
+ "addvl x10, x10, #4\n"
"sdot z10.s, z25.b, z0.b\n"
"sdot z14.s, z25.b, z1.b\n"
- "addvl x10, x10, #4\n"
"sdot z18.s, z25.b, z2.b\n"
"sdot z22.s, z25.b, z3.b\n"
"sdot z11.s, z24.b, z0.b\n"
@@ -563,12 +563,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z23.s, z24.b, z3.b\n"
"bne 35b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p3, [x9]\n"
- "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"add x22, x9, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
"add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p3, [x22]\n"
@@ -591,7 +591,7 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"41:" // Height 5
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"42:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -603,16 +603,16 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x23, x9, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x23]\n"
"ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
"ld1w { z16.s }, p3/Z, [x22]\n"
@@ -653,8 +653,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"45:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -697,8 +697,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
"sdot z24.s, z6.b, z4.b\n"
- "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x23, x23, #0x4\n"
"sdot z13.s, z7.b, z1.b\n"
"sdot z17.s, z7.b, z2.b\n"
@@ -716,12 +716,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
"sdot z15.s, z28.b, z1.b\n"
- "ld1rw { z1.s }, p4/Z, [x25]\n"
"sdot z19.s, z28.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"sdot z23.s, z28.b, z3.b\n"
- "ld1rw { z3.s }, p4/Z, [x23]\n"
"sdot z27.s, z28.b, z4.b\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1rw { z4.s }, p4/Z, [x22]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 48b\n"
@@ -732,12 +732,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"sdot z16.s, z6.b, z2.b\n"
"sdot z20.s, z6.b, z3.b\n"
+ "cmp x28, x20\n"
"sdot z24.s, z6.b, z4.b\n"
- "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"sdot z13.s, z7.b, z1.b\n"
"sdot z17.s, z7.b, z2.b\n"
- "cmp x28, x20\n"
"sdot z21.s, z7.b, z3.b\n"
"sdot z25.s, z7.b, z4.b\n"
"ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
@@ -754,15 +754,15 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z27.s, z28.b, z4.b\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p3, [x9]\n"
- "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z12.s }, p3, [x23]\n"
"st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
"st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
@@ -786,12 +786,11 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"b 62f\n"
"51:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"52:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -803,17 +802,17 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 53f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x24]\n"
"ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p3/Z, [x23]\n"
@@ -862,8 +861,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"55:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 56f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -951,12 +950,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"sdot z16.s, z6.b, z2.b\n"
"sdot z20.s, z6.b, z3.b\n"
+ "cmp x28, x20\n"
"sdot z24.s, z6.b, z4.b\n"
"sdot z28.s, z6.b, z5.b\n"
"ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b\n"
"sdot z13.s, z7.b, z1.b\n"
- "cmp x28, x20\n"
"sdot z17.s, z7.b, z2.b\n"
"sdot z21.s, z7.b, z3.b\n"
"sdot z25.s, z7.b, z4.b\n"
@@ -977,17 +976,17 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"sdot z31.s, z7.b, z5.b\n"
"bne 55b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p3, [x9]\n"
- "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "add x21, x22, x20, LSL #2\n"
"st1w { z12.s }, p3, [x24]\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
"st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
"st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
@@ -1023,8 +1022,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"62:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
index 70362ae888..f9b84e26fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
@@ -44,18 +44,18 @@ void sve_hybrid_s8s32_dot_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -89,7 +89,7 @@ void sve_hybrid_s8s32_dot_6x4VL (
"beq 12f\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -114,8 +114,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov x28, #0x0\n"
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -131,89 +131,89 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 9f\n"
"8:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "sdot z8.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[0]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"sdot z11.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "sdot z8.s, z16.b, z0.b[1]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
- "sdot z8.s, z17.b, z0.b[1]\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[1]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "sdot z10.s, z17.b, z0.b[1]\n"
"sdot z11.s, z16.b, z0.b[1]\n"
"ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
"sdot z10.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
"sdot z11.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[3]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
"sdot z10.s, z17.b, z0.b[3]\n"
"sdot z11.s, z16.b, z0.b[3]\n"
+ "add x26, x26, #0x10\n"
"bgt 8b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "sdot z8.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"sdot z10.s, z17.b, z0.b[0]\n"
"sdot z11.s, z16.b, z0.b[0]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z17.b, z0.b[1]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x4\n"
"sdot z10.s, z17.b, z0.b[1]\n"
"sdot z11.s, z16.b, z0.b[1]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x4\n"
"sdot z10.s, z17.b, z0.b[2]\n"
"sdot z11.s, z16.b, z0.b[2]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[3]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"sdot z10.s, z17.b, z0.b[3]\n"
"sdot z11.s, z16.b, z0.b[3]\n"
+ "addvl x10, x10, #4\n"
"10:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -232,7 +232,7 @@ void sve_hybrid_s8s32_dot_6x4VL (
"12:" // Height 2
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"13:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -244,11 +244,11 @@ void sve_hybrid_s8s32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 14f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x20]\n"
"ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
@@ -267,8 +267,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov x28, #0x0\n"
"16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -287,38 +287,38 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 20f\n"
"19:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z8.s, z17.b, z1.b[0]\n"
"sdot z12.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z1.b[0]\n"
"sdot z13.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
"sdot z10.s, z17.b, z1.b[0]\n"
"sdot z14.s, z17.b, z0.b[0]\n"
"ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "cmp x27, #0x10\n"
"sdot z11.s, z16.b, z1.b[0]\n"
"sdot z15.s, z16.b, z0.b[0]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
"sdot z8.s, z17.b, z1.b[1]\n"
"sdot z12.s, z17.b, z0.b[1]\n"
"ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z16.b, z1.b[1]\n"
"sdot z13.s, z16.b, z0.b[1]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"sdot z10.s, z17.b, z1.b[1]\n"
"sdot z14.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
"sdot z11.s, z16.b, z1.b[1]\n"
"sdot z15.s, z16.b, z0.b[1]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
"sdot z8.s, z17.b, z1.b[2]\n"
"sdot z12.s, z17.b, z0.b[2]\n"
@@ -345,50 +345,50 @@ void sve_hybrid_s8s32_dot_6x4VL (
"bgt 19b\n"
"20:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[0]\n"
"sdot z12.s, z17.b, z1.b[0]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[0]\n"
"sdot z13.s, z16.b, z1.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"sdot z10.s, z17.b, z0.b[0]\n"
"sdot z14.s, z17.b, z1.b[0]\n"
+ "addvl x10, x10, #4\n"
"sdot z11.s, z16.b, z0.b[0]\n"
"sdot z15.s, z16.b, z1.b[0]\n"
"ble 21f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z17.b, z0.b[1]\n"
"sdot z12.s, z17.b, z1.b[1]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[1]\n"
"sdot z13.s, z16.b, z1.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x4\n"
"sdot z10.s, z17.b, z0.b[1]\n"
"sdot z14.s, z17.b, z1.b[1]\n"
+ "addvl x10, x10, #4\n"
"sdot z11.s, z16.b, z0.b[1]\n"
"sdot z15.s, z16.b, z1.b[1]\n"
"ble 21f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z17.b, z0.b[2]\n"
"sdot z12.s, z17.b, z1.b[2]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[2]\n"
"sdot z13.s, z16.b, z1.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x4\n"
"sdot z10.s, z17.b, z0.b[2]\n"
"sdot z14.s, z17.b, z1.b[2]\n"
+ "addvl x10, x10, #4\n"
"sdot z11.s, z16.b, z0.b[2]\n"
"sdot z15.s, z16.b, z1.b[2]\n"
"ble 21f\n"
@@ -396,13 +396,13 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z8.s, z17.b, z0.b[3]\n"
"sdot z12.s, z17.b, z1.b[3]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z16.b, z0.b[3]\n"
"sdot z13.s, z16.b, z1.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"sdot z10.s, z17.b, z0.b[3]\n"
"sdot z14.s, z17.b, z1.b[3]\n"
+ "addvl x10, x10, #4\n"
"sdot z11.s, z16.b, z0.b[3]\n"
"sdot z15.s, z16.b, z1.b[3]\n"
"21:" // Height 2: Multiply loop: multiply skip
@@ -411,10 +411,10 @@ void sve_hybrid_s8s32_dot_6x4VL (
"cmp x28, x20\n"
"bne 16b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p4, [x20]\n"
@@ -429,7 +429,7 @@ void sve_hybrid_s8s32_dot_6x4VL (
"23:" // Height 3
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"24:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -441,12 +441,12 @@ void sve_hybrid_s8s32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 25f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x21]\n"
"ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
@@ -473,8 +473,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov x28, #0x0\n"
"27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 28f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -496,37 +496,37 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 31f\n"
"30:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z21.b }, p5/Z, [x10]\n"
- "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
"sdot z8.s, z21.b, z2.b[0]\n"
"sdot z12.s, z21.b, z1.b[0]\n"
- "sdot z9.s, z20.b, z2.b[0]\n"
- "sdot z13.s, z20.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z16.s, z21.b, z0.b[0]\n"
+ "sdot z9.s, z20.b, z2.b[0]\n"
"ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[0]\n"
"sdot z17.s, z20.b, z0.b[0]\n"
"ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x27, #0x10\n"
"sdot z10.s, z21.b, z2.b[0]\n"
"sdot z14.s, z21.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z18.s, z21.b, z0.b[0]\n"
- "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
"sdot z11.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
"sdot z15.s, z20.b, z1.b[0]\n"
"sdot z19.s, z20.b, z0.b[0]\n"
"ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n"
"sdot z8.s, z21.b, z2.b[1]\n"
"sdot z12.s, z21.b, z1.b[1]\n"
"sdot z16.s, z21.b, z0.b[1]\n"
- "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
"sdot z9.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
"sdot z13.s, z20.b, z1.b[1]\n"
"sdot z17.s, z20.b, z0.b[1]\n"
"ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n"
@@ -535,31 +535,31 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z14.s, z21.b, z1.b[1]\n"
"sdot z18.s, z21.b, z0.b[1]\n"
"sdot z11.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
"sdot z15.s, z20.b, z1.b[1]\n"
"sdot z19.s, z20.b, z0.b[1]\n"
- "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n"
"sdot z8.s, z21.b, z2.b[2]\n"
"sdot z12.s, z21.b, z1.b[2]\n"
"sdot z16.s, z21.b, z0.b[2]\n"
- "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
"sdot z9.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
"sdot z13.s, z20.b, z1.b[2]\n"
"sdot z17.s, z20.b, z0.b[2]\n"
"ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n"
"sdot z10.s, z21.b, z2.b[2]\n"
"sdot z14.s, z21.b, z1.b[2]\n"
"sdot z18.s, z21.b, z0.b[2]\n"
- "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
"sdot z11.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
"sdot z15.s, z20.b, z1.b[2]\n"
"sdot z19.s, z20.b, z0.b[2]\n"
"ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n"
"sdot z8.s, z21.b, z2.b[3]\n"
"sdot z12.s, z21.b, z1.b[3]\n"
"sdot z16.s, z21.b, z0.b[3]\n"
- "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
"sdot z9.s, z20.b, z2.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
"sdot z13.s, z20.b, z1.b[3]\n"
"sdot z17.s, z20.b, z0.b[3]\n"
"ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n"
@@ -572,18 +572,18 @@ void sve_hybrid_s8s32_dot_6x4VL (
"bgt 30b\n"
"31:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z21.b }, p5/Z, [x10]\n"
- "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
"sdot z8.s, z21.b, z0.b[0]\n"
"sdot z12.s, z21.b, z1.b[0]\n"
- "sdot z9.s, z20.b, z0.b[0]\n"
- "sdot z13.s, z20.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z16.s, z21.b, z2.b[0]\n"
+ "sdot z9.s, z20.b, z0.b[0]\n"
"ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[0]\n"
"sdot z17.s, z20.b, z2.b[0]\n"
"ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -596,12 +596,12 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 32f\n"
"ld1b { z21.b }, p5/Z, [x10]\n"
"ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z21.b, z0.b[1]\n"
"sdot z12.s, z21.b, z1.b[1]\n"
"sdot z16.s, z21.b, z2.b[1]\n"
- "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z13.s, z20.b, z1.b[1]\n"
"sdot z17.s, z20.b, z2.b[1]\n"
"ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -615,12 +615,12 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 32f\n"
"ld1b { z21.b }, p5/Z, [x10]\n"
"ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z21.b, z0.b[2]\n"
"sdot z12.s, z21.b, z1.b[2]\n"
"sdot z16.s, z21.b, z2.b[2]\n"
- "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z13.s, z20.b, z1.b[2]\n"
"sdot z17.s, z20.b, z2.b[2]\n"
"ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -637,8 +637,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z8.s, z21.b, z0.b[3]\n"
"sdot z12.s, z21.b, z1.b[3]\n"
"sdot z16.s, z21.b, z2.b[3]\n"
- "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z13.s, z20.b, z1.b[3]\n"
"sdot z17.s, z20.b, z2.b[3]\n"
"ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -655,11 +655,11 @@ void sve_hybrid_s8s32_dot_6x4VL (
"cmp x28, x20\n"
"bne 27b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p4, [x21]\n"
@@ -678,7 +678,7 @@ void sve_hybrid_s8s32_dot_6x4VL (
"34:" // Height 4
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"35:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -690,13 +690,13 @@ void sve_hybrid_s8s32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x22]\n"
"ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
@@ -731,8 +731,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov x28, #0x0\n"
"38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 39f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -757,25 +757,25 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 42f\n"
"41:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z8.s, z25.b, z3.b[0]\n"
"sdot z12.s, z25.b, z2.b[0]\n"
- "sdot z9.s, z24.b, z3.b[0]\n"
- "sdot z13.s, z24.b, z2.b[0]\n"
"sdot z16.s, z25.b, z1.b[0]\n"
"sdot z20.s, z25.b, z0.b[0]\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ "sdot z9.s, z24.b, z3.b[0]\n"
+ "sdot z13.s, z24.b, z2.b[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"sdot z17.s, z24.b, z1.b[0]\n"
"sdot z21.s, z24.b, z0.b[0]\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -804,9 +804,9 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z14.s, z25.b, z2.b[1]\n"
"sdot z18.s, z25.b, z1.b[1]\n"
"sdot z22.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
"sdot z11.s, z24.b, z3.b[1]\n"
"sdot z15.s, z24.b, z2.b[1]\n"
- "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
"sdot z19.s, z24.b, z1.b[1]\n"
"sdot z23.s, z24.b, z0.b[1]\n"
"ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
@@ -851,20 +851,20 @@ void sve_hybrid_s8s32_dot_6x4VL (
"bgt 41b\n"
"42:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z8.s, z25.b, z0.b[0]\n"
"sdot z12.s, z25.b, z1.b[0]\n"
- "sdot z9.s, z24.b, z0.b[0]\n"
- "sdot z13.s, z24.b, z1.b[0]\n"
"sdot z16.s, z25.b, z2.b[0]\n"
"sdot z20.s, z25.b, z3.b[0]\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[0]\n"
+ "sdot z13.s, z24.b, z1.b[0]\n"
"sdot z17.s, z24.b, z2.b[0]\n"
"sdot z21.s, z24.b, z3.b[0]\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -880,12 +880,12 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 43f\n"
"ld1b { z25.b }, p5/Z, [x10]\n"
"ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z25.b, z0.b[1]\n"
"sdot z12.s, z25.b, z1.b[1]\n"
"sdot z16.s, z25.b, z2.b[1]\n"
"sdot z20.s, z25.b, z3.b[1]\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z9.s, z24.b, z0.b[1]\n"
"sdot z13.s, z24.b, z1.b[1]\n"
"sdot z17.s, z24.b, z2.b[1]\n"
@@ -903,12 +903,12 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 43f\n"
"ld1b { z25.b }, p5/Z, [x10]\n"
"ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z25.b, z0.b[2]\n"
"sdot z12.s, z25.b, z1.b[2]\n"
"sdot z16.s, z25.b, z2.b[2]\n"
"sdot z20.s, z25.b, z3.b[2]\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"sdot z9.s, z24.b, z0.b[2]\n"
"sdot z13.s, z24.b, z1.b[2]\n"
"sdot z17.s, z24.b, z2.b[2]\n"
@@ -951,12 +951,12 @@ void sve_hybrid_s8s32_dot_6x4VL (
"cmp x28, x20\n"
"bne 38b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x9]\n"
- "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"add x22, x9, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
"add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p4, [x22]\n"
@@ -979,7 +979,7 @@ void sve_hybrid_s8s32_dot_6x4VL (
"45:" // Height 5
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"46:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -991,16 +991,16 @@ void sve_hybrid_s8s32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x23, x9, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x22]\n"
@@ -1041,8 +1041,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov x28, #0x0\n"
"49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1070,29 +1070,29 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 53f\n"
"52:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z29.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z4.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
"sdot z8.s, z29.b, z4.b[0]\n"
"sdot z12.s, z29.b, z3.b[0]\n"
- "sdot z9.s, z28.b, z4.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z16.s, z29.b, z2.b[0]\n"
"sdot z20.s, z29.b, z1.b[0]\n"
+ "add x25, x25, #0x10\n"
"sdot z24.s, z29.b, z0.b[0]\n"
- "sdot z13.s, z28.b, z3.b[0]\n"
+ "sdot z9.s, z28.b, z4.b[0]\n"
"ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z28.b, z3.b[0]\n"
"sdot z17.s, z28.b, z2.b[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"sdot z21.s, z28.b, z1.b[0]\n"
"sdot z25.s, z28.b, z0.b[0]\n"
"ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1101,8 +1101,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z18.s, z29.b, z2.b[0]\n"
"sdot z22.s, z29.b, z1.b[0]\n"
"sdot z26.s, z29.b, z0.b[0]\n"
- "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
"sdot z11.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
"sdot z15.s, z28.b, z3.b[0]\n"
"sdot z19.s, z28.b, z2.b[0]\n"
"sdot z23.s, z28.b, z1.b[0]\n"
@@ -1113,8 +1113,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z16.s, z29.b, z2.b[1]\n"
"sdot z20.s, z29.b, z1.b[1]\n"
"sdot z24.s, z29.b, z0.b[1]\n"
- "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
"sdot z9.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
"sdot z13.s, z28.b, z3.b[1]\n"
"sdot z17.s, z28.b, z2.b[1]\n"
"sdot z21.s, z28.b, z1.b[1]\n"
@@ -1127,8 +1127,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z22.s, z29.b, z1.b[1]\n"
"sdot z26.s, z29.b, z0.b[1]\n"
"sdot z11.s, z28.b, z4.b[1]\n"
- "sdot z15.s, z28.b, z3.b[1]\n"
"ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[1]\n"
"sdot z19.s, z28.b, z2.b[1]\n"
"sdot z23.s, z28.b, z1.b[1]\n"
"sdot z27.s, z28.b, z0.b[1]\n"
@@ -1138,8 +1138,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z16.s, z29.b, z2.b[2]\n"
"sdot z20.s, z29.b, z1.b[2]\n"
"sdot z24.s, z29.b, z0.b[2]\n"
- "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
"sdot z9.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
"sdot z13.s, z28.b, z3.b[2]\n"
"sdot z17.s, z28.b, z2.b[2]\n"
"sdot z21.s, z28.b, z1.b[2]\n"
@@ -1150,8 +1150,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z18.s, z29.b, z2.b[2]\n"
"sdot z22.s, z29.b, z1.b[2]\n"
"sdot z26.s, z29.b, z0.b[2]\n"
- "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
"sdot z11.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
"sdot z15.s, z28.b, z3.b[2]\n"
"sdot z19.s, z28.b, z2.b[2]\n"
"sdot z23.s, z28.b, z1.b[2]\n"
@@ -1162,8 +1162,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z16.s, z29.b, z2.b[3]\n"
"sdot z20.s, z29.b, z1.b[3]\n"
"sdot z24.s, z29.b, z0.b[3]\n"
- "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
"sdot z9.s, z28.b, z4.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
"sdot z13.s, z28.b, z3.b[3]\n"
"sdot z17.s, z28.b, z2.b[3]\n"
"sdot z21.s, z28.b, z1.b[3]\n"
@@ -1182,23 +1182,23 @@ void sve_hybrid_s8s32_dot_6x4VL (
"bgt 52b\n"
"53:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z29.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
"sdot z8.s, z29.b, z0.b[0]\n"
"sdot z12.s, z29.b, z1.b[0]\n"
- "sdot z9.s, z28.b, z0.b[0]\n"
- "sdot z13.s, z28.b, z1.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z16.s, z29.b, z2.b[0]\n"
"sdot z20.s, z29.b, z3.b[0]\n"
"sdot z24.s, z29.b, z4.b[0]\n"
- "sdot z17.s, z28.b, z2.b[0]\n"
+ "sdot z9.s, z28.b, z0.b[0]\n"
"ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[0]\n"
+ "sdot z17.s, z28.b, z2.b[0]\n"
"sdot z21.s, z28.b, z3.b[0]\n"
"sdot z25.s, z28.b, z4.b[0]\n"
"ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1216,21 +1216,21 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 54f\n"
"ld1b { z29.b }, p5/Z, [x10]\n"
"ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z29.b, z0.b[1]\n"
"sdot z12.s, z29.b, z1.b[1]\n"
"sdot z16.s, z29.b, z2.b[1]\n"
"sdot z20.s, z29.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
"sdot z24.s, z29.b, z4.b[1]\n"
- "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z28.b, z0.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z13.s, z28.b, z1.b[1]\n"
"sdot z17.s, z28.b, z2.b[1]\n"
"sdot z21.s, z28.b, z3.b[1]\n"
"sdot z25.s, z28.b, z4.b[1]\n"
"ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z29.b, z0.b[1]\n"
"addvl x10, x10, #4\n"
+ "sdot z10.s, z29.b, z0.b[1]\n"
"sdot z14.s, z29.b, z1.b[1]\n"
"sdot z18.s, z29.b, z2.b[1]\n"
"sdot z22.s, z29.b, z3.b[1]\n"
@@ -1243,21 +1243,21 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 54f\n"
"ld1b { z29.b }, p5/Z, [x10]\n"
"ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z29.b, z0.b[2]\n"
"sdot z12.s, z29.b, z1.b[2]\n"
"sdot z16.s, z29.b, z2.b[2]\n"
"sdot z20.s, z29.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
"sdot z24.s, z29.b, z4.b[2]\n"
- "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z28.b, z0.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z13.s, z28.b, z1.b[2]\n"
"sdot z17.s, z28.b, z2.b[2]\n"
"sdot z21.s, z28.b, z3.b[2]\n"
"sdot z25.s, z28.b, z4.b[2]\n"
"ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z29.b, z0.b[2]\n"
"addvl x10, x10, #4\n"
+ "sdot z10.s, z29.b, z0.b[2]\n"
"sdot z14.s, z29.b, z1.b[2]\n"
"sdot z18.s, z29.b, z2.b[2]\n"
"sdot z22.s, z29.b, z3.b[2]\n"
@@ -1275,8 +1275,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z16.s, z29.b, z2.b[3]\n"
"sdot z20.s, z29.b, z3.b[3]\n"
"sdot z24.s, z29.b, z4.b[3]\n"
- "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z28.b, z0.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"sdot z13.s, z28.b, z1.b[3]\n"
"sdot z17.s, z28.b, z2.b[3]\n"
"sdot z21.s, z28.b, z3.b[3]\n"
@@ -1299,15 +1299,15 @@ void sve_hybrid_s8s32_dot_6x4VL (
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x9]\n"
- "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z12.s }, p4, [x23]\n"
"st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
"st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
@@ -1331,12 +1331,11 @@ void sve_hybrid_s8s32_dot_6x4VL (
"b 68f\n"
"56:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"57:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1348,17 +1347,17 @@ void sve_hybrid_s8s32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 58f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x24]\n"
"ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x23]\n"
@@ -1407,8 +1406,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov x28, #0x0\n"
"60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1439,29 +1438,29 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 64f\n"
"63:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p5/Z, [x10]\n"
- "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z7.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z6.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z5.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
"ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z8.s, z1.b, z7.b[0]\n"
"sdot z12.s, z1.b, z6.b[0]\n"
- "add x21, x21, #0x10\n"
"sdot z16.s, z1.b, z5.b[0]\n"
"sdot z20.s, z1.b, z4.b[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"sdot z24.s, z1.b, z3.b[0]\n"
"sdot z28.s, z1.b, z2.b[0]\n"
"ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
"sdot z9.s, z0.b, z7.b[0]\n"
"sdot z13.s, z0.b, z6.b[0]\n"
"sdot z17.s, z0.b, z5.b[0]\n"
@@ -1569,24 +1568,24 @@ void sve_hybrid_s8s32_dot_6x4VL (
"bgt 63b\n"
"64:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
"ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
"sdot z8.s, z7.b, z0.b[0]\n"
"sdot z12.s, z7.b, z1.b[0]\n"
- "sdot z9.s, z6.b, z0.b[0]\n"
- "sdot z13.s, z6.b, z1.b[0]\n"
"sdot z16.s, z7.b, z2.b[0]\n"
"sdot z20.s, z7.b, z3.b[0]\n"
"sdot z24.s, z7.b, z4.b[0]\n"
"sdot z28.s, z7.b, z5.b[0]\n"
"ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[0]\n"
+ "sdot z13.s, z6.b, z1.b[0]\n"
"sdot z17.s, z6.b, z2.b[0]\n"
"sdot z21.s, z6.b, z3.b[0]\n"
"sdot z25.s, z6.b, z4.b[0]\n"
@@ -1608,23 +1607,23 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 65f\n"
"ld1b { z7.b }, p5/Z, [x10]\n"
"ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z7.b, z0.b[1]\n"
"sdot z12.s, z7.b, z1.b[1]\n"
"sdot z16.s, z7.b, z2.b[1]\n"
"sdot z20.s, z7.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
"sdot z24.s, z7.b, z4.b[1]\n"
"sdot z28.s, z7.b, z5.b[1]\n"
- "sdot z9.s, z6.b, z0.b[1]\n"
"ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[1]\n"
"sdot z13.s, z6.b, z1.b[1]\n"
"sdot z17.s, z6.b, z2.b[1]\n"
"sdot z21.s, z6.b, z3.b[1]\n"
"sdot z25.s, z6.b, z4.b[1]\n"
"sdot z29.s, z6.b, z5.b[1]\n"
"ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z7.b, z0.b[1]\n"
"addvl x10, x10, #4\n"
+ "sdot z10.s, z7.b, z0.b[1]\n"
"sdot z14.s, z7.b, z1.b[1]\n"
"sdot z18.s, z7.b, z2.b[1]\n"
"sdot z22.s, z7.b, z3.b[1]\n"
@@ -1639,23 +1638,23 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ble 65f\n"
"ld1b { z7.b }, p5/Z, [x10]\n"
"ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"sdot z8.s, z7.b, z0.b[2]\n"
"sdot z12.s, z7.b, z1.b[2]\n"
"sdot z16.s, z7.b, z2.b[2]\n"
"sdot z20.s, z7.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
"sdot z24.s, z7.b, z4.b[2]\n"
"sdot z28.s, z7.b, z5.b[2]\n"
- "sdot z9.s, z6.b, z0.b[2]\n"
"ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[2]\n"
"sdot z13.s, z6.b, z1.b[2]\n"
"sdot z17.s, z6.b, z2.b[2]\n"
"sdot z21.s, z6.b, z3.b[2]\n"
"sdot z25.s, z6.b, z4.b[2]\n"
"sdot z29.s, z6.b, z5.b[2]\n"
"ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z7.b, z0.b[2]\n"
"addvl x10, x10, #4\n"
+ "sdot z10.s, z7.b, z0.b[2]\n"
"sdot z14.s, z7.b, z1.b[2]\n"
"sdot z18.s, z7.b, z2.b[2]\n"
"sdot z22.s, z7.b, z3.b[2]\n"
@@ -1703,17 +1702,17 @@ void sve_hybrid_s8s32_dot_6x4VL (
"cmp x28, x20\n"
"bne 60b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x9]\n"
- "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "add x21, x22, x20, LSL #2\n"
"st1w { z12.s }, p4, [x24]\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
"st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
"st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
@@ -1749,8 +1748,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"68:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
index fceaeb119c..8135172b54 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
@@ -70,7 +70,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 8, 8> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 8> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
index d257cc69de..abe4b92faf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
@@ -44,18 +44,18 @@ void sve_hybrid_s8s32_mmla_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -89,7 +89,7 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"beq 12f\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -100,14 +100,14 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"incw x20\n"
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 3f\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z19.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "zip1 z9.d, z19.d, z13.d\n"
- "zip2 z13.d, z19.d, z13.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
"zip1 z10.d, z17.d, z14.d\n"
"zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
@@ -126,8 +126,8 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"mov x28, #0x0\n"
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -143,87 +143,87 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ble 9f\n"
"8:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z16.b }, p5/Z, [x10]\n"
- "ld1b { z17.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z19.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "trn1 z18.d, z19.d, z20.d\n"
- "trn2 z19.d, z19.d, z20.d\n"
- ".inst 0x45109a48 // smmla z8.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45119a4c // smmla z12.s, z18.b, z17.b\n"
- "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45109a49 // smmla z9.s, z18.b, z16.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45149a4d // smmla z13.s, z18.b, z20.b\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45019a4a // smmla z10.s, z18.b, z1.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45109a68 // smmla z8.s, z19.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45119a6c // smmla z12.s, z19.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45109a69 // smmla z9.s, z19.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
"ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45119a6d // smmla z13.s, z19.b, z17.b\n"
- "ld1b { z3.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45109a6a // smmla z10.s, z19.b, z16.b\n"
+ "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45109a8a // smmla z10.s, z20.b, z16.b\n"
+ ".inst 0x45079a8e // smmla z14.s, z20.b, z7.b\n"
"ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45039a6e // smmla z14.s, z19.b, z3.b\n"
"ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45119a6b // smmla z11.s, z19.b, z17.b\n"
- ".inst 0x45109a6f // smmla z15.s, z19.b, z16.b\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
"bgt 8b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
"trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z19.d\n"
".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "addvl x10, x10, #8\n"
"ble 10f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
+ "addvl x10, x10, #8\n"
"10:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -231,9 +231,9 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"bne 5b\n"
"uzp1 z8.d, z8.d, z12.d\n"
"uzp1 z9.d, z9.d, z13.d\n"
+ "st1w { z8.s }, p4, [x9]\n"
"uzp1 z10.d, z10.d, z14.d\n"
"uzp1 z11.d, z11.d, z15.d\n"
- "st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
@@ -246,7 +246,7 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"12:" // Height 2
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"13:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -258,19 +258,19 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 14f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z18.s }, p4/Z, [x9]\n"
- "ld1w { z24.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z18.d, z12.d\n"
+ "zip2 z12.d, z18.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z2.d, z13.d\n"
+ "zip2 z13.d, z2.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z8.d, z18.d, z12.d\n"
- "zip2 z12.d, z18.d, z12.d\n"
- "zip1 z9.d, z24.d, z13.d\n"
- "zip2 z13.d, z24.d, z13.d\n"
"zip1 z10.d, z17.d, z14.d\n"
"zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
@@ -289,8 +289,8 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"mov x28, #0x0\n"
"16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -309,109 +309,109 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ble 20f\n"
"19:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z19.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "trn1 z18.d, z19.d, z25.d\n"
- "trn2 z19.d, z19.d, z25.d\n"
".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
"ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45119a68 // smmla z8.s, z19.b, z17.b\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
"ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45109a6c // smmla z12.s, z19.b, z16.b\n"
"ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45119a69 // smmla z9.s, z19.b, z17.b\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
"ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45109a6d // smmla z13.s, z19.b, z16.b\n"
"ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45119a6a // smmla z10.s, z19.b, z17.b\n"
+ ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n"
+ ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n"
"ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45109a6e // smmla z14.s, z19.b, z16.b\n"
"ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45119a6b // smmla z11.s, z19.b, z17.b\n"
- ".inst 0x45109a6f // smmla z15.s, z19.b, z16.b\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"bgt 19b\n"
"20:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
"ld1rqb { z19.b }, p0/Z, [x25]\n"
"trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z19.d\n"
".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "addvl x10, x10, #8\n"
"ble 21f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
+ "addvl x10, x10, #8\n"
"21:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 16b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z17.d, z8.d, z12.d\n"
+ "add x20, x9, x20, LSL #2\n"
+ "uzp1 z16.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z16.d, z9.d, z13.d\n"
+ "uzp1 z17.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z12.d, z10.d, z14.d\n"
+ "st1w { z16.s }, p4, [x9]\n"
+ "uzp1 z16.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x20, x9, x20, LSL #2\n"
- "uzp1 z26.d, z11.d, z15.d\n"
+ "st1w { z17.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z2.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z17.s }, p4, [x9]\n"
- "st1w { z16.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z12.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z26.s }, p1, [x9, #3, MUL VL]\n"
+ "st1w { z16.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z8.s }, p4, [x20]\n"
"st1w { z9.s }, p3, [x20, #1, MUL VL]\n"
@@ -425,7 +425,7 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"23:" // Height 3
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"24:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -437,28 +437,28 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 25f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p4/Z, [x9]\n"
- "ld1w { z26.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x21, x9, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
- "zip1 z8.d, z24.d, z12.d\n"
- "zip2 z12.d, z24.d, z12.d\n"
- "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z9.d, z26.d, z13.d\n"
- "zip2 z13.d, z26.d, z13.d\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
@@ -489,8 +489,8 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"mov x28, #0x0\n"
"27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 28f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -512,92 +512,92 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ble 31f\n"
"30:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z27.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
"ld1rqb { z24.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqb { z26.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "trn1 z6.d, z27.d, z24.d\n"
- "trn2 z27.d, z27.d, z24.d\n"
- "trn1 z30.d, z26.d, z29.d\n"
- "trn2 z26.d, z26.d, z29.d\n"
- ".inst 0x451998c8 // smmla z8.s, z6.b, z25.b\n"
- ".inst 0x451c98cc // smmla z12.s, z6.b, z28.b\n"
- ".inst 0x45199bd0 // smmla z16.s, z30.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x451c9bd4 // smmla z20.s, z30.b, z28.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x451998c9 // smmla z9.s, z6.b, z25.b\n"
- ".inst 0x45199bd1 // smmla z17.s, z30.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x451898cd // smmla z13.s, z6.b, z24.b\n"
- ".inst 0x45189bd5 // smmla z21.s, z30.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x451998ca // smmla z10.s, z6.b, z25.b\n"
- ".inst 0x45199bd2 // smmla z18.s, z30.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x451898ce // smmla z14.s, z6.b, z24.b\n"
- ".inst 0x45189bd6 // smmla z22.s, z30.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x451998cb // smmla z11.s, z6.b, z25.b\n"
- ".inst 0x45199bd3 // smmla z19.s, z30.b, z25.b\n"
- ".inst 0x451898cf // smmla z15.s, z6.b, z24.b\n"
- ".inst 0x45189bd7 // smmla z23.s, z30.b, z24.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "cmp x27, #0x10\n"
".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
+ ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
+ ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
+ ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
+ ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
+ ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
+ ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
+ ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
+ ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
"bgt 30b\n"
"31:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
"ld1rqb { z24.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
"trn1 z27.d, z1.d, z24.d\n"
"trn2 z1.d, z1.d, z24.d\n"
- "trn1 z26.d, z3.d, z29.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
- ".inst 0x451c9b6c // smmla z12.s, z27.b, z28.b\n"
- "trn2 z3.d, z3.d, z29.d\n"
".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x451c9b54 // smmla z20.s, z26.b, z28.b\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
@@ -614,9 +614,9 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
@@ -641,26 +641,26 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 27b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z27.d, z8.d, z12.d\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp1 z25.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z26.d, z9.d, z13.d\n"
+ "uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z27.s }, p4, [x9]\n"
"uzp1 z16.d, z16.d, z20.d\n"
- "uzp1 z17.d, z17.d, z21.d\n"
- "st1w { z26.s }, p3, [x9, #1, MUL VL]\n"
- "uzp1 z18.d, z18.d, z22.d\n"
- "uzp1 z19.d, z19.d, z23.d\n"
- "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
"st1w { z8.s }, p4, [x21]\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
"st1w { z9.s }, p3, [x21, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x21, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x21, #3, MUL VL]\n"
@@ -676,7 +676,7 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"34:" // Height 4
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"35:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -688,37 +688,37 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x22, x9, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x20]\n"
- "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
@@ -745,8 +745,8 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"mov x28, #0x0\n"
"38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 39f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -771,114 +771,114 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ble 42f\n"
"41:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z31.b }, p5/Z, [x10]\n"
- "ld1b { z30.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z29.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
"ld1rqb { z28.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- "trn1 z27.d, z29.d, z25.d\n"
- "trn2 z29.d, z29.d, z25.d\n"
- "trn1 z26.d, z28.d, z24.d\n"
- "trn2 z28.d, z28.d, z24.d\n"
- ".inst 0x451f9b68 // smmla z8.s, z27.b, z31.b\n"
- ".inst 0x451e9b6c // smmla z12.s, z27.b, z30.b\n"
- ".inst 0x451f9b50 // smmla z16.s, z26.b, z31.b\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199ba8 // smmla z8.s, z29.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x451e9b54 // smmla z20.s, z26.b, z30.b\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n"
".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n"
".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n"
".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n"
".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n"
".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
- ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
- ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
"ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45199ba8 // smmla z8.s, z29.b, z25.b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
- ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
"bgt 41b\n"
"42:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z29.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
- "trn1 z27.d, z1.d, z25.d\n"
- "trn2 z1.d, z1.d, z25.d\n"
- "trn1 z26.d, z3.d, z24.d\n"
- ".inst 0x451d9b68 // smmla z8.s, z27.b, z29.b\n"
- ".inst 0x451c9b6c // smmla z12.s, z27.b, z28.b\n"
- "trn2 z3.d, z3.d, z24.d\n"
- ".inst 0x451d9b50 // smmla z16.s, z26.b, z29.b\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199b88 // smmla z8.s, z28.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b8c // smmla z12.s, z28.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x451c9b54 // smmla z20.s, z26.b, z28.b\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199b89 // smmla z9.s, z28.b, z25.b\n"
".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45189b8d // smmla z13.s, z28.b, z24.b\n"
".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x45199b8a // smmla z10.s, z28.b, z25.b\n"
".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ ".inst 0x45189b8e // smmla z14.s, z28.b, z24.b\n"
".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ ".inst 0x45199b8b // smmla z11.s, z28.b, z25.b\n"
".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
- ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ ".inst 0x45189b8f // smmla z15.s, z28.b, z24.b\n"
".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
"ble 43f\n"
"ld1b { z25.b }, p5/Z, [x10]\n"
"ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
@@ -903,33 +903,33 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 38b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp1 z25.d, z8.d, z12.d\n"
+ "add x20, x21, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z27.d, z10.d, z14.d\n"
+ "uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
- "uzp1 z26.d, z11.d, z15.d\n"
+ "uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z25.s }, p4, [x9]\n"
"uzp1 z25.d, z16.d, z20.d\n"
+ "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
"uzp1 z24.d, z17.d, z21.d\n"
+ "st1w { z8.s }, p4, [x22]\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "st1w { z27.s }, p2, [x9, #2, MUL VL]\n"
"uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "st1w { z26.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"uzp1 z20.d, z19.d, z23.d\n"
- "uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z8.s }, p4, [x22]\n"
- "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x22, #2, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
"st1w { z11.s }, p1, [x22, #3, MUL VL]\n"
"st1w { z25.s }, p4, [x21]\n"
"st1w { z24.s }, p3, [x21, #1, MUL VL]\n"
@@ -947,7 +947,7 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"45:" // Height 5
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"46:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -959,46 +959,46 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x21]\n"
- "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x20]\n"
- "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z19.d, z24.d, z23.d\n"
"zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z24.d, z25.d, z28.d\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
@@ -1037,8 +1037,8 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"mov x28, #0x0\n"
"49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1066,103 +1066,102 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ble 53f\n"
"52:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p5/Z, [x10]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z6.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
"ld1rqb { z7.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
"trn1 z3.d, z7.d, z2.d\n"
"trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
"ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45019888 // smmla z8.s, z4.b, z1.b\n"
+ ".inst 0x450198a8 // smmla z8.s, z5.b, z1.b\n"
".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n"
".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4500988c // smmla z12.s, z4.b, z0.b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x450098ac // smmla z12.s, z5.b, z0.b\n"
".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45019889 // smmla z9.s, z4.b, z1.b\n"
+ ".inst 0x450198a9 // smmla z9.s, z5.b, z1.b\n"
+ "add x25, x25, #0x10\n"
".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n"
".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4500988d // smmla z13.s, z4.b, z0.b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x450098ad // smmla z13.s, z5.b, z0.b\n"
".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4501988a // smmla z10.s, z4.b, z1.b\n"
+ ".inst 0x450198aa // smmla z10.s, z5.b, z1.b\n"
".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n"
".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4500988e // smmla z14.s, z4.b, z0.b\n"
+ ".inst 0x450098ae // smmla z14.s, z5.b, z0.b\n"
".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x4501988b // smmla z11.s, z4.b, z1.b\n"
+ ".inst 0x450198ab // smmla z11.s, z5.b, z1.b\n"
".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n"
".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
- ".inst 0x4500988f // smmla z15.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x450098af // smmla z15.s, z5.b, z0.b\n"
".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n"
- ".inst 0x450198b8 // smmla z24.s, z5.b, z1.b\n"
+ ".inst 0x45019898 // smmla z24.s, z4.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n"
- ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n"
- ".inst 0x450198b9 // smmla z25.s, z5.b, z1.b\n"
+ ".inst 0x45019899 // smmla z25.s, z4.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n"
- ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n"
- ".inst 0x450198ba // smmla z26.s, z5.b, z1.b\n"
+ ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n"
- ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n"
- ".inst 0x450198bb // smmla z27.s, z5.b, z1.b\n"
+ ".inst 0x4501989b // smmla z27.s, z4.b, z1.b\n"
".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n"
- ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"bgt 52b\n"
"53:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z2.b }, p5/Z, [x10]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z6.b }, p0/Z, [x25]\n"
+ "ld1rqb { z4.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
"trn1 z4.d, z5.d, z0.d\n"
"trn2 z5.d, z5.d, z0.d\n"
"ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
@@ -1170,6 +1169,7 @@ void sve_hybrid_s8s32_mmla_6x4VL (
".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
"ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
@@ -1190,8 +1190,8 @@ void sve_hybrid_s8s32_mmla_6x4VL (
".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
+ "addvl x10, x10, #8\n"
".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
@@ -1203,24 +1203,24 @@ void sve_hybrid_s8s32_mmla_6x4VL (
".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
@@ -1237,39 +1237,39 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z1.d, z8.d, z12.d\n"
- "uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z0.d, z9.d, z13.d\n"
- "uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z3.d, z10.d, z14.d\n"
- "uzp2 z10.d, z10.d, z14.d\n"
"add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "uzp1 z2.d, z8.d, z12.d\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z1.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z0.d, z10.d, z14.d\n"
+ "st1w { z2.s }, p4, [x9]\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
"uzp1 z2.d, z11.d, z15.d\n"
+ "st1w { z1.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z1.s }, p4, [x9]\n"
- "add x20, x21, x20, LSL #2\n"
"uzp1 z1.d, z16.d, z20.d\n"
+ "st1w { z0.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "st1w { z0.s }, p3, [x9, #1, MUL VL]\n"
"uzp1 z0.d, z17.d, z21.d\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "st1w { z3.s }, p2, [x9, #2, MUL VL]\n"
"uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"uzp1 z20.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z8.s }, p4, [x23]\n"
"uzp1 z24.d, z24.d, z28.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
"uzp1 z25.d, z25.d, z29.d\n"
- "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
"uzp1 z26.d, z26.d, z30.d\n"
- "uzp1 z27.d, z27.d, z31.d\n"
- "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
"st1w { z1.s }, p4, [x22]\n"
"st1w { z0.s }, p3, [x22, #1, MUL VL]\n"
"st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
@@ -1289,12 +1289,11 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"b 68f\n"
"56:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"57:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1306,54 +1305,54 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 58f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
"ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
"ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x21]\n"
- "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z28.s }, p4/Z, [x20]\n"
- "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
"zip2 z23.d, z24.d, z23.d\n"
"zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
@@ -1389,8 +1388,8 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"mov x28, #0x0\n"
"60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1421,113 +1420,113 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ble 64f\n"
"63:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p5/Z, [x10]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z6.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqb { z7.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
"ld1rqb { z0.b }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
- "add x21, x21, #0x10\n"
- "trn1 z3.d, z7.d, z2.d\n"
- "trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
"ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45019888 // smmla z8.s, z4.b, z1.b\n"
- ".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n"
+ ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
+ ".inst 0x45019890 // smmla z16.s, z4.b, z1.b\n"
".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4500988c // smmla z12.s, z4.b, z0.b\n"
- ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
+ ".inst 0x45009894 // smmla z20.s, z4.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45019889 // smmla z9.s, z4.b, z1.b\n"
- ".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n"
+ ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45019891 // smmla z17.s, z4.b, z1.b\n"
".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4500988d // smmla z13.s, z4.b, z0.b\n"
- ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
+ ".inst 0x45009895 // smmla z21.s, z4.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4501988a // smmla z10.s, z4.b, z1.b\n"
- ".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n"
+ ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45019892 // smmla z18.s, z4.b, z1.b\n"
".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4500988e // smmla z14.s, z4.b, z0.b\n"
- ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
+ ".inst 0x45009896 // smmla z22.s, z4.b, z0.b\n"
".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x4501988b // smmla z11.s, z4.b, z1.b\n"
- ".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n"
+ ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
+ ".inst 0x45019893 // smmla z19.s, z4.b, z1.b\n"
".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
- ".inst 0x4500988f // smmla z15.s, z4.b, z0.b\n"
- ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
- ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
+ ".inst 0x45009897 // smmla z23.s, z4.b, z0.b\n"
+ ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
- ".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n"
- ".inst 0x450198b8 // smmla z24.s, z5.b, z1.b\n"
+ ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n"
+ ".inst 0x450198b0 // smmla z16.s, z5.b, z1.b\n"
+ ".inst 0x45019878 // smmla z24.s, z3.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
- ".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n"
- ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098b4 // smmla z20.s, z5.b, z0.b\n"
+ ".inst 0x4500987c // smmla z28.s, z3.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
- ".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n"
- ".inst 0x450198b9 // smmla z25.s, z5.b, z1.b\n"
+ ".inst 0x450198e9 // smmla z9.s, z7.b, z1.b\n"
+ ".inst 0x450198b1 // smmla z17.s, z5.b, z1.b\n"
+ ".inst 0x45019879 // smmla z25.s, z3.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
- ".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n"
- ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098b5 // smmla z21.s, z5.b, z0.b\n"
+ ".inst 0x4500987d // smmla z29.s, z3.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
- ".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n"
- ".inst 0x450198ba // smmla z26.s, z5.b, z1.b\n"
+ ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n"
+ ".inst 0x450198b2 // smmla z18.s, z5.b, z1.b\n"
+ ".inst 0x4501987a // smmla z26.s, z3.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
- ".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n"
- ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098b6 // smmla z22.s, z5.b, z0.b\n"
+ ".inst 0x4500987e // smmla z30.s, z3.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
- ".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n"
- ".inst 0x450198bb // smmla z27.s, z5.b, z1.b\n"
- ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
- ".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n"
- ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
+ ".inst 0x450198eb // smmla z11.s, z7.b, z1.b\n"
+ ".inst 0x450198b3 // smmla z19.s, z5.b, z1.b\n"
+ ".inst 0x4501987b // smmla z27.s, z3.b, z1.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n"
+ ".inst 0x4500987f // smmla z31.s, z3.b, z0.b\n"
"bgt 63b\n"
"64:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z2.b }, p5/Z, [x10]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z6.b }, p0/Z, [x25]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
"ld1rqb { z0.b }, p0/Z, [x21]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
"trn1 z4.d, z5.d, z0.d\n"
"trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
"ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
"ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
@@ -1548,8 +1547,8 @@ void sve_hybrid_s8s32_mmla_6x4VL (
".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
+ "addvl x10, x10, #8\n"
".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
@@ -1561,24 +1560,24 @@ void sve_hybrid_s8s32_mmla_6x4VL (
".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
@@ -1595,46 +1594,46 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 60b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp1 z0.d, z8.d, z12.d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
+ "add x20, x21, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
+ "st1w { z0.s }, p4, [x9]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z0.s }, p4, [x9]\n"
- "add x21, x22, x20, LSL #2\n"
"uzp1 z15.d, z16.d, z20.d\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"uzp1 z20.d, z17.d, z21.d\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z8.s }, p4, [x24]\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"uzp1 z22.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z8.s }, p4, [x24]\n"
"uzp1 z23.d, z24.d, z28.d\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
"uzp2 z24.d, z24.d, z28.d\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
"uzp1 z28.d, z25.d, z29.d\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
"uzp2 z25.d, z25.d, z29.d\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
"uzp1 z29.d, z26.d, z30.d\n"
+ "st1w { z15.s }, p4, [x23]\n"
"uzp2 z26.d, z26.d, z30.d\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
"uzp1 z30.d, z27.d, z31.d\n"
- "uzp2 z27.d, z27.d, z31.d\n"
- "st1w { z15.s }, p4, [x23]\n"
"st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
"st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
"st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
"st1w { z16.s }, p4, [x22]\n"
@@ -1665,8 +1664,8 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"68:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
index 8d508f94f0..839ff6f0af 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
@@ -70,7 +70,7 @@ public:
return false;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 4, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 4, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
index 7b598bac57..85e933fd46 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -45,18 +45,18 @@ void sve_hybrid_u8qa_dot_4x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -91,20 +91,20 @@ void sve_hybrid_u8qa_dot_4x4VL (
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"3:" // Height 1: setup done
"mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -120,41 +120,41 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ble 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z21.b }, p2/Z, [x28]\n"
- "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z23.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1b { z22.b }, p2/Z, [x28, #6, MUL VL]\n"
- "add x24, x24, #0x10\n"
- "udot z16.s, z21.b, z0.b[0]\n"
- "ld1b { z21.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "udot z17.s, z26.b, z0.b[0]\n"
- "udot z18.s, z25.b, z0.b[0]\n"
- "udot z19.s, z24.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "udot z16.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z17.s, z21.b, z0.b[0]\n"
+ "udot z18.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
"udot z16.s, z20.b, z0.b[1]\n"
- "ld1b { z20.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "udot z17.s, z23.b, z0.b[1]\n"
- "ld1b { z23.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "udot z18.s, z22.b, z0.b[1]\n"
- "ld1b { z22.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "udot z19.s, z21.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "udot z17.s, z21.b, z0.b[1]\n"
+ "udot z18.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ "udot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "udot z16.s, z22.b, z0.b[2]\n"
+ "udot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "udot z18.s, z21.b, z0.b[2]\n"
+ "udot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-3, MUL VL]\n"
"ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "udot z16.s, z20.b, z0.b[2]\n"
+ "udot z16.s, z22.b, z0.b[3]\n"
+ "udot z17.s, z20.b, z0.b[3]\n"
"ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "udot z17.s, z26.b, z0.b[2]\n"
- "udot z18.s, z25.b, z0.b[2]\n"
- "udot z19.s, z24.b, z0.b[2]\n"
- "udot z16.s, z23.b, z0.b[3]\n"
- "udot z17.s, z22.b, z0.b[3]\n"
"udot z18.s, z21.b, z0.b[3]\n"
"udot z19.s, z20.b, z0.b[3]\n"
+ "add x24, x24, #0x10\n"
"tbnz %x[flags], #31, 8f\n"
"udot z11.s, z0.b, z15.b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
@@ -163,49 +163,49 @@ void sve_hybrid_u8qa_dot_4x4VL (
"bgt 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
- "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z22.b }, p2/Z, [x28]\n"
"subs x25, x25, #0x4\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
"ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z16.s, z22.b, z0.b[0]\n"
+ "udot z17.s, z20.b, z0.b[0]\n"
"ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "udot z16.s, z23.b, z0.b[0]\n"
- "udot z17.s, z22.b, z0.b[0]\n"
"udot z18.s, z21.b, z0.b[0]\n"
"udot z19.s, z20.b, z0.b[0]\n"
+ "addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
"ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "udot z16.s, z20.b, z0.b[1]\n"
"ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z16.s, z23.b, z0.b[1]\n"
"udot z17.s, z22.b, z0.b[1]\n"
"udot z18.s, z21.b, z0.b[1]\n"
"udot z19.s, z20.b, z0.b[1]\n"
+ "addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
"ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "udot z16.s, z20.b, z0.b[2]\n"
"ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z16.s, z23.b, z0.b[2]\n"
"udot z17.s, z22.b, z0.b[2]\n"
"udot z18.s, z21.b, z0.b[2]\n"
"udot z19.s, z20.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
- "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z21.b, z0.b[3]\n"
+ "udot z17.s, z20.b, z0.b[3]\n"
"ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z16.s, z23.b, z0.b[3]\n"
- "udot z17.s, z22.b, z0.b[3]\n"
"udot z18.s, z21.b, z0.b[3]\n"
"udot z19.s, z20.b, z0.b[3]\n"
+ "addvl x28, x28, #4\n"
"10:" // Height 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -215,35 +215,35 @@ void sve_hybrid_u8qa_dot_4x4VL (
"cmp x26, x20\n"
"bne 4b\n"
"tbnz %x[flags], #31, 12f\n"
- "mov x21, #0x4\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z20.s, p2/M, z20.s\n"
"uaddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
+ "neg z20.s, p2/M, z20.s\n"
"mul z11.s, p2/M, z11.s, z20.s\n"
"12:" // Height 1: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
"ld1w { z23.s }, p2/Z, [x10]\n"
- "ld1w { z20.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z22.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z16.s, z16.s, z23.s\n"
- "add z17.s, z17.s, z20.s\n"
+ "add z17.s, z17.s, z22.s\n"
+ "add z18.s, z18.s, z21.s\n"
+ "add z19.s, z19.s, z20.s\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- "addvl x10, x10, #4\n"
- "add z18.s, z18.s, z22.s\n"
- "add z19.s, z19.s, z21.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n"
".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n"
+ "addvl x10, x10, #4\n"
".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n"
".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n"
"tbz %x[flags], #5, 13f\n"
@@ -261,19 +261,19 @@ void sve_hybrid_u8qa_dot_4x4VL (
"sqadd z19.s, z19.s, z20.s\n"
"13:" // Height 1: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "ld1rw { z22.s }, p2/Z, [x20]\n"
+ "add z16.s, z16.s, z20.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z20.s\n"
+ "add z18.s, z18.s, z20.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
"ld1rw { z21.s }, p2/Z, [x20]\n"
- "add z16.s, z16.s, z22.s\n"
+ "add z19.s, z19.s, z20.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z22.s\n"
- "add z18.s, z18.s, z22.s\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z22.s\n"
"smin z16.s, p2/M, z16.s, z21.s\n"
"smin z17.s, p2/M, z17.s, z21.s\n"
"smin z18.s, p2/M, z18.s, z21.s\n"
@@ -281,8 +281,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"smax z16.s, p2/M, z16.s, z20.s\n"
"smax z17.s, p2/M, z17.s, z20.s\n"
"smax z18.s, p2/M, z18.s, z20.s\n"
- "smax z19.s, p2/M, z19.s, z20.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z20.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
"st1b { z16.b }, p1, [x27]\n"
@@ -300,24 +300,24 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov z15.b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"16:" // Height 2: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"17:" // Height 2: setup done
"mov x26, #0x0\n"
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -336,45 +336,45 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ble 23f\n"
"21:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z25.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[0]\n"
+ "udot z20.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z26.b, z0.b[0]\n"
+ "udot z21.s, z26.b, z1.b[0]\n"
+ "udot z18.s, z24.b, z0.b[0]\n"
+ "udot z22.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "udot z19.s, z25.b, z0.b[0]\n"
+ "udot z23.s, z25.b, z1.b[0]\n"
"ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
- "udot z16.s, z25.b, z0.b[0]\n"
- "udot z20.s, z25.b, z1.b[0]\n"
"ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "udot z17.s, z30.b, z0.b[0]\n"
- "udot z21.s, z30.b, z1.b[0]\n"
- "udot z18.s, z29.b, z0.b[0]\n"
- "udot z22.s, z29.b, z1.b[0]\n"
- "udot z19.s, z28.b, z0.b[0]\n"
- "udot z23.s, z28.b, z1.b[0]\n"
"udot z16.s, z24.b, z0.b[1]\n"
"udot z20.s, z24.b, z1.b[1]\n"
"ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
"udot z17.s, z27.b, z0.b[1]\n"
"udot z21.s, z27.b, z1.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
"ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
"udot z18.s, z26.b, z0.b[1]\n"
"udot z22.s, z26.b, z1.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
"ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
"udot z19.s, z25.b, z0.b[1]\n"
"udot z23.s, z25.b, z1.b[1]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
"ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
"udot z16.s, z24.b, z0.b[2]\n"
"udot z20.s, z24.b, z1.b[2]\n"
"ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "add x23, x23, #0x10\n"
"udot z17.s, z30.b, z0.b[2]\n"
"udot z21.s, z30.b, z1.b[2]\n"
"udot z18.s, z29.b, z0.b[2]\n"
@@ -398,34 +398,34 @@ void sve_hybrid_u8qa_dot_4x4VL (
"bgt 21b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z27.b }, p2/Z, [x28]\n"
- "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x25, x25, #0x4\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[0]\n"
+ "udot z20.s, z24.b, z1.b[0]\n"
"ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1rqb { z1.b }, p0/Z, [x23]\n"
- "udot z16.s, z27.b, z0.b[0]\n"
- "udot z20.s, z27.b, z1.b[0]\n"
"udot z17.s, z26.b, z0.b[0]\n"
"udot z21.s, z26.b, z1.b[0]\n"
"udot z18.s, z25.b, z0.b[0]\n"
"udot z22.s, z25.b, z1.b[0]\n"
+ "addvl x28, x28, #4\n"
"udot z19.s, z24.b, z0.b[0]\n"
"udot z23.s, z24.b, z1.b[0]\n"
"ble 24f\n"
"ld1b { z27.b }, p2/Z, [x28]\n"
"ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "udot z16.s, z27.b, z0.b[1]\n"
"ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z16.s, z27.b, z0.b[1]\n"
"udot z20.s, z27.b, z1.b[1]\n"
"udot z17.s, z26.b, z0.b[1]\n"
"udot z21.s, z26.b, z1.b[1]\n"
"udot z18.s, z25.b, z0.b[1]\n"
+ "addvl x28, x28, #4\n"
"udot z22.s, z25.b, z1.b[1]\n"
"udot z19.s, z24.b, z0.b[1]\n"
"udot z23.s, z24.b, z1.b[1]\n"
@@ -433,29 +433,29 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1b { z27.b }, p2/Z, [x28]\n"
"ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "udot z16.s, z27.b, z0.b[2]\n"
"ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z16.s, z27.b, z0.b[2]\n"
"udot z20.s, z27.b, z1.b[2]\n"
"udot z17.s, z26.b, z0.b[2]\n"
"udot z21.s, z26.b, z1.b[2]\n"
"udot z18.s, z25.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
"udot z22.s, z25.b, z1.b[2]\n"
"udot z19.s, z24.b, z0.b[2]\n"
"udot z23.s, z24.b, z1.b[2]\n"
"ble 24f\n"
- "ld1b { z27.b }, p2/Z, [x28]\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
"ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[3]\n"
+ "udot z20.s, z24.b, z1.b[3]\n"
"ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z16.s, z27.b, z0.b[3]\n"
- "udot z20.s, z27.b, z1.b[3]\n"
"udot z17.s, z26.b, z0.b[3]\n"
"udot z21.s, z26.b, z1.b[3]\n"
"udot z18.s, z25.b, z0.b[3]\n"
"udot z22.s, z25.b, z1.b[3]\n"
+ "addvl x28, x28, #4\n"
"udot z19.s, z24.b, z0.b[3]\n"
"udot z23.s, z24.b, z1.b[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
@@ -468,18 +468,18 @@ void sve_hybrid_u8qa_dot_4x4VL (
"cmp x26, x20\n"
"bne 18b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x27, x20\n"
+ "add x23, x27, x20\n"
"tbnz %x[flags], #31, 26f\n"
- "mov x21, #0x4\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
"ld1rw { z24.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z24.s, p2/M, z24.s\n"
"uaddv d11, p0, z11.s\n"
- "uaddv d12, p0, z12.s\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z24.s\n"
+ "uaddv d12, p0, z12.s\n"
+ "neg z24.s, p2/M, z24.s\n"
"mov z12.s, z12.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z24.s\n"
"mul z12.s, p2/M, z12.s, z24.s\n"
"26:" // Height 2: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
@@ -518,24 +518,24 @@ void sve_hybrid_u8qa_dot_4x4VL (
".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
"tbz %x[flags], #5, 27f\n"
"and z24.d, z16.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z24.s\n"
"and z30.d, z17.d, z0.d\n"
"and z29.d, z18.d, z0.d\n"
"and z28.d, z19.d, z0.d\n"
"and z27.d, z20.d, z0.d\n"
"and z26.d, z21.d, z0.d\n"
- "asr z24.s, z24.s, #0x1f\n"
"and z25.d, z22.d, z0.d\n"
+ "and z24.d, z23.d, z0.d\n"
"asr z30.s, z30.s, #0x1f\n"
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
"asr z27.s, z27.s, #0x1f\n"
- "sqadd z16.s, z16.s, z24.s\n"
- "and z24.d, z23.d, z0.d\n"
"asr z26.s, z26.s, #0x1f\n"
"asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
"sqadd z17.s, z17.s, z30.s\n"
"sqadd z18.s, z18.s, z29.s\n"
- "asr z24.s, z24.s, #0x1f\n"
"sqadd z19.s, z19.s, z28.s\n"
"sqadd z20.s, z20.s, z27.s\n"
"sqadd z21.s, z21.s, z26.s\n"
@@ -543,27 +543,27 @@ void sve_hybrid_u8qa_dot_4x4VL (
"sqadd z23.s, z23.s, z24.s\n"
"27:" // Height 2: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "ld1rw { z26.s }, p2/Z, [x20]\n"
+ "add z16.s, z16.s, z24.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add z20.s, z20.s, z24.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z16.s, z16.s, z26.s\n"
+ "add z21.s, z21.s, z24.s\n"
+ "add z22.s, z22.s, z24.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z17.s, z17.s, z26.s\n"
- "add z18.s, z18.s, z26.s\n"
"ld1rw { z25.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z26.s\n"
- "add z20.s, z20.s, z26.s\n"
+ "add z23.s, z23.s, z24.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z21.s, z21.s, z26.s\n"
- "add z22.s, z22.s, z26.s\n"
"ld1rw { z24.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z26.s\n"
"smin z16.s, p2/M, z16.s, z25.s\n"
"smin z17.s, p2/M, z17.s, z25.s\n"
"smin z18.s, p2/M, z18.s, z25.s\n"
@@ -575,20 +575,20 @@ void sve_hybrid_u8qa_dot_4x4VL (
"smax z16.s, p2/M, z16.s, z24.s\n"
"smax z17.s, p2/M, z17.s, z24.s\n"
"smax z18.s, p2/M, z18.s, z24.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
"smax z19.s, p2/M, z19.s, z24.s\n"
"smax z20.s, p2/M, z20.s, z24.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
"smax z21.s, p2/M, z21.s, z24.s\n"
"smax z22.s, p2/M, z22.s, z24.s\n"
- "smax z23.s, p2/M, z23.s, z24.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z18.h, z18.h, z19.h\n"
"uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z17.h, z22.h, z23.h\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "uzp1 z20.b, z20.b, z17.b\n"
"st1b { z16.b }, p1, [x27]\n"
+ "smax z23.s, p2/M, z23.s, z24.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "st1b { z20.b }, p1, [x23]\n"
"addvl x27, x27, #1\n"
- "st1b { z20.b }, p1, [x24]\n"
"28:" // Height 2: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -603,16 +603,16 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov z15.b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"30:" // Height 3: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
@@ -623,8 +623,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov x26, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -646,57 +646,57 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ble 37f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "udot z16.s, z28.b, z0.b[0]\n"
+ "udot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z28.b, z2.b[0]\n"
+ "udot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z21.s, z30.b, z1.b[0]\n"
+ "udot z25.s, z30.b, z2.b[0]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "udot z18.s, z29.b, z0.b[0]\n"
+ "udot z22.s, z29.b, z1.b[0]\n"
+ "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n"
- "add x22, x22, #0x10\n"
- "udot z16.s, z5.b, z0.b[0]\n"
- "udot z20.s, z5.b, z1.b[0]\n"
- "udot z17.s, z29.b, z0.b[0]\n"
- "udot z21.s, z29.b, z1.b[0]\n"
- "udot z18.s, z4.b, z0.b[0]\n"
- "udot z24.s, z5.b, z2.b[0]\n"
- "udot z25.s, z29.b, z2.b[0]\n"
+ "udot z26.s, z29.b, z2.b[0]\n"
+ "udot z19.s, z28.b, z0.b[0]\n"
"ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "udot z22.s, z4.b, z1.b[0]\n"
- "udot z26.s, z4.b, z2.b[0]\n"
- "udot z19.s, z28.b, z0.b[0]\n"
"udot z23.s, z28.b, z1.b[0]\n"
"udot z27.s, z28.b, z2.b[0]\n"
- "udot z16.s, z3.b, z0.b[1]\n"
"ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n"
"ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "udot z16.s, z3.b, z0.b[1]\n"
"udot z20.s, z3.b, z1.b[1]\n"
- "udot z24.s, z3.b, z2.b[1]\n"
"ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ "udot z24.s, z3.b, z2.b[1]\n"
"udot z17.s, z31.b, z0.b[1]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "add x22, x22, #0x10\n"
"udot z21.s, z31.b, z1.b[1]\n"
"udot z25.s, z31.b, z2.b[1]\n"
- "udot z18.s, z30.b, z0.b[1]\n"
"ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "udot z18.s, z30.b, z0.b[1]\n"
"udot z22.s, z30.b, z1.b[1]\n"
"udot z26.s, z30.b, z2.b[1]\n"
- "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
"udot z19.s, z29.b, z0.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
"udot z23.s, z29.b, z1.b[1]\n"
"udot z27.s, z29.b, z2.b[1]\n"
- "udot z16.s, z28.b, z0.b[2]\n"
"ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "udot z16.s, z28.b, z0.b[2]\n"
"udot z20.s, z28.b, z1.b[2]\n"
"udot z24.s, z28.b, z2.b[2]\n"
- "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
"udot z17.s, z5.b, z0.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
"udot z21.s, z5.b, z1.b[2]\n"
"udot z25.s, z5.b, z2.b[2]\n"
"udot z18.s, z4.b, z0.b[2]\n"
@@ -727,23 +727,23 @@ void sve_hybrid_u8qa_dot_4x4VL (
"bgt 35b\n"
"37:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z31.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x25, x25, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
- "udot z16.s, z31.b, z0.b[0]\n"
- "udot z20.s, z31.b, z1.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "udot z16.s, z28.b, z0.b[0]\n"
+ "udot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z28.b, z2.b[0]\n"
"udot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
"udot z21.s, z30.b, z1.b[0]\n"
+ "udot z25.s, z30.b, z2.b[0]\n"
+ "addvl x28, x28, #4\n"
"udot z18.s, z29.b, z0.b[0]\n"
"udot z22.s, z29.b, z1.b[0]\n"
- "udot z24.s, z31.b, z2.b[0]\n"
- "udot z25.s, z30.b, z2.b[0]\n"
"udot z26.s, z29.b, z2.b[0]\n"
"udot z19.s, z28.b, z0.b[0]\n"
"udot z23.s, z28.b, z1.b[0]\n"
@@ -752,14 +752,14 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1b { z31.b }, p2/Z, [x28]\n"
"ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "udot z16.s, z31.b, z0.b[1]\n"
"ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z16.s, z31.b, z0.b[1]\n"
"udot z20.s, z31.b, z1.b[1]\n"
"udot z24.s, z31.b, z2.b[1]\n"
"udot z17.s, z30.b, z0.b[1]\n"
"udot z21.s, z30.b, z1.b[1]\n"
+ "addvl x28, x28, #4\n"
"udot z25.s, z30.b, z2.b[1]\n"
"udot z18.s, z29.b, z0.b[1]\n"
"udot z22.s, z29.b, z1.b[1]\n"
@@ -771,14 +771,14 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1b { z31.b }, p2/Z, [x28]\n"
"ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "udot z16.s, z31.b, z0.b[2]\n"
"ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z16.s, z31.b, z0.b[2]\n"
"udot z20.s, z31.b, z1.b[2]\n"
"udot z24.s, z31.b, z2.b[2]\n"
"udot z17.s, z30.b, z0.b[2]\n"
"udot z21.s, z30.b, z1.b[2]\n"
+ "addvl x28, x28, #4\n"
"udot z25.s, z30.b, z2.b[2]\n"
"udot z18.s, z29.b, z0.b[2]\n"
"udot z22.s, z29.b, z1.b[2]\n"
@@ -789,15 +789,15 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ble 38f\n"
"ld1b { z31.b }, p2/Z, [x28]\n"
"ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
"udot z16.s, z31.b, z0.b[3]\n"
"udot z20.s, z31.b, z1.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
"udot z24.s, z31.b, z2.b[3]\n"
"udot z17.s, z30.b, z0.b[3]\n"
"udot z21.s, z30.b, z1.b[3]\n"
"udot z25.s, z30.b, z2.b[3]\n"
+ "addvl x28, x28, #4\n"
"udot z18.s, z29.b, z0.b[3]\n"
"udot z22.s, z29.b, z1.b[3]\n"
"udot z26.s, z29.b, z2.b[3]\n"
@@ -815,22 +815,22 @@ void sve_hybrid_u8qa_dot_4x4VL (
"cmp x26, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"tbnz %x[flags], #31, 40f\n"
- "mov x21, #0x4\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
"ld1rw { z28.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z28.s, p2/M, z28.s\n"
"uaddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
"uaddv d12, p0, z12.s\n"
"uaddv d13, p0, z13.s\n"
- "mov z11.s, z11.s[0]\n"
"mov z12.s, z12.s[0]\n"
+ "mov z13.s, z13.s[0]\n"
+ "neg z28.s, p2/M, z28.s\n"
"mul z11.s, p2/M, z11.s, z28.s\n"
"mul z12.s, p2/M, z12.s, z28.s\n"
- "mov z13.s, z13.s[0]\n"
"mul z13.s, p2/M, z13.s, z28.s\n"
"40:" // Height 3: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
@@ -885,18 +885,18 @@ void sve_hybrid_u8qa_dot_4x4VL (
"and z30.d, z18.d, z0.d\n"
"and z29.d, z19.d, z0.d\n"
"and z28.d, z20.d, z0.d\n"
- "and z3.d, z21.d, z0.d\n"
"asr z1.s, z1.s, #0x1f\n"
"asr z31.s, z31.s, #0x1f\n"
"asr z30.s, z30.s, #0x1f\n"
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
- "and z2.d, z22.d, z0.d\n"
"sqadd z16.s, z16.s, z1.s\n"
"sqadd z17.s, z17.s, z31.s\n"
"sqadd z18.s, z18.s, z30.s\n"
"sqadd z19.s, z19.s, z29.s\n"
"sqadd z20.s, z20.s, z28.s\n"
+ "and z3.d, z21.d, z0.d\n"
+ "and z2.d, z22.d, z0.d\n"
"and z1.d, z23.d, z0.d\n"
"and z31.d, z24.d, z0.d\n"
"and z30.d, z25.d, z0.d\n"
@@ -918,35 +918,35 @@ void sve_hybrid_u8qa_dot_4x4VL (
"sqadd z27.s, z27.s, z28.s\n"
"41:" // Height 3: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "ld1rw { z30.s }, p2/Z, [x20]\n"
+ "add z16.s, z16.s, z28.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z28.s\n"
+ "add z18.s, z18.s, z28.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z20.s, z20.s, z28.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z16.s, z16.s, z30.s\n"
+ "add z21.s, z21.s, z28.s\n"
+ "add z22.s, z22.s, z28.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z30.s\n"
- "add z18.s, z18.s, z30.s\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z24.s, z24.s, z28.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z19.s, z19.s, z30.s\n"
- "add z20.s, z20.s, z30.s\n"
+ "add z25.s, z25.s, z28.s\n"
+ "add z26.s, z26.s, z28.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z21.s, z21.s, z30.s\n"
- "add z22.s, z22.s, z30.s\n"
"ld1rw { z29.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z30.s\n"
- "add z24.s, z24.s, z30.s\n"
+ "add z27.s, z27.s, z28.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z30.s\n"
- "add z26.s, z26.s, z30.s\n"
"ld1rw { z28.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z30.s\n"
"smin z16.s, p2/M, z16.s, z29.s\n"
"smin z17.s, p2/M, z17.s, z29.s\n"
"smin z18.s, p2/M, z18.s, z29.s\n"
@@ -962,28 +962,28 @@ void sve_hybrid_u8qa_dot_4x4VL (
"smax z16.s, p2/M, z16.s, z28.s\n"
"smax z17.s, p2/M, z17.s, z28.s\n"
"smax z18.s, p2/M, z18.s, z28.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
"smax z19.s, p2/M, z19.s, z28.s\n"
"smax z20.s, p2/M, z20.s, z28.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
"smax z21.s, p2/M, z21.s, z28.s\n"
"smax z22.s, p2/M, z22.s, z28.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "st1b { z16.b }, p1, [x27]\n"
"smax z23.s, p2/M, z23.s, z28.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
"smax z24.s, p2/M, z24.s, z28.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
"smax z25.s, p2/M, z25.s, z28.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
"smax z26.s, p2/M, z26.s, z28.s\n"
- "smax z27.s, p2/M, z27.s, z28.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z18.h, z22.h, z23.h\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "uzp1 z20.b, z20.b, z18.b\n"
- "st1b { z16.b }, p1, [x27]\n"
+ "st1b { z20.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z28.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x22]\n"
"addvl x27, x27, #1\n"
- "uzp1 z24.b, z24.b, z17.b\n"
- "st1b { z20.b }, p1, [x24]\n"
- "st1b { z24.b }, p1, [x23]\n"
"42:" // Height 3: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -991,28 +991,27 @@ void sve_hybrid_u8qa_dot_4x4VL (
"b 58f\n"
"43:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x4\n"
"mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov z13.s, #0x0\n"
"mov z14.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
"mov z15.b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"44:" // Height 4: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
@@ -1027,8 +1026,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov x26, #0x0\n"
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1053,37 +1052,37 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ble 51f\n"
"49:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
"ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
"udot z16.s, z5.b, z0.b[0]\n"
"udot z20.s, z5.b, z1.b[0]\n"
- "udot z17.s, z10.b, z0.b[0]\n"
- "udot z21.s, z10.b, z1.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
"udot z24.s, z5.b, z2.b[0]\n"
"udot z28.s, z5.b, z3.b[0]\n"
+ "udot z17.s, z4.b, z0.b[0]\n"
+ "udot z21.s, z4.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "udot z25.s, z4.b, z2.b[0]\n"
+ "udot z29.s, z4.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
"addvl x28, x28, #16\n"
- "udot z25.s, z10.b, z2.b[0]\n"
- "udot z29.s, z10.b, z3.b[0]\n"
- "udot z18.s, z4.b, z0.b[0]\n"
- "udot z22.s, z4.b, z1.b[0]\n"
- "udot z26.s, z4.b, z2.b[0]\n"
- "udot z30.s, z4.b, z3.b[0]\n"
"ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
"ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "add x21, x21, #0x10\n"
"udot z19.s, z9.b, z0.b[0]\n"
"udot z23.s, z9.b, z1.b[0]\n"
"udot z27.s, z9.b, z2.b[0]\n"
@@ -1153,26 +1152,26 @@ void sve_hybrid_u8qa_dot_4x4VL (
"bgt 49b\n"
"51:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z7.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x25, x25, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
"ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
"udot z16.s, z7.b, z0.b[0]\n"
"udot z20.s, z7.b, z1.b[0]\n"
- "udot z17.s, z6.b, z0.b[0]\n"
- "udot z21.s, z6.b, z1.b[0]\n"
- "udot z18.s, z5.b, z0.b[0]\n"
- "udot z22.s, z5.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
"udot z24.s, z7.b, z2.b[0]\n"
"udot z28.s, z7.b, z3.b[0]\n"
+ "udot z17.s, z6.b, z0.b[0]\n"
+ "udot z21.s, z6.b, z1.b[0]\n"
+ "addvl x28, x28, #4\n"
"udot z25.s, z6.b, z2.b[0]\n"
"udot z29.s, z6.b, z3.b[0]\n"
+ "udot z18.s, z5.b, z0.b[0]\n"
+ "udot z22.s, z5.b, z1.b[0]\n"
"udot z26.s, z5.b, z2.b[0]\n"
"udot z30.s, z5.b, z3.b[0]\n"
"udot z19.s, z4.b, z0.b[0]\n"
@@ -1183,14 +1182,14 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1b { z7.b }, p2/Z, [x28]\n"
"ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "udot z16.s, z7.b, z0.b[1]\n"
"ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z16.s, z7.b, z0.b[1]\n"
"udot z20.s, z7.b, z1.b[1]\n"
"udot z24.s, z7.b, z2.b[1]\n"
"udot z28.s, z7.b, z3.b[1]\n"
"udot z17.s, z6.b, z0.b[1]\n"
+ "addvl x28, x28, #4\n"
"udot z21.s, z6.b, z1.b[1]\n"
"udot z25.s, z6.b, z2.b[1]\n"
"udot z29.s, z6.b, z3.b[1]\n"
@@ -1206,14 +1205,14 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1b { z7.b }, p2/Z, [x28]\n"
"ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
"ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z16.s, z7.b, z0.b[2]\n"
"udot z20.s, z7.b, z1.b[2]\n"
"udot z24.s, z7.b, z2.b[2]\n"
"udot z28.s, z7.b, z3.b[2]\n"
"udot z17.s, z6.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
"udot z21.s, z6.b, z1.b[2]\n"
"udot z25.s, z6.b, z2.b[2]\n"
"udot z29.s, z6.b, z3.b[2]\n"
@@ -1228,15 +1227,15 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ble 52f\n"
"ld1b { z7.b }, p2/Z, [x28]\n"
"ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
"udot z16.s, z7.b, z0.b[3]\n"
"udot z20.s, z7.b, z1.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
"udot z24.s, z7.b, z2.b[3]\n"
"udot z28.s, z7.b, z3.b[3]\n"
"udot z17.s, z6.b, z0.b[3]\n"
"udot z21.s, z6.b, z1.b[3]\n"
+ "addvl x28, x28, #4\n"
"udot z25.s, z6.b, z2.b[3]\n"
"udot z29.s, z6.b, z3.b[3]\n"
"udot z18.s, z5.b, z0.b[3]\n"
@@ -1259,25 +1258,25 @@ void sve_hybrid_u8qa_dot_4x4VL (
"cmp x26, x20\n"
"bne 46b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
+ "add x23, x27, x20\n"
"add x22, x23, x20\n"
+ "add x21, x22, x20\n"
"tbnz %x[flags], #31, 54f\n"
- "mov x21, #0x4\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z0.s, p2/M, z0.s\n"
"uaddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
"uaddv d12, p0, z12.s\n"
"uaddv d13, p0, z13.s\n"
- "uaddv d14, p0, z14.s\n"
- "mov z11.s, z11.s[0]\n"
"mov z12.s, z12.s[0]\n"
- "mul z11.s, p2/M, z11.s, z0.s\n"
- "mul z12.s, p2/M, z12.s, z0.s\n"
"mov z13.s, z13.s[0]\n"
+ "uaddv d14, p0, z14.s\n"
+ "neg z0.s, p2/M, z0.s\n"
"mov z14.s, z14.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
+ "mul z12.s, p2/M, z12.s, z0.s\n"
"mul z13.s, p2/M, z13.s, z0.s\n"
"mul z14.s, p2/M, z14.s, z0.s\n"
"54:" // Height 4: skip row sum fixup
@@ -1342,32 +1341,32 @@ void sve_hybrid_u8qa_dot_4x4VL (
"tbz %x[flags], #5, 55f\n"
"and z2.d, z16.d, z0.d\n"
"and z1.d, z17.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z2.s\n"
+ "sqadd z17.s, z17.s, z1.s\n"
"and z7.d, z18.d, z0.d\n"
"and z6.d, z19.d, z0.d\n"
"and z5.d, z20.d, z0.d\n"
"and z4.d, z21.d, z0.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
"and z3.d, z22.d, z0.d\n"
+ "and z2.d, z23.d, z0.d\n"
+ "and z1.d, z24.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z16.s, z16.s, z2.s\n"
- "sqadd z17.s, z17.s, z1.s\n"
- "and z2.d, z23.d, z0.d\n"
- "and z1.d, z24.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"asr z3.s, z3.s, #0x1f\n"
- "sqadd z18.s, z18.s, z7.s\n"
- "sqadd z19.s, z19.s, z6.s\n"
"asr z2.s, z2.s, #0x1f\n"
"asr z1.s, z1.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z7.s\n"
+ "sqadd z19.s, z19.s, z6.s\n"
"sqadd z20.s, z20.s, z5.s\n"
"sqadd z21.s, z21.s, z4.s\n"
"sqadd z22.s, z22.s, z3.s\n"
- "and z7.d, z25.d, z0.d\n"
"sqadd z23.s, z23.s, z2.s\n"
"sqadd z24.s, z24.s, z1.s\n"
+ "and z7.d, z25.d, z0.d\n"
"and z6.d, z26.d, z0.d\n"
"and z5.d, z27.d, z0.d\n"
"and z4.d, z28.d, z0.d\n"
@@ -1390,43 +1389,43 @@ void sve_hybrid_u8qa_dot_4x4VL (
"sqadd z31.s, z31.s, z1.s\n"
"55:" // Height 4: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
"ld1rw { z2.s }, p2/Z, [x20]\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z16.s, z16.s, z2.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z20.s, z20.s, z2.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z16.s, z16.s, z2.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z2.s\n"
- "add z18.s, z18.s, z2.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z19.s, z19.s, z2.s\n"
- "add z20.s, z20.s, z2.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z2.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
- "add z23.s, z23.s, z2.s\n"
- "add z24.s, z24.s, z2.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z2.s\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z25.s, z25.s, z2.s\n"
- "add z26.s, z26.s, z2.s\n"
"ld1rw { z1.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z2.s\n"
- "add z28.s, z28.s, z2.s\n"
+ "add z31.s, z31.s, z2.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z29.s, z29.s, z2.s\n"
- "add z30.s, z30.s, z2.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z31.s, z31.s, z2.s\n"
"smin z16.s, p2/M, z16.s, z1.s\n"
"smin z17.s, p2/M, z17.s, z1.s\n"
"smin z18.s, p2/M, z18.s, z1.s\n"
@@ -1446,36 +1445,36 @@ void sve_hybrid_u8qa_dot_4x4VL (
"smax z16.s, p2/M, z16.s, z0.s\n"
"smax z17.s, p2/M, z17.s, z0.s\n"
"smax z18.s, p2/M, z18.s, z0.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
"smax z19.s, p2/M, z19.s, z0.s\n"
"smax z20.s, p2/M, z20.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
"smax z21.s, p2/M, z21.s, z0.s\n"
"smax z22.s, p2/M, z22.s, z0.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "st1b { z16.b }, p1, [x27]\n"
"smax z23.s, p2/M, z23.s, z0.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
"smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
"smax z25.s, p2/M, z25.s, z0.s\n"
- "uzp1 z18.h, z18.h, z19.h\n"
"smax z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z20.b }, p1, [x23]\n"
"smax z27.s, p2/M, z27.s, z0.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
"smax z28.s, p2/M, z28.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"smax z29.s, p2/M, z29.s, z0.s\n"
- "uzp1 z17.h, z22.h, z23.h\n"
"smax z30.s, p2/M, z30.s, z0.s\n"
- "smax z31.s, p2/M, z31.s, z0.s\n"
- "uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "uzp1 z18.h, z26.h, z27.h\n"
"uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z20.b, z20.b, z17.b\n"
- "uzp1 z17.h, z30.h, z31.h\n"
- "st1b { z16.b }, p1, [x27]\n"
+ "st1b { z24.b }, p1, [x22]\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "uzp1 z16.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z16.b\n"
+ "st1b { z28.b }, p1, [x21]\n"
"addvl x27, x27, #1\n"
- "uzp1 z24.b, z24.b, z18.b\n"
- "uzp1 z28.b, z28.b, z17.b\n"
- "st1b { z20.b }, p1, [x24]\n"
- "st1b { z24.b }, p1, [x23]\n"
- "st1b { z28.b }, p1, [x22]\n"
"56:" // Height 4: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -1492,8 +1491,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
index ab37e6ad5b..e37ade4a00 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
@@ -70,7 +70,7 @@ public:
return false;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 4, 8, 8> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 8, 8> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
index 212e178065..7b67ccd545 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
@@ -45,18 +45,18 @@ void sve_hybrid_u8qa_mmla_4x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -91,24 +91,24 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"3:" // Height 1: setup done
"mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -124,43 +124,43 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"ble 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z30.b }, p2/Z, [x28]\n"
- "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n"
- "add x24, x24, #0x10\n"
- "trn1 z0.d, z1.d, z31.d\n"
- ".inst 0x45de9810 // ummla z16.s, z0.b, z30.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99814 // ummla z20.s, z0.b, z25.b\n"
+ ".inst 0x45d89811 // ummla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45da9815 // ummla z21.s, z0.b, z26.b\n"
+ ".inst 0x45d99812 // ummla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "trn2 z1.d, z1.d, z31.d\n"
- ".inst 0x45dd9814 // ummla z20.s, z0.b, z29.b\n"
- ".inst 0x45dc9811 // ummla z17.s, z0.b, z28.b\n"
- ".inst 0x45db9815 // ummla z21.s, z0.b, z27.b\n"
- ".inst 0x45da9812 // ummla z18.s, z0.b, z26.b\n"
- "ld1b { z31.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45d99816 // ummla z22.s, z0.b, z25.b\n"
- ".inst 0x45d89813 // ummla z19.s, z0.b, z24.b\n"
- "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x45c89817 // ummla z23.s, z0.b, z8.b\n"
- "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45d89816 // ummla z22.s, z0.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45da9813 // ummla z19.s, z0.b, z26.b\n"
+ ".inst 0x45d99817 // ummla z23.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45da9834 // ummla z20.s, z1.b, z26.b\n"
"ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x45df9830 // ummla z16.s, z1.b, z31.b\n"
"ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x45de9834 // ummla z20.s, z1.b, z30.b\n"
- ".inst 0x45dd9831 // ummla z17.s, z1.b, z29.b\n"
- ".inst 0x45dc9835 // ummla z21.s, z1.b, z28.b\n"
".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n"
".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n"
".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
+ "add x24, x24, #0x10\n"
"tbnz %x[flags], #31, 8f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z11.s, z1.b, z15.b\n"
@@ -170,45 +170,45 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"bgt 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
"ld1b { z24.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x8\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x45da9814 // ummla z20.s, z0.b, z26.b\n"
"ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
"ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ ".inst 0x45d99811 // ummla z17.s, z0.b, z25.b\n"
+ ".inst 0x45d89815 // ummla z21.s, z0.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
- "trn1 z0.d, z1.d, z31.d\n"
- ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
"ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #8\n"
- "trn2 z1.d, z1.d, z31.d\n"
- ".inst 0x45de9814 // ummla z20.s, z0.b, z30.b\n"
- ".inst 0x45dd9811 // ummla z17.s, z0.b, z29.b\n"
- ".inst 0x45dc9815 // ummla z21.s, z0.b, z28.b\n"
".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n"
".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n"
".inst 0x45d99813 // ummla z19.s, z0.b, z25.b\n"
".inst 0x45d89817 // ummla z23.s, z0.b, z24.b\n"
+ "addvl x28, x28, #8\n"
"ble 10f\n"
"ld1b { z24.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45d89834 // ummla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99832 // ummla z18.s, z1.b, z25.b\n"
+ ".inst 0x45d89836 // ummla z22.s, z1.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45de9834 // ummla z20.s, z1.b, z30.b\n"
- "addvl x28, x28, #8\n"
- ".inst 0x45dd9831 // ummla z17.s, z1.b, z29.b\n"
- ".inst 0x45dc9835 // ummla z21.s, z1.b, z28.b\n"
- ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n"
- ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n"
".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
+ "addvl x28, x28, #8\n"
"10:" // Height 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -225,32 +225,32 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"mov z23.d, z16.d\n"
"tbnz %x[flags], #31, 12f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
- "ld1rw { z9.s }, p2/Z, [x20]\n"
- "neg z9.s, p2/M, z9.s\n"
+ "neg z16.s, p2/M, z16.s\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z9.s\n"
+ "mul z11.s, p2/M, z11.s, z16.s\n"
"12:" // Height 1: skip row sum fixup
"add z23.s, z23.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
"ld1w { z22.s }, p2/Z, [x10]\n"
- "ld1w { z24.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x10, #3, MUL VL]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z23.s, z23.s, z22.s\n"
- "add z17.s, z17.s, z24.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z20.s\n"
+ "add z19.s, z19.s, z16.s\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- "addvl x10, x10, #4\n"
- "add z18.s, z18.s, z21.s\n"
- "add z19.s, z19.s, z20.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n"
+ "addvl x10, x10, #4\n"
".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
"tbz %x[flags], #5, 13f\n"
@@ -268,19 +268,19 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"sqadd z19.s, z19.s, z16.s\n"
"13:" // Height 1: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z16.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
"ld1rw { z20.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z21.s\n"
+ "add z19.s, z19.s, z16.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z21.s\n"
- "add z18.s, z18.s, z21.s\n"
"ld1rw { z16.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z21.s\n"
"smin z23.s, p2/M, z23.s, z20.s\n"
"smin z17.s, p2/M, z17.s, z20.s\n"
"smin z18.s, p2/M, z18.s, z20.s\n"
@@ -288,8 +288,8 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"smax z23.s, p2/M, z23.s, z16.s\n"
"smax z17.s, p2/M, z17.s, z16.s\n"
"smax z18.s, p2/M, z18.s, z16.s\n"
- "smax z19.s, p2/M, z19.s, z16.s\n"
"uzp1 z23.h, z23.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z16.s\n"
"uzp1 z16.h, z18.h, z19.h\n"
"uzp1 z23.b, z23.b, z16.b\n"
"st1b { z23.b }, p1, [x27]\n"
@@ -307,24 +307,24 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"mov z15.b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"16:" // Height 2: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"17:" // Height 2: setup done
"mov x26, #0x0\n"
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -343,45 +343,45 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"ble 23f\n"
"21:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z31.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z25.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n"
- "trn1 z0.d, z1.d, z25.d\n"
- "trn2 z1.d, z1.d, z25.d\n"
- ".inst 0x45df9810 // ummla z16.s, z0.b, z31.b\n"
+ "ld1rqb { z26.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99814 // ummla z20.s, z0.b, z25.b\n"
+ ".inst 0x45d89811 // ummla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45da9815 // ummla z21.s, z0.b, z26.b\n"
+ ".inst 0x45d99812 // ummla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x45de9814 // ummla z20.s, z0.b, z30.b\n"
- ".inst 0x45dd9811 // ummla z17.s, z0.b, z29.b\n"
- ".inst 0x45dc9815 // ummla z21.s, z0.b, z28.b\n"
- ".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n"
- ".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n"
- ".inst 0x45d89813 // ummla z19.s, z0.b, z24.b\n"
+ ".inst 0x45d89816 // ummla z22.s, z0.b, z24.b\n"
"ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45da9813 // ummla z19.s, z0.b, z26.b\n"
".inst 0x45d99817 // ummla z23.s, z0.b, z25.b\n"
- "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45da9834 // ummla z20.s, z1.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
"ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x45de9834 // ummla z20.s, z1.b, z30.b\n"
- ".inst 0x45dd9831 // ummla z17.s, z1.b, z29.b\n"
- ".inst 0x45dc9835 // ummla z21.s, z1.b, z28.b\n"
".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n"
".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n"
".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"tbnz %x[flags], #31, 22f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z11.s, z1.b, z15.b\n"
@@ -391,46 +391,46 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"bgt 21b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z29.b }, p2/Z, [x28]\n"
- "ld1b { z28.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x8\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x45da9814 // ummla z20.s, z0.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99811 // ummla z17.s, z0.b, z25.b\n"
+ ".inst 0x45d89815 // ummla z21.s, z0.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
- "trn1 z0.d, z1.d, z24.d\n"
- "trn2 z1.d, z1.d, z24.d\n"
- ".inst 0x45dd9810 // ummla z16.s, z0.b, z29.b\n"
"ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #8\n"
- ".inst 0x45dc9814 // ummla z20.s, z0.b, z28.b\n"
- ".inst 0x45c49811 // ummla z17.s, z0.b, z4.b\n"
- ".inst 0x45db9815 // ummla z21.s, z0.b, z27.b\n"
- ".inst 0x45da9812 // ummla z18.s, z0.b, z26.b\n"
- ".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n"
+ ".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n"
+ ".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n"
".inst 0x45d99813 // ummla z19.s, z0.b, z25.b\n"
".inst 0x45d89817 // ummla z23.s, z0.b, z24.b\n"
+ "addvl x28, x28, #8\n"
"ble 24f\n"
"ld1b { z24.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45d89834 // ummla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99832 // ummla z18.s, z1.b, z25.b\n"
+ ".inst 0x45d89836 // ummla z22.s, z1.b, z24.b\n"
"ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45de9834 // ummla z20.s, z1.b, z30.b\n"
- "addvl x28, x28, #8\n"
- ".inst 0x45dd9831 // ummla z17.s, z1.b, z29.b\n"
- ".inst 0x45dc9835 // ummla z21.s, z1.b, z28.b\n"
- ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n"
- ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n"
".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
+ "addvl x28, x28, #8\n"
"24:" // Height 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 25f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -443,18 +443,18 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"uzp1 z24.d, z16.d, z20.d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp2 z16.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "add x23, x27, x20\n"
"mov z23.d, z24.d\n"
"tbnz %x[flags], #31, 26f\n"
"add x20, %x[qp], %[b_offset]\n"
- ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
"ld1rw { z24.s }, p2/Z, [x20]\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
"neg z24.s, p2/M, z24.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
@@ -497,24 +497,24 @@ void sve_hybrid_u8qa_mmla_4x4VL (
".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
"tbz %x[flags], #5, 27f\n"
"and z24.d, z23.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z24.s\n"
"and z30.d, z20.d, z0.d\n"
"and z29.d, z21.d, z0.d\n"
"and z28.d, z22.d, z0.d\n"
"and z27.d, z16.d, z0.d\n"
"and z26.d, z17.d, z0.d\n"
- "asr z24.s, z24.s, #0x1f\n"
"and z25.d, z18.d, z0.d\n"
+ "and z24.d, z19.d, z0.d\n"
"asr z30.s, z30.s, #0x1f\n"
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
"asr z27.s, z27.s, #0x1f\n"
- "sqadd z23.s, z23.s, z24.s\n"
- "and z24.d, z19.d, z0.d\n"
"asr z26.s, z26.s, #0x1f\n"
"asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
"sqadd z20.s, z20.s, z30.s\n"
"sqadd z21.s, z21.s, z29.s\n"
- "asr z24.s, z24.s, #0x1f\n"
"sqadd z22.s, z22.s, z28.s\n"
"sqadd z16.s, z16.s, z27.s\n"
"sqadd z17.s, z17.s, z26.s\n"
@@ -522,27 +522,27 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"sqadd z19.s, z19.s, z24.s\n"
"27:" // Height 2: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "ld1rw { z26.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z24.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z24.s\n"
+ "add z21.s, z21.s, z24.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z24.s\n"
+ "add z16.s, z16.s, z24.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z23.s, z23.s, z26.s\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z20.s, z20.s, z26.s\n"
- "add z21.s, z21.s, z26.s\n"
"ld1rw { z25.s }, p2/Z, [x20]\n"
- "add z22.s, z22.s, z26.s\n"
- "add z16.s, z16.s, z26.s\n"
+ "add z19.s, z19.s, z24.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z26.s\n"
- "add z18.s, z18.s, z26.s\n"
"ld1rw { z24.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z26.s\n"
"smin z23.s, p2/M, z23.s, z25.s\n"
"smin z20.s, p2/M, z20.s, z25.s\n"
"smin z21.s, p2/M, z21.s, z25.s\n"
@@ -554,20 +554,20 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"smax z23.s, p2/M, z23.s, z24.s\n"
"smax z20.s, p2/M, z20.s, z24.s\n"
"smax z21.s, p2/M, z21.s, z24.s\n"
+ "uzp1 z23.h, z23.h, z20.h\n"
"smax z22.s, p2/M, z22.s, z24.s\n"
"smax z16.s, p2/M, z16.s, z24.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z23.b, z23.b, z20.b\n"
"smax z17.s, p2/M, z17.s, z24.s\n"
"smax z18.s, p2/M, z18.s, z24.s\n"
- "smax z19.s, p2/M, z19.s, z24.s\n"
- "uzp1 z23.h, z23.h, z20.h\n"
- "uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z23.b }, p1, [x27]\n"
+ "smax z19.s, p2/M, z19.s, z24.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
- "uzp1 z23.b, z23.b, z20.b\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z23.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
"st1b { z16.b }, p1, [x23]\n"
+ "addvl x27, x27, #1\n"
"28:" // Height 2: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -582,16 +582,16 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"mov z15.b, #0x1\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"30:" // Height 3: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
@@ -606,8 +606,8 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"mov x26, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -629,49 +629,49 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"ble 37f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x22, x22, #0x10\n"
"trn1 z0.d, z1.d, z2.d\n"
"trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z6.d\n"
- "trn2 z3.d, z3.d, z6.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c59814 // ummla z20.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c5985c // ummla z28.s, z2.b, z5.b\n"
+ ".inst 0x45c49811 // ummla z17.s, z0.b, z4.b\n"
"ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
- ".inst 0x45ca9814 // ummla z20.s, z0.b, z10.b\n"
- ".inst 0x45c99811 // ummla z17.s, z0.b, z9.b\n"
- ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
- ".inst 0x45c49812 // ummla z18.s, z0.b, z4.b\n"
- ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
"ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x45ca985c // ummla z28.s, z2.b, z10.b\n"
- ".inst 0x45c99859 // ummla z25.s, z2.b, z9.b\n"
- ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
- ".inst 0x45c4985a // ummla z26.s, z2.b, z4.b\n"
- ".inst 0x45c79816 // ummla z22.s, z0.b, z7.b\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
"ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45c7985e // ummla z30.s, z2.b, z7.b\n"
- ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n"
+ ".inst 0x45c99815 // ummla z21.s, z0.b, z9.b\n"
+ ".inst 0x45c9985d // ummla z29.s, z2.b, z9.b\n"
"ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n"
+ ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n"
"ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n"
- ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
"ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45c79816 // ummla z22.s, z0.b, z7.b\n"
+ ".inst 0x45c7985e // ummla z30.s, z2.b, z7.b\n"
"ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n"
+ ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n"
"ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
"ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
"ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n"
@@ -699,32 +699,32 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"bgt 35b\n"
"37:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "subs x25, x25, #0x8\n"
- "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
"trn1 z0.d, z1.d, z2.d\n"
"trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z5.d\n"
- "trn2 z3.d, z3.d, z5.d\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n"
+ ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n"
- ".inst 0x45ca9814 // ummla z20.s, z0.b, z10.b\n"
- ".inst 0x45c99811 // ummla z17.s, z0.b, z9.b\n"
- ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
- ".inst 0x45c79812 // ummla z18.s, z0.b, z7.b\n"
- ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
"ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45ca985c // ummla z28.s, z2.b, z10.b\n"
- "addvl x28, x28, #8\n"
+ ".inst 0x45c99811 // ummla z17.s, z0.b, z9.b\n"
".inst 0x45c99859 // ummla z25.s, z2.b, z9.b\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c79812 // ummla z18.s, z0.b, z7.b\n"
".inst 0x45c7985a // ummla z26.s, z2.b, z7.b\n"
".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n"
".inst 0x45c6985e // ummla z30.s, z2.b, z6.b\n"
@@ -734,24 +734,24 @@ void sve_hybrid_u8qa_mmla_4x4VL (
".inst 0x45c4985f // ummla z31.s, z2.b, z4.b\n"
"ble 38f\n"
"ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c59834 // ummla z20.s, z1.b, z5.b\n"
+ ".inst 0x45c5987c // ummla z28.s, z3.b, z5.b\n"
"ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
"ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ ".inst 0x45c49831 // ummla z17.s, z1.b, z4.b\n"
+ ".inst 0x45c49879 // ummla z25.s, z3.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n"
- ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n"
- ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n"
- ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n"
- "addvl x28, x28, #8\n"
".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n"
+ "addvl x28, x28, #8\n"
".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n"
".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
@@ -771,15 +771,15 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
+ "add x22, x23, x20\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "add x23, x27, x20\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "add x22, x23, x20\n"
"uzp1 z24.d, z24.d, z28.d\n"
"uzp1 z25.d, z25.d, z29.d\n"
"uzp1 z26.d, z26.d, z30.d\n"
@@ -787,14 +787,14 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"mov z31.d, z0.d\n"
"tbnz %x[flags], #31, 40f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
- "ld1rw { z23.s }, p2/Z, [x20]\n"
"neg z23.s, p2/M, z23.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
- "mov z13.s, z13.s[0]\n"
"mul z11.s, p2/M, z11.s, z23.s\n"
+ "mov z13.s, z13.s[0]\n"
"mul z12.s, p2/M, z12.s, z23.s\n"
"mul z13.s, p2/M, z13.s, z23.s\n"
"40:" // Height 3: skip row sum fixup
@@ -850,18 +850,18 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"and z29.d, z21.d, z0.d\n"
"and z28.d, z22.d, z0.d\n"
"and z23.d, z16.d, z0.d\n"
- "and z3.d, z17.d, z0.d\n"
"asr z1.s, z1.s, #0x1f\n"
"asr z30.s, z30.s, #0x1f\n"
"asr z29.s, z29.s, #0x1f\n"
"asr z28.s, z28.s, #0x1f\n"
"asr z23.s, z23.s, #0x1f\n"
- "and z2.d, z18.d, z0.d\n"
"sqadd z31.s, z31.s, z1.s\n"
"sqadd z20.s, z20.s, z30.s\n"
"sqadd z21.s, z21.s, z29.s\n"
"sqadd z22.s, z22.s, z28.s\n"
"sqadd z16.s, z16.s, z23.s\n"
+ "and z3.d, z17.d, z0.d\n"
+ "and z2.d, z18.d, z0.d\n"
"and z1.d, z19.d, z0.d\n"
"and z30.d, z24.d, z0.d\n"
"and z29.d, z25.d, z0.d\n"
@@ -883,35 +883,35 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"sqadd z27.s, z27.s, z23.s\n"
"41:" // Height 3: no shift correction
"add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "ld1rw { z29.s }, p2/Z, [x20]\n"
+ "add z31.s, z31.s, z23.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z23.s\n"
+ "add z21.s, z21.s, z23.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z23.s\n"
+ "add z16.s, z16.s, z23.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z31.s, z31.s, z29.s\n"
+ "add z17.s, z17.s, z23.s\n"
+ "add z18.s, z18.s, z23.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z20.s, z20.s, z29.s\n"
- "add z21.s, z21.s, z29.s\n"
+ "add z19.s, z19.s, z23.s\n"
+ "add z24.s, z24.s, z23.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z22.s, z22.s, z29.s\n"
- "add z16.s, z16.s, z29.s\n"
+ "add z25.s, z25.s, z23.s\n"
+ "add z26.s, z26.s, z23.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z17.s, z17.s, z29.s\n"
- "add z18.s, z18.s, z29.s\n"
"ld1rw { z28.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z29.s\n"
- "add z24.s, z24.s, z29.s\n"
+ "add z27.s, z27.s, z23.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z29.s\n"
- "add z26.s, z26.s, z29.s\n"
"ld1rw { z23.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z29.s\n"
"smin z31.s, p2/M, z31.s, z28.s\n"
"smin z20.s, p2/M, z20.s, z28.s\n"
"smin z21.s, p2/M, z21.s, z28.s\n"
@@ -927,28 +927,28 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"smax z31.s, p2/M, z31.s, z23.s\n"
"smax z20.s, p2/M, z20.s, z23.s\n"
"smax z21.s, p2/M, z21.s, z23.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
"smax z22.s, p2/M, z22.s, z23.s\n"
"smax z16.s, p2/M, z16.s, z23.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
"smax z17.s, p2/M, z17.s, z23.s\n"
"smax z18.s, p2/M, z18.s, z23.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z31.b }, p1, [x27]\n"
"smax z19.s, p2/M, z19.s, z23.s\n"
- "uzp1 z31.h, z31.h, z20.h\n"
"smax z24.s, p2/M, z24.s, z23.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
"smax z25.s, p2/M, z25.s, z23.s\n"
- "uzp1 z20.h, z21.h, z22.h\n"
"smax z26.s, p2/M, z26.s, z23.s\n"
- "smax z27.s, p2/M, z27.s, z23.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z18.h, z18.h, z19.h\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z31.b, z31.b, z20.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "st1b { z31.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "uzp1 z24.b, z24.b, z17.b\n"
"st1b { z16.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z23.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"st1b { z24.b }, p1, [x22]\n"
+ "addvl x27, x27, #1\n"
"42:" // Height 3: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -956,28 +956,27 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"b 58f\n"
"43:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x4\n"
"mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov z13.s, #0x0\n"
"mov z14.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
"mov z15.b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"44:" // Height 4: Column loop
"mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
@@ -992,8 +991,8 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"mov x26, #0x0\n"
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
"ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1018,56 +1017,56 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"ble 51f\n"
"49:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "ld1rqb { z6.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
"trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
"trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z6.d\n"
- "trn2 z3.d, z3.d, z6.d\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n"
+ ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- ".inst 0x45c99815 // ummla z21.s, z0.b, z9.b\n"
- ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n"
- ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
+ ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c59811 // ummla z17.s, z0.b, z5.b\n"
+ ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c49815 // ummla z21.s, z0.b, z4.b\n"
+ ".inst 0x45c4985d // ummla z29.s, z2.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- ".inst 0x45c9985d // ummla z29.s, z2.b, z9.b\n"
- ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
"ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n"
- ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n"
+ ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n"
+ ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n"
+ ".inst 0x45c79816 // ummla z22.s, z0.b, z7.b\n"
"ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
"ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n"
- ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ ".inst 0x45c7985e // ummla z30.s, z2.b, z7.b\n"
+ ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n"
"ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
"ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "add x24, x24, #0x10\n"
".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
"ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
"ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n"
+ "add x22, x22, #0x10\n"
".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n"
".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n"
+ "add x21, x21, #0x10\n"
".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n"
".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
@@ -1090,60 +1089,60 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"bgt 49b\n"
"51:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x8\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
"ld1rqb { z5.b }, p0/Z, [x21]\n"
- "trn1 z0.d, z1.d, z2.d\n"
"trn2 z1.d, z1.d, z2.d\n"
"trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
"trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n"
+ ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ ".inst 0x45c59814 // ummla z20.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c5985c // ummla z28.s, z2.b, z5.b\n"
+ ".inst 0x45c49811 // ummla z17.s, z0.b, z4.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45c69810 // ummla z16.s, z0.b, z6.b\n"
- ".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- ".inst 0x45c99815 // ummla z21.s, z0.b, z9.b\n"
- ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n"
- ".inst 0x45c69858 // ummla z24.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
+ ".inst 0x45c79812 // ummla z18.s, z0.b, z7.b\n"
"addvl x28, x28, #8\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- ".inst 0x45c9985d // ummla z29.s, z2.b, z9.b\n"
- ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
- ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n"
+ ".inst 0x45c7985a // ummla z26.s, z2.b, z7.b\n"
+ ".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n"
+ ".inst 0x45c6985e // ummla z30.s, z2.b, z6.b\n"
".inst 0x45c59813 // ummla z19.s, z0.b, z5.b\n"
".inst 0x45c5985b // ummla z27.s, z2.b, z5.b\n"
- ".inst 0x45c69817 // ummla z23.s, z0.b, z6.b\n"
- ".inst 0x45c6985f // ummla z31.s, z2.b, z6.b\n"
+ ".inst 0x45c49817 // ummla z23.s, z0.b, z4.b\n"
+ ".inst 0x45c4985f // ummla z31.s, z2.b, z4.b\n"
"ble 52f\n"
"ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c59834 // ummla z20.s, z1.b, z5.b\n"
+ ".inst 0x45c5987c // ummla z28.s, z3.b, z5.b\n"
"ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
"ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ ".inst 0x45c49831 // ummla z17.s, z1.b, z4.b\n"
+ ".inst 0x45c49879 // ummla z25.s, z3.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n"
- ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n"
- ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n"
- ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n"
- "addvl x28, x28, #8\n"
".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n"
+ "addvl x28, x28, #8\n"
".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n"
".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
@@ -1163,16 +1162,16 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"bne 46b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
+ "add x21, x22, x20\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "add x23, x27, x20\n"
- "add x22, x23, x20\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "add x21, x22, x20\n"
"uzp1 z23.d, z24.d, z28.d\n"
"uzp2 z24.d, z24.d, z28.d\n"
"uzp1 z28.d, z25.d, z29.d\n"
@@ -1184,15 +1183,15 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"mov z31.d, z0.d\n"
"tbnz %x[flags], #31, 54f\n"
"add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
"neg z0.s, p2/M, z0.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
"mov z14.s, z13.s[3]\n"
"mov z13.s, z13.s[0]\n"
- "mul z11.s, p2/M, z11.s, z0.s\n"
"mul z12.s, p2/M, z12.s, z0.s\n"
"mul z13.s, p2/M, z13.s, z0.s\n"
"mul z14.s, p2/M, z14.s, z0.s\n"
@@ -1258,32 +1257,32 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"tbz %x[flags], #5, 55f\n"
"and z2.d, z31.d, z0.d\n"
"and z1.d, z20.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z2.s\n"
+ "sqadd z20.s, z20.s, z1.s\n"
"and z7.d, z21.d, z0.d\n"
"and z6.d, z22.d, z0.d\n"
"and z5.d, z16.d, z0.d\n"
"and z4.d, z17.d, z0.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
"and z3.d, z18.d, z0.d\n"
+ "and z2.d, z19.d, z0.d\n"
+ "and z1.d, z23.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z31.s, z31.s, z2.s\n"
- "sqadd z20.s, z20.s, z1.s\n"
- "and z2.d, z19.d, z0.d\n"
- "and z1.d, z23.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"asr z3.s, z3.s, #0x1f\n"
- "sqadd z21.s, z21.s, z7.s\n"
- "sqadd z22.s, z22.s, z6.s\n"
"asr z2.s, z2.s, #0x1f\n"
"asr z1.s, z1.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z7.s\n"
+ "sqadd z22.s, z22.s, z6.s\n"
"sqadd z16.s, z16.s, z5.s\n"
"sqadd z17.s, z17.s, z4.s\n"
"sqadd z18.s, z18.s, z3.s\n"
- "and z7.d, z28.d, z0.d\n"
"sqadd z19.s, z19.s, z2.s\n"
"sqadd z23.s, z23.s, z1.s\n"
+ "and z7.d, z28.d, z0.d\n"
"and z6.d, z29.d, z0.d\n"
"and z5.d, z30.d, z0.d\n"
"and z4.d, z24.d, z0.d\n"
@@ -1306,43 +1305,43 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"sqadd z27.s, z27.s, z1.s\n"
"55:" // Height 4: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"ld1rw { z2.s }, p2/Z, [x20]\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add z31.s, z31.s, z2.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z2.s\n"
+ "add z21.s, z21.s, z2.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z16.s, z16.s, z2.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z31.s, z31.s, z2.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z20.s, z20.s, z2.s\n"
- "add z21.s, z21.s, z2.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z23.s, z23.s, z2.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z16.s, z16.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "add z29.s, z29.s, z2.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z2.s\n"
- "add z18.s, z18.s, z2.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z19.s, z19.s, z2.s\n"
- "add z23.s, z23.s, z2.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "add z28.s, z28.s, z2.s\n"
- "add z29.s, z29.s, z2.s\n"
"ld1rw { z1.s }, p2/Z, [x20]\n"
- "add z30.s, z30.s, z2.s\n"
- "add z24.s, z24.s, z2.s\n"
+ "add z27.s, z27.s, z2.s\n"
"add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z2.s\n"
- "add z26.s, z26.s, z2.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z2.s\n"
"smin z31.s, p2/M, z31.s, z1.s\n"
"smin z20.s, p2/M, z20.s, z1.s\n"
"smin z21.s, p2/M, z21.s, z1.s\n"
@@ -1362,36 +1361,36 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"smax z31.s, p2/M, z31.s, z0.s\n"
"smax z20.s, p2/M, z20.s, z0.s\n"
"smax z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
"smax z22.s, p2/M, z22.s, z0.s\n"
"smax z16.s, p2/M, z16.s, z0.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
"smax z17.s, p2/M, z17.s, z0.s\n"
"smax z18.s, p2/M, z18.s, z0.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z31.b }, p1, [x27]\n"
"smax z19.s, p2/M, z19.s, z0.s\n"
- "uzp1 z31.h, z31.h, z20.h\n"
"smax z23.s, p2/M, z23.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
"smax z28.s, p2/M, z28.s, z0.s\n"
- "uzp1 z20.h, z21.h, z22.h\n"
"smax z29.s, p2/M, z29.s, z0.s\n"
+ "uzp1 z23.h, z23.h, z28.h\n"
+ "st1b { z16.b }, p1, [x23]\n"
"smax z30.s, p2/M, z30.s, z0.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
"smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z29.h, z30.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
"smax z25.s, p2/M, z25.s, z0.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
"smax z26.s, p2/M, z26.s, z0.s\n"
- "smax z27.s, p2/M, z27.s, z0.s\n"
- "uzp1 z23.h, z23.h, z28.h\n"
- "uzp1 z31.b, z31.b, z20.b\n"
- "uzp1 z18.h, z29.h, z30.h\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "st1b { z31.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "uzp1 z23.b, z23.b, z18.b\n"
- "uzp1 z24.b, z24.b, z17.b\n"
- "st1b { z16.b }, p1, [x23]\n"
"st1b { z23.b }, p1, [x22]\n"
+ "smax z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"st1b { z24.b }, p1, [x21]\n"
+ "addvl x27, x27, #1\n"
"56:" // Height 4: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
@@ -1408,8 +1407,8 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL.hpp
deleted file mode 100644
index ebf1883ccf..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL.hpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../std_transforms_sve.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- unsigned int, const unsigned int *, \
- IndirectInputArg<uint8_t>, \
- size_t, size_t, \
- const int8_t *, \
- IndirectOutputArg<uint8_t>, \
- const Requantize32 *, const int32_t *, unsigned int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void sve_hybrid_u8s8qa_dot_4x4VL( ARGLIST );
-
-class cls_sve_hybrid_u8s8qa_dot_4x4VL
-{
-public:
- typedef uint8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
- typedef uint8_t result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 4;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<int32_t>() * 4;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 4, 4, 4> transforms = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
- if (std::is_same<T, uint8_t>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 29.89 };
- case CPUModel::A510:
- return { 17.12 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=sve_hybrid_u8s8qa_dot_4x4VL;
- cls_sve_hybrid_u8s8qa_dot_4x4VL(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL/generic.cpp
deleted file mode 100644
index dc3b7ef3ec..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL/generic.cpp
+++ /dev/null
@@ -1,1502 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void sve_hybrid_u8s8qa_dot_4x4VL (
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
- size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
- const Requantize32 *qp, const int32_t *col_bias, unsigned int
-)
-{
- struct KernelArgs {
- unsigned int num_strings = {};
- const unsigned int *string_lengths = {};
- size_t N = {};
- const int8_t *B_ptr = {};
- size_t output_offset = {};
- size_t input_initial_col = {};
- size_t input_offset = {};
- void *output_ptr = {};
- } ka;
-
- unsigned long flags=0;
- void *input_ptr;
-
- if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
- ka.output_offset=output_arg.indirect.offset;
- flags |= 0x4;
- } else {
- ka.output_ptr=(void *)(output_arg.direct.base);
- ka.output_offset=output_arg.direct.stride;
- }
-
- if (A_arg.is_indirect) {
- input_ptr=(void *)(A_arg.indirect.ptr);
- ka.input_offset=A_arg.indirect.start_row;
- ka.input_initial_col=A_arg.indirect.start_col;
- flags |= 0x8;
- } else {
- assert(num_strings==1);
- input_ptr=(void *)(A_arg.direct.base);
- ka.input_offset=A_arg.direct.stride;
- }
- ka.num_strings = num_strings;
- ka.string_lengths = string_lengths;
- ka.N = N;
- ka.B_ptr = B_ptr;
- if (qp->c_offset > qp->minval) {
- flags |= 0x20;
- }
- __asm__ __volatile__(
- "ptrue p2.b\n"
- "1:" // Row loop
- "cmp %x[M], #0x4\n"
- "bge 43f\n"
- "cmp %x[M], #0x2\n"
- "bgt 29f\n"
- "beq 15f\n"
- "mov x10, %x[col_bias]\n"
- "mov z11.s, #0x0\n"
- "mov z15.b, #0x1\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "2:" // Height 1: Column loop
- "mov x20, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
- "3:" // Height 1: setup done
- "mov x26, #0x0\n"
- "4:" // Height 1: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "cbnz x26, 6f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "b 6f\n"
- "5:" // Height 1: setup direct input
- "mov x24, %x[input_ptr]\n"
- "6:" // Height 1: input setup done
- "cmp x25, #0x10\n"
- "ble 9f\n"
- "7:" // Height 1: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z21.b }, p2/Z, [x28]\n"
- "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z23.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1b { z22.b }, p2/Z, [x28, #6, MUL VL]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x44a01eb0 // sudot z16.s, z21.b, z0.b[0]\n"
- "ld1b { z21.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- ".inst 0x44a01f51 // sudot z17.s, z26.b, z0.b[0]\n"
- ".inst 0x44a01f32 // sudot z18.s, z25.b, z0.b[0]\n"
- ".inst 0x44a01f13 // sudot z19.s, z24.b, z0.b[0]\n"
- ".inst 0x44a81e90 // sudot z16.s, z20.b, z0.b[1]\n"
- "ld1b { z20.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x44a81ef1 // sudot z17.s, z23.b, z0.b[1]\n"
- "ld1b { z23.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x44a81ed2 // sudot z18.s, z22.b, z0.b[1]\n"
- "ld1b { z22.b }, p2/Z, [x28, #-3, MUL VL]\n"
- ".inst 0x44a81eb3 // sudot z19.s, z21.b, z0.b[1]\n"
- "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n"
- ".inst 0x44b01e90 // sudot z16.s, z20.b, z0.b[2]\n"
- "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x44b01f51 // sudot z17.s, z26.b, z0.b[2]\n"
- ".inst 0x44b01f32 // sudot z18.s, z25.b, z0.b[2]\n"
- ".inst 0x44b01f13 // sudot z19.s, z24.b, z0.b[2]\n"
- ".inst 0x44b81ef0 // sudot z16.s, z23.b, z0.b[3]\n"
- ".inst 0x44b81ed1 // sudot z17.s, z22.b, z0.b[3]\n"
- ".inst 0x44b81eb2 // sudot z18.s, z21.b, z0.b[3]\n"
- ".inst 0x44b81e93 // sudot z19.s, z20.b, z0.b[3]\n"
- "tbnz %x[flags], #31, 8f\n"
- "udot z11.s, z0.b, z15.b\n"
- "8:" // Height 1: Multiply loop: unique 1: skip row sum
- "sub x25, x25, #0x10\n"
- "cmp x25, #0x10\n"
- "bgt 7b\n"
- "9:" // Height 1: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
- "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- ".inst 0x44a01ef0 // sudot z16.s, z23.b, z0.b[0]\n"
- ".inst 0x44a01ed1 // sudot z17.s, z22.b, z0.b[0]\n"
- ".inst 0x44a01eb2 // sudot z18.s, z21.b, z0.b[0]\n"
- ".inst 0x44a01e93 // sudot z19.s, z20.b, z0.b[0]\n"
- "ble 10f\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
- "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44a81ef0 // sudot z16.s, z23.b, z0.b[1]\n"
- ".inst 0x44a81ed1 // sudot z17.s, z22.b, z0.b[1]\n"
- ".inst 0x44a81eb2 // sudot z18.s, z21.b, z0.b[1]\n"
- ".inst 0x44a81e93 // sudot z19.s, z20.b, z0.b[1]\n"
- "ble 10f\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
- "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44b01ef0 // sudot z16.s, z23.b, z0.b[2]\n"
- ".inst 0x44b01ed1 // sudot z17.s, z22.b, z0.b[2]\n"
- ".inst 0x44b01eb2 // sudot z18.s, z21.b, z0.b[2]\n"
- ".inst 0x44b01e93 // sudot z19.s, z20.b, z0.b[2]\n"
- "ble 10f\n"
- "ld1b { z23.b }, p2/Z, [x28]\n"
- "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44b81ef0 // sudot z16.s, z23.b, z0.b[3]\n"
- ".inst 0x44b81ed1 // sudot z17.s, z22.b, z0.b[3]\n"
- ".inst 0x44b81eb2 // sudot z18.s, z21.b, z0.b[3]\n"
- ".inst 0x44b81e93 // sudot z19.s, z20.b, z0.b[3]\n"
- "10:" // Height 1: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 11f\n"
- "udot z11.s, z0.b, z15.b\n"
- "11:" // Height 1: Multiply loop: unique 2: skip row sum
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 4b\n"
- "tbnz %x[flags], #31, 12f\n"
- "mov x21, #0x4\n"
- "add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z20.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z20.s, p2/M, z20.s\n"
- "saddv d11, p0, z11.s\n"
- "mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z20.s\n"
- "12:" // Height 1: skip row sum fixup
- "add z16.s, z16.s, z11.s\n"
- "add z17.s, z17.s, z11.s\n"
- "ld1w { z23.s }, p2/Z, [x10]\n"
- "ld1w { z20.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add z18.s, z18.s, z11.s\n"
- "add z19.s, z19.s, z11.s\n"
- "ld1w { z22.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [x10, #3, MUL VL]\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "add z16.s, z16.s, z23.s\n"
- "add z17.s, z17.s, z20.s\n"
- "ld1rw { z20.s }, p2/Z, [x20]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "addvl x10, x10, #4\n"
- "add z18.s, z18.s, z22.s\n"
- "add z19.s, z19.s, z21.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n"
- ".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n"
- ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n"
- ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n"
- "tbz %x[flags], #5, 13f\n"
- "and z23.d, z16.d, z0.d\n"
- "and z22.d, z17.d, z0.d\n"
- "and z21.d, z18.d, z0.d\n"
- "and z20.d, z19.d, z0.d\n"
- "asr z23.s, z23.s, #0x1f\n"
- "asr z22.s, z22.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "asr z20.s, z20.s, #0x1f\n"
- "sqadd z16.s, z16.s, z23.s\n"
- "sqadd z17.s, z17.s, z22.s\n"
- "sqadd z18.s, z18.s, z21.s\n"
- "sqadd z19.s, z19.s, z20.s\n"
- "13:" // Height 1: no shift correction
- "add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "ld1rw { z22.s }, p2/Z, [x20]\n"
- ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1rw { z21.s }, p2/Z, [x20]\n"
- "add z16.s, z16.s, z22.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z22.s\n"
- "add z18.s, z18.s, z22.s\n"
- "ld1rw { z20.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z22.s\n"
- "smin z16.s, p2/M, z16.s, z21.s\n"
- "smin z17.s, p2/M, z17.s, z21.s\n"
- "smin z18.s, p2/M, z18.s, z21.s\n"
- "smin z19.s, p2/M, z19.s, z21.s\n"
- "smax z16.s, p2/M, z16.s, z20.s\n"
- "smax z17.s, p2/M, z17.s, z20.s\n"
- "smax z18.s, p2/M, z18.s, z20.s\n"
- "smax z19.s, p2/M, z19.s, z20.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "14:" // Height 1: Writeback done
- "decw x9, ALL, MUL #4\n"
- "cmp x9, XZR\n"
- "bgt 2b\n"
- "b 58f\n"
- "15:" // Height 2
- "mov x10, %x[col_bias]\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "mov z15.b, #0x1\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "16:" // Height 2: Column loop
- "mov x20, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "17:" // Height 2: setup done
- "mov x26, #0x0\n"
- "18:" // Height 2: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 19f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "cbnz x26, 20f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "b 20f\n"
- "19:" // Height 2: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "20:" // Height 2: input setup done
- "cmp x25, #0x10\n"
- "ble 23f\n"
- "21:" // Height 2: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z25.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1rqb { z1.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x44a01f30 // sudot z16.s, z25.b, z0.b[0]\n"
- ".inst 0x44a11f34 // sudot z20.s, z25.b, z1.b[0]\n"
- "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- ".inst 0x44a01fd1 // sudot z17.s, z30.b, z0.b[0]\n"
- ".inst 0x44a11fd5 // sudot z21.s, z30.b, z1.b[0]\n"
- ".inst 0x44a01fb2 // sudot z18.s, z29.b, z0.b[0]\n"
- ".inst 0x44a11fb6 // sudot z22.s, z29.b, z1.b[0]\n"
- ".inst 0x44a01f93 // sudot z19.s, z28.b, z0.b[0]\n"
- ".inst 0x44a11f97 // sudot z23.s, z28.b, z1.b[0]\n"
- ".inst 0x44a81f10 // sudot z16.s, z24.b, z0.b[1]\n"
- ".inst 0x44a91f14 // sudot z20.s, z24.b, z1.b[1]\n"
- "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x44a81f71 // sudot z17.s, z27.b, z0.b[1]\n"
- ".inst 0x44a91f75 // sudot z21.s, z27.b, z1.b[1]\n"
- "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x44a81f52 // sudot z18.s, z26.b, z0.b[1]\n"
- ".inst 0x44a91f56 // sudot z22.s, z26.b, z1.b[1]\n"
- "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
- ".inst 0x44a81f33 // sudot z19.s, z25.b, z0.b[1]\n"
- ".inst 0x44a91f37 // sudot z23.s, z25.b, z1.b[1]\n"
- "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
- ".inst 0x44b01f10 // sudot z16.s, z24.b, z0.b[2]\n"
- ".inst 0x44b11f14 // sudot z20.s, z24.b, z1.b[2]\n"
- "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x44b01fd1 // sudot z17.s, z30.b, z0.b[2]\n"
- ".inst 0x44b11fd5 // sudot z21.s, z30.b, z1.b[2]\n"
- ".inst 0x44b01fb2 // sudot z18.s, z29.b, z0.b[2]\n"
- ".inst 0x44b11fb6 // sudot z22.s, z29.b, z1.b[2]\n"
- ".inst 0x44b01f93 // sudot z19.s, z28.b, z0.b[2]\n"
- ".inst 0x44b11f97 // sudot z23.s, z28.b, z1.b[2]\n"
- ".inst 0x44b81f70 // sudot z16.s, z27.b, z0.b[3]\n"
- ".inst 0x44b91f74 // sudot z20.s, z27.b, z1.b[3]\n"
- ".inst 0x44b81f51 // sudot z17.s, z26.b, z0.b[3]\n"
- ".inst 0x44b91f55 // sudot z21.s, z26.b, z1.b[3]\n"
- ".inst 0x44b81f32 // sudot z18.s, z25.b, z0.b[3]\n"
- ".inst 0x44b91f36 // sudot z22.s, z25.b, z1.b[3]\n"
- ".inst 0x44b81f13 // sudot z19.s, z24.b, z0.b[3]\n"
- ".inst 0x44b91f17 // sudot z23.s, z24.b, z1.b[3]\n"
- "tbnz %x[flags], #31, 22f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z12.s, z1.b, z15.b\n"
- "22:" // Height 2: Multiply loop: unique 3: skip row sum
- "sub x25, x25, #0x10\n"
- "cmp x25, #0x10\n"
- "bgt 21b\n"
- "23:" // Height 2: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z27.b }, p2/Z, [x28]\n"
- "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1rqb { z1.b }, p0/Z, [x23]\n"
- ".inst 0x44a01f70 // sudot z16.s, z27.b, z0.b[0]\n"
- ".inst 0x44a11f74 // sudot z20.s, z27.b, z1.b[0]\n"
- ".inst 0x44a01f51 // sudot z17.s, z26.b, z0.b[0]\n"
- ".inst 0x44a11f55 // sudot z21.s, z26.b, z1.b[0]\n"
- ".inst 0x44a01f32 // sudot z18.s, z25.b, z0.b[0]\n"
- ".inst 0x44a11f36 // sudot z22.s, z25.b, z1.b[0]\n"
- ".inst 0x44a01f13 // sudot z19.s, z24.b, z0.b[0]\n"
- ".inst 0x44a11f17 // sudot z23.s, z24.b, z1.b[0]\n"
- "ble 24f\n"
- "ld1b { z27.b }, p2/Z, [x28]\n"
- "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44a81f70 // sudot z16.s, z27.b, z0.b[1]\n"
- ".inst 0x44a91f74 // sudot z20.s, z27.b, z1.b[1]\n"
- ".inst 0x44a81f51 // sudot z17.s, z26.b, z0.b[1]\n"
- ".inst 0x44a91f55 // sudot z21.s, z26.b, z1.b[1]\n"
- ".inst 0x44a81f32 // sudot z18.s, z25.b, z0.b[1]\n"
- ".inst 0x44a91f36 // sudot z22.s, z25.b, z1.b[1]\n"
- ".inst 0x44a81f13 // sudot z19.s, z24.b, z0.b[1]\n"
- ".inst 0x44a91f17 // sudot z23.s, z24.b, z1.b[1]\n"
- "ble 24f\n"
- "ld1b { z27.b }, p2/Z, [x28]\n"
- "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44b01f70 // sudot z16.s, z27.b, z0.b[2]\n"
- ".inst 0x44b11f74 // sudot z20.s, z27.b, z1.b[2]\n"
- ".inst 0x44b01f51 // sudot z17.s, z26.b, z0.b[2]\n"
- ".inst 0x44b11f55 // sudot z21.s, z26.b, z1.b[2]\n"
- ".inst 0x44b01f32 // sudot z18.s, z25.b, z0.b[2]\n"
- ".inst 0x44b11f36 // sudot z22.s, z25.b, z1.b[2]\n"
- ".inst 0x44b01f13 // sudot z19.s, z24.b, z0.b[2]\n"
- ".inst 0x44b11f17 // sudot z23.s, z24.b, z1.b[2]\n"
- "ble 24f\n"
- "ld1b { z27.b }, p2/Z, [x28]\n"
- "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44b81f70 // sudot z16.s, z27.b, z0.b[3]\n"
- ".inst 0x44b91f74 // sudot z20.s, z27.b, z1.b[3]\n"
- ".inst 0x44b81f51 // sudot z17.s, z26.b, z0.b[3]\n"
- ".inst 0x44b91f55 // sudot z21.s, z26.b, z1.b[3]\n"
- ".inst 0x44b81f32 // sudot z18.s, z25.b, z0.b[3]\n"
- ".inst 0x44b91f36 // sudot z22.s, z25.b, z1.b[3]\n"
- ".inst 0x44b81f13 // sudot z19.s, z24.b, z0.b[3]\n"
- ".inst 0x44b91f17 // sudot z23.s, z24.b, z1.b[3]\n"
- "24:" // Height 2: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 25f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z12.s, z1.b, z15.b\n"
- "25:" // Height 2: Multiply loop: unique 4: skip row sum
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 18b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x27, x20\n"
- "tbnz %x[flags], #31, 26f\n"
- "mov x21, #0x4\n"
- "add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z24.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z24.s, p2/M, z24.s\n"
- "saddv d11, p0, z11.s\n"
- "saddv d12, p0, z12.s\n"
- "mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z24.s\n"
- "mov z12.s, z12.s[0]\n"
- "mul z12.s, p2/M, z12.s, z24.s\n"
- "26:" // Height 2: skip row sum fixup
- "add z16.s, z16.s, z11.s\n"
- "add z17.s, z17.s, z11.s\n"
- "ld1w { z28.s }, p2/Z, [x10]\n"
- "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add z18.s, z18.s, z11.s\n"
- "add z19.s, z19.s, z11.s\n"
- "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
- "add z20.s, z20.s, z12.s\n"
- "add z21.s, z21.s, z12.s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "add z22.s, z22.s, z12.s\n"
- "add z23.s, z23.s, z12.s\n"
- "ld1rw { z24.s }, p2/Z, [x20]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add z16.s, z16.s, z28.s\n"
- "add z17.s, z17.s, z27.s\n"
- "addvl x10, x10, #4\n"
- "add z18.s, z18.s, z26.s\n"
- "add z19.s, z19.s, z25.s\n"
- "add z20.s, z20.s, z28.s\n"
- "add z21.s, z21.s, z27.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z22.s, z22.s, z26.s\n"
- "add z23.s, z23.s, z25.s\n"
- ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
- ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
- ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
- ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
- ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n"
- ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
- ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n"
- ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
- "tbz %x[flags], #5, 27f\n"
- "and z24.d, z16.d, z0.d\n"
- "and z30.d, z17.d, z0.d\n"
- "and z29.d, z18.d, z0.d\n"
- "and z28.d, z19.d, z0.d\n"
- "and z27.d, z20.d, z0.d\n"
- "and z26.d, z21.d, z0.d\n"
- "asr z24.s, z24.s, #0x1f\n"
- "and z25.d, z22.d, z0.d\n"
- "asr z30.s, z30.s, #0x1f\n"
- "asr z29.s, z29.s, #0x1f\n"
- "asr z28.s, z28.s, #0x1f\n"
- "asr z27.s, z27.s, #0x1f\n"
- "sqadd z16.s, z16.s, z24.s\n"
- "and z24.d, z23.d, z0.d\n"
- "asr z26.s, z26.s, #0x1f\n"
- "asr z25.s, z25.s, #0x1f\n"
- "sqadd z17.s, z17.s, z30.s\n"
- "sqadd z18.s, z18.s, z29.s\n"
- "asr z24.s, z24.s, #0x1f\n"
- "sqadd z19.s, z19.s, z28.s\n"
- "sqadd z20.s, z20.s, z27.s\n"
- "sqadd z21.s, z21.s, z26.s\n"
- "sqadd z22.s, z22.s, z25.s\n"
- "sqadd z23.s, z23.s, z24.s\n"
- "27:" // Height 2: no shift correction
- "add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "ld1rw { z26.s }, p2/Z, [x20]\n"
- ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z16.s, z16.s, z26.s\n"
- ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add x20, %x[qp], %[maxval]\n"
- "add z17.s, z17.s, z26.s\n"
- "add z18.s, z18.s, z26.s\n"
- "ld1rw { z25.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z26.s\n"
- "add z20.s, z20.s, z26.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z21.s, z21.s, z26.s\n"
- "add z22.s, z22.s, z26.s\n"
- "ld1rw { z24.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z26.s\n"
- "smin z16.s, p2/M, z16.s, z25.s\n"
- "smin z17.s, p2/M, z17.s, z25.s\n"
- "smin z18.s, p2/M, z18.s, z25.s\n"
- "smin z19.s, p2/M, z19.s, z25.s\n"
- "smin z20.s, p2/M, z20.s, z25.s\n"
- "smin z21.s, p2/M, z21.s, z25.s\n"
- "smin z22.s, p2/M, z22.s, z25.s\n"
- "smin z23.s, p2/M, z23.s, z25.s\n"
- "smax z16.s, p2/M, z16.s, z24.s\n"
- "smax z17.s, p2/M, z17.s, z24.s\n"
- "smax z18.s, p2/M, z18.s, z24.s\n"
- "smax z19.s, p2/M, z19.s, z24.s\n"
- "smax z20.s, p2/M, z20.s, z24.s\n"
- "smax z21.s, p2/M, z21.s, z24.s\n"
- "smax z22.s, p2/M, z22.s, z24.s\n"
- "smax z23.s, p2/M, z23.s, z24.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z18.h, z18.h, z19.h\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z17.h, z22.h, z23.h\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "uzp1 z20.b, z20.b, z17.b\n"
- "st1b { z16.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "st1b { z20.b }, p1, [x24]\n"
- "28:" // Height 2: Writeback done
- "decw x9, ALL, MUL #4\n"
- "cmp x9, XZR\n"
- "bgt 16b\n"
- "b 58f\n"
- "29:" // Height 3
- "mov x10, %x[col_bias]\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "mov z13.s, #0x0\n"
- "mov z15.b, #0x1\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "30:" // Height 3: Column loop
- "mov x20, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "mov z24.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "mov z27.s, #0x0\n"
- "31:" // Height 3: setup done
- "mov x26, #0x0\n"
- "32:" // Height 3: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 33f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "cbnz x26, 34f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "b 34f\n"
- "33:" // Height 3: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "34:" // Height 3: input setup done
- "cmp x25, #0x10\n"
- "ble 37f\n"
- "35:" // Height 3: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1rqb { z1.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x44a01cb0 // sudot z16.s, z5.b, z0.b[0]\n"
- ".inst 0x44a11cb4 // sudot z20.s, z5.b, z1.b[0]\n"
- ".inst 0x44a01fb1 // sudot z17.s, z29.b, z0.b[0]\n"
- ".inst 0x44a11fb5 // sudot z21.s, z29.b, z1.b[0]\n"
- ".inst 0x44a01c92 // sudot z18.s, z4.b, z0.b[0]\n"
- ".inst 0x44a21cb8 // sudot z24.s, z5.b, z2.b[0]\n"
- ".inst 0x44a21fb9 // sudot z25.s, z29.b, z2.b[0]\n"
- "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- ".inst 0x44a11c96 // sudot z22.s, z4.b, z1.b[0]\n"
- ".inst 0x44a21c9a // sudot z26.s, z4.b, z2.b[0]\n"
- ".inst 0x44a01f93 // sudot z19.s, z28.b, z0.b[0]\n"
- ".inst 0x44a11f97 // sudot z23.s, z28.b, z1.b[0]\n"
- ".inst 0x44a21f9b // sudot z27.s, z28.b, z2.b[0]\n"
- ".inst 0x44a81c70 // sudot z16.s, z3.b, z0.b[1]\n"
- "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x44a91c74 // sudot z20.s, z3.b, z1.b[1]\n"
- ".inst 0x44aa1c78 // sudot z24.s, z3.b, z2.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x44a81ff1 // sudot z17.s, z31.b, z0.b[1]\n"
- ".inst 0x44a91ff5 // sudot z21.s, z31.b, z1.b[1]\n"
- ".inst 0x44aa1ff9 // sudot z25.s, z31.b, z2.b[1]\n"
- ".inst 0x44a81fd2 // sudot z18.s, z30.b, z0.b[1]\n"
- "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x44a91fd6 // sudot z22.s, z30.b, z1.b[1]\n"
- ".inst 0x44aa1fda // sudot z26.s, z30.b, z2.b[1]\n"
- "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
- ".inst 0x44a81fb3 // sudot z19.s, z29.b, z0.b[1]\n"
- ".inst 0x44a91fb7 // sudot z23.s, z29.b, z1.b[1]\n"
- ".inst 0x44aa1fbb // sudot z27.s, z29.b, z2.b[1]\n"
- ".inst 0x44b01f90 // sudot z16.s, z28.b, z0.b[2]\n"
- "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n"
- ".inst 0x44b11f94 // sudot z20.s, z28.b, z1.b[2]\n"
- ".inst 0x44b21f98 // sudot z24.s, z28.b, z2.b[2]\n"
- "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x44b01cb1 // sudot z17.s, z5.b, z0.b[2]\n"
- ".inst 0x44b11cb5 // sudot z21.s, z5.b, z1.b[2]\n"
- ".inst 0x44b21cb9 // sudot z25.s, z5.b, z2.b[2]\n"
- ".inst 0x44b01c92 // sudot z18.s, z4.b, z0.b[2]\n"
- ".inst 0x44b11c96 // sudot z22.s, z4.b, z1.b[2]\n"
- ".inst 0x44b21c9a // sudot z26.s, z4.b, z2.b[2]\n"
- ".inst 0x44b01c73 // sudot z19.s, z3.b, z0.b[2]\n"
- ".inst 0x44b11c77 // sudot z23.s, z3.b, z1.b[2]\n"
- ".inst 0x44b21c7b // sudot z27.s, z3.b, z2.b[2]\n"
- ".inst 0x44b81ff0 // sudot z16.s, z31.b, z0.b[3]\n"
- ".inst 0x44b91ff4 // sudot z20.s, z31.b, z1.b[3]\n"
- ".inst 0x44ba1ff8 // sudot z24.s, z31.b, z2.b[3]\n"
- ".inst 0x44b81fd1 // sudot z17.s, z30.b, z0.b[3]\n"
- ".inst 0x44b91fd5 // sudot z21.s, z30.b, z1.b[3]\n"
- ".inst 0x44ba1fd9 // sudot z25.s, z30.b, z2.b[3]\n"
- ".inst 0x44b81fb2 // sudot z18.s, z29.b, z0.b[3]\n"
- ".inst 0x44b91fb6 // sudot z22.s, z29.b, z1.b[3]\n"
- ".inst 0x44ba1fba // sudot z26.s, z29.b, z2.b[3]\n"
- ".inst 0x44b81f93 // sudot z19.s, z28.b, z0.b[3]\n"
- ".inst 0x44b91f97 // sudot z23.s, z28.b, z1.b[3]\n"
- ".inst 0x44ba1f9b // sudot z27.s, z28.b, z2.b[3]\n"
- "tbnz %x[flags], #31, 36f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z12.s, z1.b, z15.b\n"
- "udot z13.s, z2.b, z15.b\n"
- "36:" // Height 3: Multiply loop: unique 5: skip row sum
- "sub x25, x25, #0x10\n"
- "cmp x25, #0x10\n"
- "bgt 35b\n"
- "37:" // Height 3: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z31.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1rqb { z1.b }, p0/Z, [x23]\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- ".inst 0x44a01ff0 // sudot z16.s, z31.b, z0.b[0]\n"
- ".inst 0x44a11ff4 // sudot z20.s, z31.b, z1.b[0]\n"
- ".inst 0x44a01fd1 // sudot z17.s, z30.b, z0.b[0]\n"
- ".inst 0x44a11fd5 // sudot z21.s, z30.b, z1.b[0]\n"
- ".inst 0x44a01fb2 // sudot z18.s, z29.b, z0.b[0]\n"
- ".inst 0x44a11fb6 // sudot z22.s, z29.b, z1.b[0]\n"
- ".inst 0x44a21ff8 // sudot z24.s, z31.b, z2.b[0]\n"
- ".inst 0x44a21fd9 // sudot z25.s, z30.b, z2.b[0]\n"
- ".inst 0x44a21fba // sudot z26.s, z29.b, z2.b[0]\n"
- ".inst 0x44a01f93 // sudot z19.s, z28.b, z0.b[0]\n"
- ".inst 0x44a11f97 // sudot z23.s, z28.b, z1.b[0]\n"
- ".inst 0x44a21f9b // sudot z27.s, z28.b, z2.b[0]\n"
- "ble 38f\n"
- "ld1b { z31.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44a81ff0 // sudot z16.s, z31.b, z0.b[1]\n"
- ".inst 0x44a91ff4 // sudot z20.s, z31.b, z1.b[1]\n"
- ".inst 0x44aa1ff8 // sudot z24.s, z31.b, z2.b[1]\n"
- ".inst 0x44a81fd1 // sudot z17.s, z30.b, z0.b[1]\n"
- ".inst 0x44a91fd5 // sudot z21.s, z30.b, z1.b[1]\n"
- ".inst 0x44aa1fd9 // sudot z25.s, z30.b, z2.b[1]\n"
- ".inst 0x44a81fb2 // sudot z18.s, z29.b, z0.b[1]\n"
- ".inst 0x44a91fb6 // sudot z22.s, z29.b, z1.b[1]\n"
- ".inst 0x44aa1fba // sudot z26.s, z29.b, z2.b[1]\n"
- ".inst 0x44a81f93 // sudot z19.s, z28.b, z0.b[1]\n"
- ".inst 0x44a91f97 // sudot z23.s, z28.b, z1.b[1]\n"
- ".inst 0x44aa1f9b // sudot z27.s, z28.b, z2.b[1]\n"
- "ble 38f\n"
- "ld1b { z31.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44b01ff0 // sudot z16.s, z31.b, z0.b[2]\n"
- ".inst 0x44b11ff4 // sudot z20.s, z31.b, z1.b[2]\n"
- ".inst 0x44b21ff8 // sudot z24.s, z31.b, z2.b[2]\n"
- ".inst 0x44b01fd1 // sudot z17.s, z30.b, z0.b[2]\n"
- ".inst 0x44b11fd5 // sudot z21.s, z30.b, z1.b[2]\n"
- ".inst 0x44b21fd9 // sudot z25.s, z30.b, z2.b[2]\n"
- ".inst 0x44b01fb2 // sudot z18.s, z29.b, z0.b[2]\n"
- ".inst 0x44b11fb6 // sudot z22.s, z29.b, z1.b[2]\n"
- ".inst 0x44b21fba // sudot z26.s, z29.b, z2.b[2]\n"
- ".inst 0x44b01f93 // sudot z19.s, z28.b, z0.b[2]\n"
- ".inst 0x44b11f97 // sudot z23.s, z28.b, z1.b[2]\n"
- ".inst 0x44b21f9b // sudot z27.s, z28.b, z2.b[2]\n"
- "ble 38f\n"
- "ld1b { z31.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44b81ff0 // sudot z16.s, z31.b, z0.b[3]\n"
- ".inst 0x44b91ff4 // sudot z20.s, z31.b, z1.b[3]\n"
- ".inst 0x44ba1ff8 // sudot z24.s, z31.b, z2.b[3]\n"
- ".inst 0x44b81fd1 // sudot z17.s, z30.b, z0.b[3]\n"
- ".inst 0x44b91fd5 // sudot z21.s, z30.b, z1.b[3]\n"
- ".inst 0x44ba1fd9 // sudot z25.s, z30.b, z2.b[3]\n"
- ".inst 0x44b81fb2 // sudot z18.s, z29.b, z0.b[3]\n"
- ".inst 0x44b91fb6 // sudot z22.s, z29.b, z1.b[3]\n"
- ".inst 0x44ba1fba // sudot z26.s, z29.b, z2.b[3]\n"
- ".inst 0x44b81f93 // sudot z19.s, z28.b, z0.b[3]\n"
- ".inst 0x44b91f97 // sudot z23.s, z28.b, z1.b[3]\n"
- ".inst 0x44ba1f9b // sudot z27.s, z28.b, z2.b[3]\n"
- "38:" // Height 3: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 39f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z12.s, z1.b, z15.b\n"
- "udot z13.s, z2.b, z15.b\n"
- "39:" // Height 3: Multiply loop: unique 6: skip row sum
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 32b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
- "tbnz %x[flags], #31, 40f\n"
- "mov x21, #0x4\n"
- "add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z28.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z28.s, p2/M, z28.s\n"
- "saddv d11, p0, z11.s\n"
- "saddv d12, p0, z12.s\n"
- "saddv d13, p0, z13.s\n"
- "mov z11.s, z11.s[0]\n"
- "mov z12.s, z12.s[0]\n"
- "mul z11.s, p2/M, z11.s, z28.s\n"
- "mul z12.s, p2/M, z12.s, z28.s\n"
- "mov z13.s, z13.s[0]\n"
- "mul z13.s, p2/M, z13.s, z28.s\n"
- "40:" // Height 3: skip row sum fixup
- "add z16.s, z16.s, z11.s\n"
- "add z17.s, z17.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add z18.s, z18.s, z11.s\n"
- "add z19.s, z19.s, z11.s\n"
- "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n"
- "add z20.s, z20.s, z12.s\n"
- "add z21.s, z21.s, z12.s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "add z22.s, z22.s, z12.s\n"
- "add z23.s, z23.s, z12.s\n"
- "ld1rw { z28.s }, p2/Z, [x20]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add z24.s, z24.s, z13.s\n"
- "add z25.s, z25.s, z13.s\n"
- "addvl x10, x10, #4\n"
- "add z26.s, z26.s, z13.s\n"
- "add z27.s, z27.s, z13.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z31.s\n"
- "add z18.s, z18.s, z30.s\n"
- "add z19.s, z19.s, z29.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z31.s\n"
- "add z22.s, z22.s, z30.s\n"
- "add z23.s, z23.s, z29.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z31.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z26.s, z26.s, z30.s\n"
- "add z27.s, z27.s, z29.s\n"
- ".inst 0x04bc7610 // sqrdmulh z16.s, z16.s, z28.s\n"
- ".inst 0x04bc7631 // sqrdmulh z17.s, z17.s, z28.s\n"
- ".inst 0x04bc7652 // sqrdmulh z18.s, z18.s, z28.s\n"
- ".inst 0x04bc7673 // sqrdmulh z19.s, z19.s, z28.s\n"
- ".inst 0x04bc7694 // sqrdmulh z20.s, z20.s, z28.s\n"
- ".inst 0x04bc76b5 // sqrdmulh z21.s, z21.s, z28.s\n"
- ".inst 0x04bc76d6 // sqrdmulh z22.s, z22.s, z28.s\n"
- ".inst 0x04bc76f7 // sqrdmulh z23.s, z23.s, z28.s\n"
- ".inst 0x04bc7718 // sqrdmulh z24.s, z24.s, z28.s\n"
- ".inst 0x04bc7739 // sqrdmulh z25.s, z25.s, z28.s\n"
- ".inst 0x04bc775a // sqrdmulh z26.s, z26.s, z28.s\n"
- ".inst 0x04bc777b // sqrdmulh z27.s, z27.s, z28.s\n"
- "tbz %x[flags], #5, 41f\n"
- "and z1.d, z16.d, z0.d\n"
- "and z31.d, z17.d, z0.d\n"
- "and z30.d, z18.d, z0.d\n"
- "and z29.d, z19.d, z0.d\n"
- "and z28.d, z20.d, z0.d\n"
- "and z3.d, z21.d, z0.d\n"
- "asr z1.s, z1.s, #0x1f\n"
- "asr z31.s, z31.s, #0x1f\n"
- "asr z30.s, z30.s, #0x1f\n"
- "asr z29.s, z29.s, #0x1f\n"
- "asr z28.s, z28.s, #0x1f\n"
- "and z2.d, z22.d, z0.d\n"
- "sqadd z16.s, z16.s, z1.s\n"
- "sqadd z17.s, z17.s, z31.s\n"
- "sqadd z18.s, z18.s, z30.s\n"
- "sqadd z19.s, z19.s, z29.s\n"
- "sqadd z20.s, z20.s, z28.s\n"
- "and z1.d, z23.d, z0.d\n"
- "and z31.d, z24.d, z0.d\n"
- "and z30.d, z25.d, z0.d\n"
- "and z29.d, z26.d, z0.d\n"
- "and z28.d, z27.d, z0.d\n"
- "asr z3.s, z3.s, #0x1f\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
- "asr z31.s, z31.s, #0x1f\n"
- "asr z30.s, z30.s, #0x1f\n"
- "asr z29.s, z29.s, #0x1f\n"
- "asr z28.s, z28.s, #0x1f\n"
- "sqadd z21.s, z21.s, z3.s\n"
- "sqadd z22.s, z22.s, z2.s\n"
- "sqadd z23.s, z23.s, z1.s\n"
- "sqadd z24.s, z24.s, z31.s\n"
- "sqadd z25.s, z25.s, z30.s\n"
- "sqadd z26.s, z26.s, z29.s\n"
- "sqadd z27.s, z27.s, z28.s\n"
- "41:" // Height 3: no shift correction
- "add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "ld1rw { z30.s }, p2/Z, [x20]\n"
- ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z16.s, z16.s, z30.s\n"
- ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z30.s\n"
- "add z18.s, z18.s, z30.s\n"
- ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
- ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z19.s, z19.s, z30.s\n"
- "add z20.s, z20.s, z30.s\n"
- ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "add x20, %x[qp], %[maxval]\n"
- "add z21.s, z21.s, z30.s\n"
- "add z22.s, z22.s, z30.s\n"
- "ld1rw { z29.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z30.s\n"
- "add z24.s, z24.s, z30.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z30.s\n"
- "add z26.s, z26.s, z30.s\n"
- "ld1rw { z28.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z30.s\n"
- "smin z16.s, p2/M, z16.s, z29.s\n"
- "smin z17.s, p2/M, z17.s, z29.s\n"
- "smin z18.s, p2/M, z18.s, z29.s\n"
- "smin z19.s, p2/M, z19.s, z29.s\n"
- "smin z20.s, p2/M, z20.s, z29.s\n"
- "smin z21.s, p2/M, z21.s, z29.s\n"
- "smin z22.s, p2/M, z22.s, z29.s\n"
- "smin z23.s, p2/M, z23.s, z29.s\n"
- "smin z24.s, p2/M, z24.s, z29.s\n"
- "smin z25.s, p2/M, z25.s, z29.s\n"
- "smin z26.s, p2/M, z26.s, z29.s\n"
- "smin z27.s, p2/M, z27.s, z29.s\n"
- "smax z16.s, p2/M, z16.s, z28.s\n"
- "smax z17.s, p2/M, z17.s, z28.s\n"
- "smax z18.s, p2/M, z18.s, z28.s\n"
- "smax z19.s, p2/M, z19.s, z28.s\n"
- "smax z20.s, p2/M, z20.s, z28.s\n"
- "smax z21.s, p2/M, z21.s, z28.s\n"
- "smax z22.s, p2/M, z22.s, z28.s\n"
- "smax z23.s, p2/M, z23.s, z28.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "smax z24.s, p2/M, z24.s, z28.s\n"
- "smax z25.s, p2/M, z25.s, z28.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "smax z26.s, p2/M, z26.s, z28.s\n"
- "smax z27.s, p2/M, z27.s, z28.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "uzp1 z18.h, z22.h, z23.h\n"
- "uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "uzp1 z20.b, z20.b, z18.b\n"
- "st1b { z16.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "uzp1 z24.b, z24.b, z17.b\n"
- "st1b { z20.b }, p1, [x24]\n"
- "st1b { z24.b }, p1, [x23]\n"
- "42:" // Height 3: Writeback done
- "decw x9, ALL, MUL #4\n"
- "cmp x9, XZR\n"
- "bgt 30b\n"
- "b 58f\n"
- "43:" // Height 4
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "mov x20, #0x4\n"
- "mov x10, %x[col_bias]\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov z13.s, #0x0\n"
- "mov z14.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
- "mov z15.b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "44:" // Height 4: Column loop
- "mov x20, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "mov z24.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "mov z27.s, #0x0\n"
- "mov z28.s, #0x0\n"
- "mov z29.s, #0x0\n"
- "mov z30.s, #0x0\n"
- "mov z31.s, #0x0\n"
- "45:" // Height 4: setup done
- "mov x26, #0x0\n"
- "46:" // Height 4: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 47f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "ldr x21, [x20, #0x18]\n"
- "cbnz x26, 48f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "add x21, x21, x20\n"
- "b 48f\n"
- "47:" // Height 4: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "add x21, x22, x21\n"
- "48:" // Height 4: input setup done
- "cmp x25, #0x10\n"
- "ble 51f\n"
- "49:" // Height 4: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1rqb { z1.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "ld1rqb { z3.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x44a01cb0 // sudot z16.s, z5.b, z0.b[0]\n"
- ".inst 0x44a11cb4 // sudot z20.s, z5.b, z1.b[0]\n"
- ".inst 0x44a01d51 // sudot z17.s, z10.b, z0.b[0]\n"
- ".inst 0x44a11d55 // sudot z21.s, z10.b, z1.b[0]\n"
- ".inst 0x44a21cb8 // sudot z24.s, z5.b, z2.b[0]\n"
- ".inst 0x44a31cbc // sudot z28.s, z5.b, z3.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- ".inst 0x44a21d59 // sudot z25.s, z10.b, z2.b[0]\n"
- ".inst 0x44a31d5d // sudot z29.s, z10.b, z3.b[0]\n"
- ".inst 0x44a01c92 // sudot z18.s, z4.b, z0.b[0]\n"
- ".inst 0x44a11c96 // sudot z22.s, z4.b, z1.b[0]\n"
- ".inst 0x44a21c9a // sudot z26.s, z4.b, z2.b[0]\n"
- ".inst 0x44a31c9e // sudot z30.s, z4.b, z3.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x44a01d33 // sudot z19.s, z9.b, z0.b[0]\n"
- ".inst 0x44a11d37 // sudot z23.s, z9.b, z1.b[0]\n"
- ".inst 0x44a21d3b // sudot z27.s, z9.b, z2.b[0]\n"
- ".inst 0x44a31d3f // sudot z31.s, z9.b, z3.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x44a81d10 // sudot z16.s, z8.b, z0.b[1]\n"
- ".inst 0x44a91d14 // sudot z20.s, z8.b, z1.b[1]\n"
- ".inst 0x44aa1d18 // sudot z24.s, z8.b, z2.b[1]\n"
- ".inst 0x44ab1d1c // sudot z28.s, z8.b, z3.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x44a81cf1 // sudot z17.s, z7.b, z0.b[1]\n"
- ".inst 0x44a91cf5 // sudot z21.s, z7.b, z1.b[1]\n"
- ".inst 0x44aa1cf9 // sudot z25.s, z7.b, z2.b[1]\n"
- ".inst 0x44ab1cfd // sudot z29.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x44a81cd2 // sudot z18.s, z6.b, z0.b[1]\n"
- ".inst 0x44a91cd6 // sudot z22.s, z6.b, z1.b[1]\n"
- ".inst 0x44aa1cda // sudot z26.s, z6.b, z2.b[1]\n"
- ".inst 0x44ab1cde // sudot z30.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
- ".inst 0x44a81cb3 // sudot z19.s, z5.b, z0.b[1]\n"
- ".inst 0x44a91cb7 // sudot z23.s, z5.b, z1.b[1]\n"
- ".inst 0x44aa1cbb // sudot z27.s, z5.b, z2.b[1]\n"
- ".inst 0x44ab1cbf // sudot z31.s, z5.b, z3.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
- ".inst 0x44b01c90 // sudot z16.s, z4.b, z0.b[2]\n"
- ".inst 0x44b11c94 // sudot z20.s, z4.b, z1.b[2]\n"
- ".inst 0x44b21c98 // sudot z24.s, z4.b, z2.b[2]\n"
- ".inst 0x44b31c9c // sudot z28.s, z4.b, z3.b[2]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x44b01d51 // sudot z17.s, z10.b, z0.b[2]\n"
- ".inst 0x44b11d55 // sudot z21.s, z10.b, z1.b[2]\n"
- ".inst 0x44b21d59 // sudot z25.s, z10.b, z2.b[2]\n"
- ".inst 0x44b31d5d // sudot z29.s, z10.b, z3.b[2]\n"
- ".inst 0x44b01d32 // sudot z18.s, z9.b, z0.b[2]\n"
- ".inst 0x44b11d36 // sudot z22.s, z9.b, z1.b[2]\n"
- ".inst 0x44b21d3a // sudot z26.s, z9.b, z2.b[2]\n"
- ".inst 0x44b31d3e // sudot z30.s, z9.b, z3.b[2]\n"
- ".inst 0x44b01d13 // sudot z19.s, z8.b, z0.b[2]\n"
- ".inst 0x44b11d17 // sudot z23.s, z8.b, z1.b[2]\n"
- ".inst 0x44b21d1b // sudot z27.s, z8.b, z2.b[2]\n"
- ".inst 0x44b31d1f // sudot z31.s, z8.b, z3.b[2]\n"
- ".inst 0x44b81cf0 // sudot z16.s, z7.b, z0.b[3]\n"
- ".inst 0x44b91cf4 // sudot z20.s, z7.b, z1.b[3]\n"
- ".inst 0x44ba1cf8 // sudot z24.s, z7.b, z2.b[3]\n"
- ".inst 0x44bb1cfc // sudot z28.s, z7.b, z3.b[3]\n"
- ".inst 0x44b81cd1 // sudot z17.s, z6.b, z0.b[3]\n"
- ".inst 0x44b91cd5 // sudot z21.s, z6.b, z1.b[3]\n"
- ".inst 0x44ba1cd9 // sudot z25.s, z6.b, z2.b[3]\n"
- ".inst 0x44bb1cdd // sudot z29.s, z6.b, z3.b[3]\n"
- ".inst 0x44b81cb2 // sudot z18.s, z5.b, z0.b[3]\n"
- ".inst 0x44b91cb6 // sudot z22.s, z5.b, z1.b[3]\n"
- ".inst 0x44ba1cba // sudot z26.s, z5.b, z2.b[3]\n"
- ".inst 0x44bb1cbe // sudot z30.s, z5.b, z3.b[3]\n"
- ".inst 0x44b81c93 // sudot z19.s, z4.b, z0.b[3]\n"
- ".inst 0x44b91c97 // sudot z23.s, z4.b, z1.b[3]\n"
- ".inst 0x44ba1c9b // sudot z27.s, z4.b, z2.b[3]\n"
- ".inst 0x44bb1c9f // sudot z31.s, z4.b, z3.b[3]\n"
- "tbnz %x[flags], #31, 50f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z12.s, z1.b, z15.b\n"
- "udot z13.s, z2.b, z15.b\n"
- "udot z14.s, z3.b, z15.b\n"
- "50:" // Height 4: Multiply loop: unique 7: skip row sum
- "sub x25, x25, #0x10\n"
- "cmp x25, #0x10\n"
- "bgt 49b\n"
- "51:" // Height 4: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z7.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1rqb { z1.b }, p0/Z, [x23]\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "ld1rqb { z3.b }, p0/Z, [x21]\n"
- ".inst 0x44a01cf0 // sudot z16.s, z7.b, z0.b[0]\n"
- ".inst 0x44a11cf4 // sudot z20.s, z7.b, z1.b[0]\n"
- ".inst 0x44a01cd1 // sudot z17.s, z6.b, z0.b[0]\n"
- ".inst 0x44a11cd5 // sudot z21.s, z6.b, z1.b[0]\n"
- ".inst 0x44a01cb2 // sudot z18.s, z5.b, z0.b[0]\n"
- ".inst 0x44a11cb6 // sudot z22.s, z5.b, z1.b[0]\n"
- ".inst 0x44a21cf8 // sudot z24.s, z7.b, z2.b[0]\n"
- ".inst 0x44a31cfc // sudot z28.s, z7.b, z3.b[0]\n"
- ".inst 0x44a21cd9 // sudot z25.s, z6.b, z2.b[0]\n"
- ".inst 0x44a31cdd // sudot z29.s, z6.b, z3.b[0]\n"
- ".inst 0x44a21cba // sudot z26.s, z5.b, z2.b[0]\n"
- ".inst 0x44a31cbe // sudot z30.s, z5.b, z3.b[0]\n"
- ".inst 0x44a01c93 // sudot z19.s, z4.b, z0.b[0]\n"
- ".inst 0x44a11c97 // sudot z23.s, z4.b, z1.b[0]\n"
- ".inst 0x44a21c9b // sudot z27.s, z4.b, z2.b[0]\n"
- ".inst 0x44a31c9f // sudot z31.s, z4.b, z3.b[0]\n"
- "ble 52f\n"
- "ld1b { z7.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44a81cf0 // sudot z16.s, z7.b, z0.b[1]\n"
- ".inst 0x44a91cf4 // sudot z20.s, z7.b, z1.b[1]\n"
- ".inst 0x44aa1cf8 // sudot z24.s, z7.b, z2.b[1]\n"
- ".inst 0x44ab1cfc // sudot z28.s, z7.b, z3.b[1]\n"
- ".inst 0x44a81cd1 // sudot z17.s, z6.b, z0.b[1]\n"
- ".inst 0x44a91cd5 // sudot z21.s, z6.b, z1.b[1]\n"
- ".inst 0x44aa1cd9 // sudot z25.s, z6.b, z2.b[1]\n"
- ".inst 0x44ab1cdd // sudot z29.s, z6.b, z3.b[1]\n"
- ".inst 0x44a81cb2 // sudot z18.s, z5.b, z0.b[1]\n"
- ".inst 0x44a91cb6 // sudot z22.s, z5.b, z1.b[1]\n"
- ".inst 0x44aa1cba // sudot z26.s, z5.b, z2.b[1]\n"
- ".inst 0x44ab1cbe // sudot z30.s, z5.b, z3.b[1]\n"
- ".inst 0x44a81c93 // sudot z19.s, z4.b, z0.b[1]\n"
- ".inst 0x44a91c97 // sudot z23.s, z4.b, z1.b[1]\n"
- ".inst 0x44aa1c9b // sudot z27.s, z4.b, z2.b[1]\n"
- ".inst 0x44ab1c9f // sudot z31.s, z4.b, z3.b[1]\n"
- "ble 52f\n"
- "ld1b { z7.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x4\n"
- "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44b01cf0 // sudot z16.s, z7.b, z0.b[2]\n"
- ".inst 0x44b11cf4 // sudot z20.s, z7.b, z1.b[2]\n"
- ".inst 0x44b21cf8 // sudot z24.s, z7.b, z2.b[2]\n"
- ".inst 0x44b31cfc // sudot z28.s, z7.b, z3.b[2]\n"
- ".inst 0x44b01cd1 // sudot z17.s, z6.b, z0.b[2]\n"
- ".inst 0x44b11cd5 // sudot z21.s, z6.b, z1.b[2]\n"
- ".inst 0x44b21cd9 // sudot z25.s, z6.b, z2.b[2]\n"
- ".inst 0x44b31cdd // sudot z29.s, z6.b, z3.b[2]\n"
- ".inst 0x44b01cb2 // sudot z18.s, z5.b, z0.b[2]\n"
- ".inst 0x44b11cb6 // sudot z22.s, z5.b, z1.b[2]\n"
- ".inst 0x44b21cba // sudot z26.s, z5.b, z2.b[2]\n"
- ".inst 0x44b31cbe // sudot z30.s, z5.b, z3.b[2]\n"
- ".inst 0x44b01c93 // sudot z19.s, z4.b, z0.b[2]\n"
- ".inst 0x44b11c97 // sudot z23.s, z4.b, z1.b[2]\n"
- ".inst 0x44b21c9b // sudot z27.s, z4.b, z2.b[2]\n"
- ".inst 0x44b31c9f // sudot z31.s, z4.b, z3.b[2]\n"
- "ble 52f\n"
- "ld1b { z7.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- ".inst 0x44b81cf0 // sudot z16.s, z7.b, z0.b[3]\n"
- ".inst 0x44b91cf4 // sudot z20.s, z7.b, z1.b[3]\n"
- ".inst 0x44ba1cf8 // sudot z24.s, z7.b, z2.b[3]\n"
- ".inst 0x44bb1cfc // sudot z28.s, z7.b, z3.b[3]\n"
- ".inst 0x44b81cd1 // sudot z17.s, z6.b, z0.b[3]\n"
- ".inst 0x44b91cd5 // sudot z21.s, z6.b, z1.b[3]\n"
- ".inst 0x44ba1cd9 // sudot z25.s, z6.b, z2.b[3]\n"
- ".inst 0x44bb1cdd // sudot z29.s, z6.b, z3.b[3]\n"
- ".inst 0x44b81cb2 // sudot z18.s, z5.b, z0.b[3]\n"
- ".inst 0x44b91cb6 // sudot z22.s, z5.b, z1.b[3]\n"
- ".inst 0x44ba1cba // sudot z26.s, z5.b, z2.b[3]\n"
- ".inst 0x44bb1cbe // sudot z30.s, z5.b, z3.b[3]\n"
- ".inst 0x44b81c93 // sudot z19.s, z4.b, z0.b[3]\n"
- ".inst 0x44b91c97 // sudot z23.s, z4.b, z1.b[3]\n"
- ".inst 0x44ba1c9b // sudot z27.s, z4.b, z2.b[3]\n"
- ".inst 0x44bb1c9f // sudot z31.s, z4.b, z3.b[3]\n"
- "52:" // Height 4: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 53f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z12.s, z1.b, z15.b\n"
- "udot z13.s, z2.b, z15.b\n"
- "udot z14.s, z3.b, z15.b\n"
- "53:" // Height 4: Multiply loop: unique 8: skip row sum
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 46b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x27, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "tbnz %x[flags], #31, 54f\n"
- "mov x21, #0x4\n"
- "add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- "whilelt p0.s, XZR, x21\n"
- "neg z0.s, p2/M, z0.s\n"
- "saddv d11, p0, z11.s\n"
- "saddv d12, p0, z12.s\n"
- "saddv d13, p0, z13.s\n"
- "saddv d14, p0, z14.s\n"
- "mov z11.s, z11.s[0]\n"
- "mov z12.s, z12.s[0]\n"
- "mul z11.s, p2/M, z11.s, z0.s\n"
- "mul z12.s, p2/M, z12.s, z0.s\n"
- "mov z13.s, z13.s[0]\n"
- "mov z14.s, z14.s[0]\n"
- "mul z13.s, p2/M, z13.s, z0.s\n"
- "mul z14.s, p2/M, z14.s, z0.s\n"
- "54:" // Height 4: skip row sum fixup
- "add z16.s, z16.s, z11.s\n"
- "add z17.s, z17.s, z11.s\n"
- "ld1w { z4.s }, p2/Z, [x10]\n"
- "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add z18.s, z18.s, z11.s\n"
- "add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
- "add z20.s, z20.s, z12.s\n"
- "add z21.s, z21.s, z12.s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "add z22.s, z22.s, z12.s\n"
- "add z23.s, z23.s, z12.s\n"
- "ld1rw { z1.s }, p2/Z, [x20]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add z24.s, z24.s, z13.s\n"
- "add z25.s, z25.s, z13.s\n"
- "addvl x10, x10, #4\n"
- "add z26.s, z26.s, z13.s\n"
- "add z27.s, z27.s, z13.s\n"
- "add z28.s, z28.s, z14.s\n"
- "add z29.s, z29.s, z14.s\n"
- "add z30.s, z30.s, z14.s\n"
- "add z31.s, z31.s, z14.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z0.s\n"
- "add z18.s, z18.s, z3.s\n"
- "add z19.s, z19.s, z2.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z21.s, z21.s, z0.s\n"
- "add z22.s, z22.s, z3.s\n"
- "add z23.s, z23.s, z2.s\n"
- "add z24.s, z24.s, z4.s\n"
- "add z25.s, z25.s, z0.s\n"
- "add z26.s, z26.s, z3.s\n"
- "add z27.s, z27.s, z2.s\n"
- "add z28.s, z28.s, z4.s\n"
- "add z29.s, z29.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z30.s, z30.s, z3.s\n"
- "add z31.s, z31.s, z2.s\n"
- ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n"
- ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n"
- ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n"
- ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n"
- ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n"
- ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n"
- ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n"
- ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
- ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n"
- ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n"
- ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
- ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
- ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
- ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n"
- ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n"
- ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n"
- "tbz %x[flags], #5, 55f\n"
- "and z2.d, z16.d, z0.d\n"
- "and z1.d, z17.d, z0.d\n"
- "and z7.d, z18.d, z0.d\n"
- "and z6.d, z19.d, z0.d\n"
- "and z5.d, z20.d, z0.d\n"
- "and z4.d, z21.d, z0.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
- "and z3.d, z22.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z16.s, z16.s, z2.s\n"
- "sqadd z17.s, z17.s, z1.s\n"
- "and z2.d, z23.d, z0.d\n"
- "and z1.d, z24.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z18.s, z18.s, z7.s\n"
- "sqadd z19.s, z19.s, z6.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
- "sqadd z20.s, z20.s, z5.s\n"
- "sqadd z21.s, z21.s, z4.s\n"
- "sqadd z22.s, z22.s, z3.s\n"
- "and z7.d, z25.d, z0.d\n"
- "sqadd z23.s, z23.s, z2.s\n"
- "sqadd z24.s, z24.s, z1.s\n"
- "and z6.d, z26.d, z0.d\n"
- "and z5.d, z27.d, z0.d\n"
- "and z4.d, z28.d, z0.d\n"
- "and z3.d, z29.d, z0.d\n"
- "and z2.d, z30.d, z0.d\n"
- "and z1.d, z31.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z3.s, z3.s, #0x1f\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
- "sqadd z25.s, z25.s, z7.s\n"
- "sqadd z26.s, z26.s, z6.s\n"
- "sqadd z27.s, z27.s, z5.s\n"
- "sqadd z28.s, z28.s, z4.s\n"
- "sqadd z29.s, z29.s, z3.s\n"
- "sqadd z30.s, z30.s, z2.s\n"
- "sqadd z31.s, z31.s, z1.s\n"
- "55:" // Height 4: no shift correction
- "add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "ld1rw { z2.s }, p2/Z, [x20]\n"
- ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z16.s, z16.s, z2.s\n"
- ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z2.s\n"
- "add z18.s, z18.s, z2.s\n"
- ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
- ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z19.s, z19.s, z2.s\n"
- "add z20.s, z20.s, z2.s\n"
- ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z2.s\n"
- ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
- ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
- "add z23.s, z23.s, z2.s\n"
- "add z24.s, z24.s, z2.s\n"
- ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "add x20, %x[qp], %[maxval]\n"
- "add z25.s, z25.s, z2.s\n"
- "add z26.s, z26.s, z2.s\n"
- "ld1rw { z1.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z2.s\n"
- "add z28.s, z28.s, z2.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z29.s, z29.s, z2.s\n"
- "add z30.s, z30.s, z2.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z31.s, z31.s, z2.s\n"
- "smin z16.s, p2/M, z16.s, z1.s\n"
- "smin z17.s, p2/M, z17.s, z1.s\n"
- "smin z18.s, p2/M, z18.s, z1.s\n"
- "smin z19.s, p2/M, z19.s, z1.s\n"
- "smin z20.s, p2/M, z20.s, z1.s\n"
- "smin z21.s, p2/M, z21.s, z1.s\n"
- "smin z22.s, p2/M, z22.s, z1.s\n"
- "smin z23.s, p2/M, z23.s, z1.s\n"
- "smin z24.s, p2/M, z24.s, z1.s\n"
- "smin z25.s, p2/M, z25.s, z1.s\n"
- "smin z26.s, p2/M, z26.s, z1.s\n"
- "smin z27.s, p2/M, z27.s, z1.s\n"
- "smin z28.s, p2/M, z28.s, z1.s\n"
- "smin z29.s, p2/M, z29.s, z1.s\n"
- "smin z30.s, p2/M, z30.s, z1.s\n"
- "smin z31.s, p2/M, z31.s, z1.s\n"
- "smax z16.s, p2/M, z16.s, z0.s\n"
- "smax z17.s, p2/M, z17.s, z0.s\n"
- "smax z18.s, p2/M, z18.s, z0.s\n"
- "smax z19.s, p2/M, z19.s, z0.s\n"
- "smax z20.s, p2/M, z20.s, z0.s\n"
- "smax z21.s, p2/M, z21.s, z0.s\n"
- "smax z22.s, p2/M, z22.s, z0.s\n"
- "smax z23.s, p2/M, z23.s, z0.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "smax z24.s, p2/M, z24.s, z0.s\n"
- "smax z25.s, p2/M, z25.s, z0.s\n"
- "uzp1 z18.h, z18.h, z19.h\n"
- "smax z26.s, p2/M, z26.s, z0.s\n"
- "smax z27.s, p2/M, z27.s, z0.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "smax z28.s, p2/M, z28.s, z0.s\n"
- "smax z29.s, p2/M, z29.s, z0.s\n"
- "uzp1 z17.h, z22.h, z23.h\n"
- "smax z30.s, p2/M, z30.s, z0.s\n"
- "smax z31.s, p2/M, z31.s, z0.s\n"
- "uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "uzp1 z18.h, z26.h, z27.h\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z20.b, z20.b, z17.b\n"
- "uzp1 z17.h, z30.h, z31.h\n"
- "st1b { z16.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "uzp1 z24.b, z24.b, z18.b\n"
- "uzp1 z28.b, z28.b, z17.b\n"
- "st1b { z20.b }, p1, [x24]\n"
- "st1b { z24.b }, p1, [x23]\n"
- "st1b { z28.b }, p1, [x22]\n"
- "56:" // Height 4: Writeback done
- "decw x9, ALL, MUL #4\n"
- "cmp x9, XZR\n"
- "bgt 44b\n"
- "subs %x[M], %x[M], #0x4\n"
- "beq 58f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 57f\n"
- "add x21, x21, #0x4\n"
- "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "b 1b\n"
- "57:" // Update direct input
- "mov x20, #0x4\n"
- "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
- "b 1b\n"
- "58:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL.hpp
deleted file mode 100644
index b073731751..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL.hpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../std_transforms_sve.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- unsigned int, const unsigned int *, \
- IndirectInputArg<uint8_t>, \
- size_t, size_t, \
- const int8_t *, \
- IndirectOutputArg<uint8_t>, \
- const Requantize32 *, const int32_t *, unsigned int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void sve_hybrid_u8s8qa_mmla_4x4VL( ARGLIST );
-
-class cls_sve_hybrid_u8s8qa_mmla_4x4VL
-{
-public:
- typedef uint8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
- typedef uint8_t result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 4;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<int32_t>() * 4;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 8;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 4, 8, 8> transforms = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
- if (std::is_same<T, uint8_t>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 47.37 };
- case CPUModel::A510:
- return { 20.88 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=sve_hybrid_u8s8qa_mmla_4x4VL;
- cls_sve_hybrid_u8s8qa_mmla_4x4VL(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL/generic.cpp
deleted file mode 100644
index 01bdac2967..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL/generic.cpp
+++ /dev/null
@@ -1,1418 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void sve_hybrid_u8s8qa_mmla_4x4VL (
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
- size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
- const Requantize32 *qp, const int32_t *col_bias, unsigned int
-)
-{
- struct KernelArgs {
- unsigned int num_strings = {};
- const unsigned int *string_lengths = {};
- size_t N = {};
- const int8_t *B_ptr = {};
- size_t output_offset = {};
- size_t input_initial_col = {};
- size_t input_offset = {};
- void *output_ptr = {};
- } ka;
-
- unsigned long flags=0;
- void *input_ptr;
-
- if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
- ka.output_offset=output_arg.indirect.offset;
- flags |= 0x4;
- } else {
- ka.output_ptr=(void *)(output_arg.direct.base);
- ka.output_offset=output_arg.direct.stride;
- }
-
- if (A_arg.is_indirect) {
- input_ptr=(void *)(A_arg.indirect.ptr);
- ka.input_offset=A_arg.indirect.start_row;
- ka.input_initial_col=A_arg.indirect.start_col;
- flags |= 0x8;
- } else {
- assert(num_strings==1);
- input_ptr=(void *)(A_arg.direct.base);
- ka.input_offset=A_arg.direct.stride;
- }
- ka.num_strings = num_strings;
- ka.string_lengths = string_lengths;
- ka.N = N;
- ka.B_ptr = B_ptr;
- if (qp->c_offset > qp->minval) {
- flags |= 0x20;
- }
- __asm__ __volatile__(
- "ptrue p2.b\n"
- "1:" // Row loop
- "cmp %x[M], #0x4\n"
- "bge 43f\n"
- "cmp %x[M], #0x2\n"
- "bgt 29f\n"
- "beq 15f\n"
- "mov x10, %x[col_bias]\n"
- "mov z11.s, #0x0\n"
- "mov z15.b, #0x1\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "2:" // Height 1: Column loop
- "mov x20, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "3:" // Height 1: setup done
- "mov x26, #0x0\n"
- "4:" // Height 1: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "cbnz x26, 6f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "b 6f\n"
- "5:" // Height 1: setup direct input
- "mov x24, %x[input_ptr]\n"
- "6:" // Height 1: input setup done
- "cmp x25, #0x10\n"
- "ble 9f\n"
- "7:" // Height 1: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z30.b }, p2/Z, [x28]\n"
- "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n"
- "add x24, x24, #0x10\n"
- "trn1 z0.d, z1.d, z31.d\n"
- ".inst 0x459e9810 // usmmla z16.s, z0.b, z30.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "trn2 z1.d, z1.d, z31.d\n"
- ".inst 0x459d9814 // usmmla z20.s, z0.b, z29.b\n"
- ".inst 0x459c9811 // usmmla z17.s, z0.b, z28.b\n"
- ".inst 0x459b9815 // usmmla z21.s, z0.b, z27.b\n"
- ".inst 0x459a9812 // usmmla z18.s, z0.b, z26.b\n"
- "ld1b { z31.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45999816 // usmmla z22.s, z0.b, z25.b\n"
- ".inst 0x45989813 // usmmla z19.s, z0.b, z24.b\n"
- "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x45889817 // usmmla z23.s, z0.b, z8.b\n"
- "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x459f9830 // usmmla z16.s, z1.b, z31.b\n"
- "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x459e9834 // usmmla z20.s, z1.b, z30.b\n"
- ".inst 0x459d9831 // usmmla z17.s, z1.b, z29.b\n"
- ".inst 0x459c9835 // usmmla z21.s, z1.b, z28.b\n"
- ".inst 0x459b9832 // usmmla z18.s, z1.b, z27.b\n"
- ".inst 0x459a9836 // usmmla z22.s, z1.b, z26.b\n"
- ".inst 0x45999833 // usmmla z19.s, z1.b, z25.b\n"
- ".inst 0x45989837 // usmmla z23.s, z1.b, z24.b\n"
- "tbnz %x[flags], #31, 8f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z11.s, z1.b, z15.b\n"
- "8:" // Height 1: Multiply loop: unique 1: skip row sum
- "sub x25, x25, #0x10\n"
- "cmp x25, #0x10\n"
- "bgt 7b\n"
- "9:" // Height 1: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z24.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "subs x25, x25, #0x8\n"
- "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
- "trn1 z0.d, z1.d, z31.d\n"
- ".inst 0x45989810 // usmmla z16.s, z0.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #8\n"
- "trn2 z1.d, z1.d, z31.d\n"
- ".inst 0x459e9814 // usmmla z20.s, z0.b, z30.b\n"
- ".inst 0x459d9811 // usmmla z17.s, z0.b, z29.b\n"
- ".inst 0x459c9815 // usmmla z21.s, z0.b, z28.b\n"
- ".inst 0x459b9812 // usmmla z18.s, z0.b, z27.b\n"
- ".inst 0x459a9816 // usmmla z22.s, z0.b, z26.b\n"
- ".inst 0x45999813 // usmmla z19.s, z0.b, z25.b\n"
- ".inst 0x45989817 // usmmla z23.s, z0.b, z24.b\n"
- "ble 10f\n"
- "ld1b { z24.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45989830 // usmmla z16.s, z1.b, z24.b\n"
- "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x459e9834 // usmmla z20.s, z1.b, z30.b\n"
- "addvl x28, x28, #8\n"
- ".inst 0x459d9831 // usmmla z17.s, z1.b, z29.b\n"
- ".inst 0x459c9835 // usmmla z21.s, z1.b, z28.b\n"
- ".inst 0x459b9832 // usmmla z18.s, z1.b, z27.b\n"
- ".inst 0x459a9836 // usmmla z22.s, z1.b, z26.b\n"
- ".inst 0x45999833 // usmmla z19.s, z1.b, z25.b\n"
- ".inst 0x45989837 // usmmla z23.s, z1.b, z24.b\n"
- "10:" // Height 1: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 11f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z11.s, z1.b, z15.b\n"
- "11:" // Height 1: Multiply loop: unique 2: skip row sum
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 4b\n"
- "uzp1 z16.d, z16.d, z20.d\n"
- "uzp1 z17.d, z17.d, z21.d\n"
- "uzp1 z18.d, z18.d, z22.d\n"
- "uzp1 z19.d, z19.d, z23.d\n"
- "mov z23.d, z16.d\n"
- "tbnz %x[flags], #31, 12f\n"
- "add x20, %x[qp], %[b_offset]\n"
- ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
- "ld1rw { z9.s }, p2/Z, [x20]\n"
- "neg z9.s, p2/M, z9.s\n"
- "mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z9.s\n"
- "12:" // Height 1: skip row sum fixup
- "add z23.s, z23.s, z11.s\n"
- "add z17.s, z17.s, z11.s\n"
- "ld1w { z22.s }, p2/Z, [x10]\n"
- "ld1w { z24.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add z18.s, z18.s, z11.s\n"
- "add z19.s, z19.s, z11.s\n"
- "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "add z23.s, z23.s, z22.s\n"
- "add z17.s, z17.s, z24.s\n"
- "ld1rw { z16.s }, p2/Z, [x20]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "addvl x10, x10, #4\n"
- "add z18.s, z18.s, z21.s\n"
- "add z19.s, z19.s, z20.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
- ".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n"
- ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
- ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
- "tbz %x[flags], #5, 13f\n"
- "and z22.d, z23.d, z0.d\n"
- "and z21.d, z17.d, z0.d\n"
- "and z20.d, z18.d, z0.d\n"
- "and z16.d, z19.d, z0.d\n"
- "asr z22.s, z22.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "asr z20.s, z20.s, #0x1f\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sqadd z23.s, z23.s, z22.s\n"
- "sqadd z17.s, z17.s, z21.s\n"
- "sqadd z18.s, z18.s, z20.s\n"
- "sqadd z19.s, z19.s, z16.s\n"
- "13:" // Height 1: no shift correction
- "add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "ld1rw { z21.s }, p2/Z, [x20]\n"
- ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "add x20, %x[qp], %[maxval]\n"
- "ld1rw { z20.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z21.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z21.s\n"
- "add z18.s, z18.s, z21.s\n"
- "ld1rw { z16.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z21.s\n"
- "smin z23.s, p2/M, z23.s, z20.s\n"
- "smin z17.s, p2/M, z17.s, z20.s\n"
- "smin z18.s, p2/M, z18.s, z20.s\n"
- "smin z19.s, p2/M, z19.s, z20.s\n"
- "smax z23.s, p2/M, z23.s, z16.s\n"
- "smax z17.s, p2/M, z17.s, z16.s\n"
- "smax z18.s, p2/M, z18.s, z16.s\n"
- "smax z19.s, p2/M, z19.s, z16.s\n"
- "uzp1 z23.h, z23.h, z17.h\n"
- "uzp1 z16.h, z18.h, z19.h\n"
- "uzp1 z23.b, z23.b, z16.b\n"
- "st1b { z23.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "14:" // Height 1: Writeback done
- "decw x9, ALL, MUL #4\n"
- "cmp x9, XZR\n"
- "bgt 2b\n"
- "b 58f\n"
- "15:" // Height 2
- "mov x10, %x[col_bias]\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "mov z15.b, #0x1\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "16:" // Height 2: Column loop
- "mov x20, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "17:" // Height 2: setup done
- "mov x26, #0x0\n"
- "18:" // Height 2: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 19f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "cbnz x26, 20f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "b 20f\n"
- "19:" // Height 2: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "20:" // Height 2: input setup done
- "cmp x25, #0x10\n"
- "ble 23f\n"
- "21:" // Height 2: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z31.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z25.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n"
- "trn1 z0.d, z1.d, z25.d\n"
- "trn2 z1.d, z1.d, z25.d\n"
- ".inst 0x459f9810 // usmmla z16.s, z0.b, z31.b\n"
- "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- ".inst 0x459e9814 // usmmla z20.s, z0.b, z30.b\n"
- ".inst 0x459d9811 // usmmla z17.s, z0.b, z29.b\n"
- ".inst 0x459c9815 // usmmla z21.s, z0.b, z28.b\n"
- ".inst 0x459b9812 // usmmla z18.s, z0.b, z27.b\n"
- ".inst 0x459a9816 // usmmla z22.s, z0.b, z26.b\n"
- ".inst 0x45989813 // usmmla z19.s, z0.b, z24.b\n"
- "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45999817 // usmmla z23.s, z0.b, z25.b\n"
- "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x45989830 // usmmla z16.s, z1.b, z24.b\n"
- "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x459e9834 // usmmla z20.s, z1.b, z30.b\n"
- ".inst 0x459d9831 // usmmla z17.s, z1.b, z29.b\n"
- ".inst 0x459c9835 // usmmla z21.s, z1.b, z28.b\n"
- ".inst 0x459b9832 // usmmla z18.s, z1.b, z27.b\n"
- ".inst 0x459a9836 // usmmla z22.s, z1.b, z26.b\n"
- ".inst 0x45999833 // usmmla z19.s, z1.b, z25.b\n"
- ".inst 0x45989837 // usmmla z23.s, z1.b, z24.b\n"
- "tbnz %x[flags], #31, 22f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z11.s, z1.b, z15.b\n"
- "22:" // Height 2: Multiply loop: unique 3: skip row sum
- "sub x25, x25, #0x10\n"
- "cmp x25, #0x10\n"
- "bgt 21b\n"
- "23:" // Height 2: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z29.b }, p2/Z, [x28]\n"
- "ld1b { z28.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x8\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
- "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
- "trn1 z0.d, z1.d, z24.d\n"
- "trn2 z1.d, z1.d, z24.d\n"
- ".inst 0x459d9810 // usmmla z16.s, z0.b, z29.b\n"
- "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #8\n"
- ".inst 0x459c9814 // usmmla z20.s, z0.b, z28.b\n"
- ".inst 0x45849811 // usmmla z17.s, z0.b, z4.b\n"
- ".inst 0x459b9815 // usmmla z21.s, z0.b, z27.b\n"
- ".inst 0x459a9812 // usmmla z18.s, z0.b, z26.b\n"
- ".inst 0x45869816 // usmmla z22.s, z0.b, z6.b\n"
- ".inst 0x45999813 // usmmla z19.s, z0.b, z25.b\n"
- ".inst 0x45989817 // usmmla z23.s, z0.b, z24.b\n"
- "ble 24f\n"
- "ld1b { z24.b }, p2/Z, [x28]\n"
- "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45989830 // usmmla z16.s, z1.b, z24.b\n"
- "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x459e9834 // usmmla z20.s, z1.b, z30.b\n"
- "addvl x28, x28, #8\n"
- ".inst 0x459d9831 // usmmla z17.s, z1.b, z29.b\n"
- ".inst 0x459c9835 // usmmla z21.s, z1.b, z28.b\n"
- ".inst 0x459b9832 // usmmla z18.s, z1.b, z27.b\n"
- ".inst 0x459a9836 // usmmla z22.s, z1.b, z26.b\n"
- ".inst 0x45999833 // usmmla z19.s, z1.b, z25.b\n"
- ".inst 0x45989837 // usmmla z23.s, z1.b, z24.b\n"
- "24:" // Height 2: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 25f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z11.s, z1.b, z15.b\n"
- "25:" // Height 2: Multiply loop: unique 4: skip row sum
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 18b\n"
- "uzp1 z24.d, z16.d, z20.d\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp2 z16.d, z16.d, z20.d\n"
- "uzp1 z20.d, z17.d, z21.d\n"
- "uzp2 z17.d, z17.d, z21.d\n"
- "uzp1 z21.d, z18.d, z22.d\n"
- "uzp2 z18.d, z18.d, z22.d\n"
- "uzp1 z22.d, z19.d, z23.d\n"
- "uzp2 z19.d, z19.d, z23.d\n"
- "add x23, x27, x20\n"
- "mov z23.d, z24.d\n"
- "tbnz %x[flags], #31, 26f\n"
- "add x20, %x[qp], %[b_offset]\n"
- ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
- "ld1rw { z24.s }, p2/Z, [x20]\n"
- "neg z24.s, p2/M, z24.s\n"
- "mov z12.s, z11.s[3]\n"
- "mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z24.s\n"
- "mul z12.s, p2/M, z12.s, z24.s\n"
- "26:" // Height 2: skip row sum fixup
- "add z23.s, z23.s, z11.s\n"
- "add z20.s, z20.s, z11.s\n"
- "ld1w { z28.s }, p2/Z, [x10]\n"
- "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add z21.s, z21.s, z11.s\n"
- "add z22.s, z22.s, z11.s\n"
- "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
- "add z16.s, z16.s, z12.s\n"
- "add z17.s, z17.s, z12.s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "add z18.s, z18.s, z12.s\n"
- "add z19.s, z19.s, z12.s\n"
- "ld1rw { z24.s }, p2/Z, [x20]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add z23.s, z23.s, z28.s\n"
- "add z20.s, z20.s, z27.s\n"
- "addvl x10, x10, #4\n"
- "add z21.s, z21.s, z26.s\n"
- "add z22.s, z22.s, z25.s\n"
- "add z16.s, z16.s, z28.s\n"
- "add z17.s, z17.s, z27.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z18.s, z18.s, z26.s\n"
- "add z19.s, z19.s, z25.s\n"
- ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
- ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n"
- ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
- ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n"
- ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
- ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
- ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
- ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
- "tbz %x[flags], #5, 27f\n"
- "and z24.d, z23.d, z0.d\n"
- "and z30.d, z20.d, z0.d\n"
- "and z29.d, z21.d, z0.d\n"
- "and z28.d, z22.d, z0.d\n"
- "and z27.d, z16.d, z0.d\n"
- "and z26.d, z17.d, z0.d\n"
- "asr z24.s, z24.s, #0x1f\n"
- "and z25.d, z18.d, z0.d\n"
- "asr z30.s, z30.s, #0x1f\n"
- "asr z29.s, z29.s, #0x1f\n"
- "asr z28.s, z28.s, #0x1f\n"
- "asr z27.s, z27.s, #0x1f\n"
- "sqadd z23.s, z23.s, z24.s\n"
- "and z24.d, z19.d, z0.d\n"
- "asr z26.s, z26.s, #0x1f\n"
- "asr z25.s, z25.s, #0x1f\n"
- "sqadd z20.s, z20.s, z30.s\n"
- "sqadd z21.s, z21.s, z29.s\n"
- "asr z24.s, z24.s, #0x1f\n"
- "sqadd z22.s, z22.s, z28.s\n"
- "sqadd z16.s, z16.s, z27.s\n"
- "sqadd z17.s, z17.s, z26.s\n"
- "sqadd z18.s, z18.s, z25.s\n"
- "sqadd z19.s, z19.s, z24.s\n"
- "27:" // Height 2: no shift correction
- "add x20, %x[qp], %[c_offset]\n"
- ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "ld1rw { z26.s }, p2/Z, [x20]\n"
- ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z23.s, z23.s, z26.s\n"
- ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "add x20, %x[qp], %[maxval]\n"
- "add z20.s, z20.s, z26.s\n"
- "add z21.s, z21.s, z26.s\n"
- "ld1rw { z25.s }, p2/Z, [x20]\n"
- "add z22.s, z22.s, z26.s\n"
- "add z16.s, z16.s, z26.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z17.s, z17.s, z26.s\n"
- "add z18.s, z18.s, z26.s\n"
- "ld1rw { z24.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z26.s\n"
- "smin z23.s, p2/M, z23.s, z25.s\n"
- "smin z20.s, p2/M, z20.s, z25.s\n"
- "smin z21.s, p2/M, z21.s, z25.s\n"
- "smin z22.s, p2/M, z22.s, z25.s\n"
- "smin z16.s, p2/M, z16.s, z25.s\n"
- "smin z17.s, p2/M, z17.s, z25.s\n"
- "smin z18.s, p2/M, z18.s, z25.s\n"
- "smin z19.s, p2/M, z19.s, z25.s\n"
- "smax z23.s, p2/M, z23.s, z24.s\n"
- "smax z20.s, p2/M, z20.s, z24.s\n"
- "smax z21.s, p2/M, z21.s, z24.s\n"
- "smax z22.s, p2/M, z22.s, z24.s\n"
- "smax z16.s, p2/M, z16.s, z24.s\n"
- "smax z17.s, p2/M, z17.s, z24.s\n"
- "smax z18.s, p2/M, z18.s, z24.s\n"
- "smax z19.s, p2/M, z19.s, z24.s\n"
- "uzp1 z23.h, z23.h, z20.h\n"
- "uzp1 z20.h, z21.h, z22.h\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "uzp1 z23.b, z23.b, z20.b\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z23.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "st1b { z16.b }, p1, [x23]\n"
- "28:" // Height 2: Writeback done
- "decw x9, ALL, MUL #4\n"
- "cmp x9, XZR\n"
- "bgt 16b\n"
- "b 58f\n"
- "29:" // Height 3
- "mov x10, %x[col_bias]\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "mov z13.s, #0x0\n"
- "mov z15.b, #0x1\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "30:" // Height 3: Column loop
- "mov x20, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "mov z24.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "mov z27.s, #0x0\n"
- "mov z28.s, #0x0\n"
- "mov z29.s, #0x0\n"
- "mov z30.s, #0x0\n"
- "mov z31.s, #0x0\n"
- "31:" // Height 3: setup done
- "mov x26, #0x0\n"
- "32:" // Height 3: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 33f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "cbnz x26, 34f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "b 34f\n"
- "33:" // Height 3: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "34:" // Height 3: input setup done
- "cmp x25, #0x10\n"
- "ble 37f\n"
- "35:" // Height 3: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x22, x22, #0x10\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z6.d\n"
- "trn2 z3.d, z3.d, z6.d\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45859810 // usmmla z16.s, z0.b, z5.b\n"
- ".inst 0x458a9814 // usmmla z20.s, z0.b, z10.b\n"
- ".inst 0x45899811 // usmmla z17.s, z0.b, z9.b\n"
- ".inst 0x45889815 // usmmla z21.s, z0.b, z8.b\n"
- ".inst 0x45849812 // usmmla z18.s, z0.b, z4.b\n"
- ".inst 0x45859858 // usmmla z24.s, z2.b, z5.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- ".inst 0x458a985c // usmmla z28.s, z2.b, z10.b\n"
- ".inst 0x45899859 // usmmla z25.s, z2.b, z9.b\n"
- ".inst 0x4588985d // usmmla z29.s, z2.b, z8.b\n"
- ".inst 0x4584985a // usmmla z26.s, z2.b, z4.b\n"
- ".inst 0x45879816 // usmmla z22.s, z0.b, z7.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x4587985e // usmmla z30.s, z2.b, z7.b\n"
- ".inst 0x45869813 // usmmla z19.s, z0.b, z6.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x4586985b // usmmla z27.s, z2.b, z6.b\n"
- ".inst 0x45859817 // usmmla z23.s, z0.b, z5.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x4585985f // usmmla z31.s, z2.b, z5.b\n"
- ".inst 0x45849830 // usmmla z16.s, z1.b, z4.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
- ".inst 0x45849878 // usmmla z24.s, z3.b, z4.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x458a9834 // usmmla z20.s, z1.b, z10.b\n"
- ".inst 0x458a987c // usmmla z28.s, z3.b, z10.b\n"
- ".inst 0x45899831 // usmmla z17.s, z1.b, z9.b\n"
- ".inst 0x45899879 // usmmla z25.s, z3.b, z9.b\n"
- ".inst 0x45889835 // usmmla z21.s, z1.b, z8.b\n"
- ".inst 0x4588987d // usmmla z29.s, z3.b, z8.b\n"
- ".inst 0x45879832 // usmmla z18.s, z1.b, z7.b\n"
- ".inst 0x4587987a // usmmla z26.s, z3.b, z7.b\n"
- ".inst 0x45869836 // usmmla z22.s, z1.b, z6.b\n"
- ".inst 0x4586987e // usmmla z30.s, z3.b, z6.b\n"
- ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n"
- ".inst 0x4585987b // usmmla z27.s, z3.b, z5.b\n"
- ".inst 0x45849837 // usmmla z23.s, z1.b, z4.b\n"
- ".inst 0x4584987f // usmmla z31.s, z3.b, z4.b\n"
- "tbnz %x[flags], #31, 36f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z13.s, z2.b, z15.b\n"
- "udot z11.s, z1.b, z15.b\n"
- "udot z13.s, z3.b, z15.b\n"
- "36:" // Height 3: Multiply loop: unique 5: skip row sum
- "sub x25, x25, #0x10\n"
- "cmp x25, #0x10\n"
- "bgt 35b\n"
- "37:" // Height 3: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "subs x25, x25, #0x8\n"
- "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z5.d\n"
- "trn2 z3.d, z3.d, z5.d\n"
- "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45849810 // usmmla z16.s, z0.b, z4.b\n"
- ".inst 0x458a9814 // usmmla z20.s, z0.b, z10.b\n"
- ".inst 0x45899811 // usmmla z17.s, z0.b, z9.b\n"
- ".inst 0x45889815 // usmmla z21.s, z0.b, z8.b\n"
- ".inst 0x45879812 // usmmla z18.s, z0.b, z7.b\n"
- ".inst 0x45849858 // usmmla z24.s, z2.b, z4.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x458a985c // usmmla z28.s, z2.b, z10.b\n"
- "addvl x28, x28, #8\n"
- ".inst 0x45899859 // usmmla z25.s, z2.b, z9.b\n"
- ".inst 0x4588985d // usmmla z29.s, z2.b, z8.b\n"
- ".inst 0x4587985a // usmmla z26.s, z2.b, z7.b\n"
- ".inst 0x45869816 // usmmla z22.s, z0.b, z6.b\n"
- ".inst 0x4586985e // usmmla z30.s, z2.b, z6.b\n"
- ".inst 0x45859813 // usmmla z19.s, z0.b, z5.b\n"
- ".inst 0x4585985b // usmmla z27.s, z2.b, z5.b\n"
- ".inst 0x45849817 // usmmla z23.s, z0.b, z4.b\n"
- ".inst 0x4584985f // usmmla z31.s, z2.b, z4.b\n"
- "ble 38f\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45849830 // usmmla z16.s, z1.b, z4.b\n"
- ".inst 0x45849878 // usmmla z24.s, z3.b, z4.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x458a9834 // usmmla z20.s, z1.b, z10.b\n"
- ".inst 0x458a987c // usmmla z28.s, z3.b, z10.b\n"
- ".inst 0x45899831 // usmmla z17.s, z1.b, z9.b\n"
- ".inst 0x45899879 // usmmla z25.s, z3.b, z9.b\n"
- "addvl x28, x28, #8\n"
- ".inst 0x45889835 // usmmla z21.s, z1.b, z8.b\n"
- ".inst 0x4588987d // usmmla z29.s, z3.b, z8.b\n"
- ".inst 0x45879832 // usmmla z18.s, z1.b, z7.b\n"
- ".inst 0x4587987a // usmmla z26.s, z3.b, z7.b\n"
- ".inst 0x45869836 // usmmla z22.s, z1.b, z6.b\n"
- ".inst 0x4586987e // usmmla z30.s, z3.b, z6.b\n"
- ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n"
- ".inst 0x4585987b // usmmla z27.s, z3.b, z5.b\n"
- ".inst 0x45849837 // usmmla z23.s, z1.b, z4.b\n"
- ".inst 0x4584987f // usmmla z31.s, z3.b, z4.b\n"
- "38:" // Height 3: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 39f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z13.s, z2.b, z15.b\n"
- "udot z11.s, z1.b, z15.b\n"
- "udot z13.s, z3.b, z15.b\n"
- "39:" // Height 3: Multiply loop: unique 6: skip row sum
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 32b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z0.d, z16.d, z20.d\n"
- "uzp2 z16.d, z16.d, z20.d\n"
- "uzp1 z20.d, z17.d, z21.d\n"
- "uzp2 z17.d, z17.d, z21.d\n"
- "uzp1 z21.d, z18.d, z22.d\n"
- "uzp2 z18.d, z18.d, z22.d\n"
- "add x23, x27, x20\n"
- "uzp1 z22.d, z19.d, z23.d\n"
- "uzp2 z19.d, z19.d, z23.d\n"
- "add x22, x23, x20\n"
- "uzp1 z24.d, z24.d, z28.d\n"
- "uzp1 z25.d, z25.d, z29.d\n"
- "uzp1 z26.d, z26.d, z30.d\n"
- "uzp1 z27.d, z27.d, z31.d\n"
- "mov z31.d, z0.d\n"
- "tbnz %x[flags], #31, 40f\n"
- "add x20, %x[qp], %[b_offset]\n"
- ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
- ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
- "ld1rw { z23.s }, p2/Z, [x20]\n"
- "neg z23.s, p2/M, z23.s\n"
- "mov z12.s, z11.s[3]\n"
- "mov z11.s, z11.s[0]\n"
- "mov z13.s, z13.s[0]\n"
- "mul z11.s, p2/M, z11.s, z23.s\n"
- "mul z12.s, p2/M, z12.s, z23.s\n"
- "mul z13.s, p2/M, z13.s, z23.s\n"
- "40:" // Height 3: skip row sum fixup
- "add z31.s, z31.s, z11.s\n"
- "add z20.s, z20.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add z21.s, z21.s, z11.s\n"
- "add z22.s, z22.s, z11.s\n"
- "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n"
- "add z16.s, z16.s, z12.s\n"
- "add z17.s, z17.s, z12.s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "add z18.s, z18.s, z12.s\n"
- "add z19.s, z19.s, z12.s\n"
- "ld1rw { z23.s }, p2/Z, [x20]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add z24.s, z24.s, z13.s\n"
- "add z25.s, z25.s, z13.s\n"
- "addvl x10, x10, #4\n"
- "add z26.s, z26.s, z13.s\n"
- "add z27.s, z27.s, z13.s\n"
- "add z31.s, z31.s, z0.s\n"
- "add z20.s, z20.s, z30.s\n"
- "add z21.s, z21.s, z29.s\n"
- "add z22.s, z22.s, z28.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z30.s\n"
- "add z18.s, z18.s, z29.s\n"
- "add z19.s, z19.s, z28.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z30.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z26.s, z26.s, z29.s\n"
- "add z27.s, z27.s, z28.s\n"
- ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
- ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
- ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
- ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
- ".inst 0x04b77610 // sqrdmulh z16.s, z16.s, z23.s\n"
- ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n"
- ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n"
- ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
- ".inst 0x04b77718 // sqrdmulh z24.s, z24.s, z23.s\n"
- ".inst 0x04b77739 // sqrdmulh z25.s, z25.s, z23.s\n"
- ".inst 0x04b7775a // sqrdmulh z26.s, z26.s, z23.s\n"
- ".inst 0x04b7777b // sqrdmulh z27.s, z27.s, z23.s\n"
- "tbz %x[flags], #5, 41f\n"
- "and z1.d, z31.d, z0.d\n"
- "and z30.d, z20.d, z0.d\n"
- "and z29.d, z21.d, z0.d\n"
- "and z28.d, z22.d, z0.d\n"
- "and z23.d, z16.d, z0.d\n"
- "and z3.d, z17.d, z0.d\n"
- "asr z1.s, z1.s, #0x1f\n"
- "asr z30.s, z30.s, #0x1f\n"
- "asr z29.s, z29.s, #0x1f\n"
- "asr z28.s, z28.s, #0x1f\n"
- "asr z23.s, z23.s, #0x1f\n"
- "and z2.d, z18.d, z0.d\n"
- "sqadd z31.s, z31.s, z1.s\n"
- "sqadd z20.s, z20.s, z30.s\n"
- "sqadd z21.s, z21.s, z29.s\n"
- "sqadd z22.s, z22.s, z28.s\n"
- "sqadd z16.s, z16.s, z23.s\n"
- "and z1.d, z19.d, z0.d\n"
- "and z30.d, z24.d, z0.d\n"
- "and z29.d, z25.d, z0.d\n"
- "and z28.d, z26.d, z0.d\n"
- "and z23.d, z27.d, z0.d\n"
- "asr z3.s, z3.s, #0x1f\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
- "asr z30.s, z30.s, #0x1f\n"
- "asr z29.s, z29.s, #0x1f\n"
- "asr z28.s, z28.s, #0x1f\n"
- "asr z23.s, z23.s, #0x1f\n"
- "sqadd z17.s, z17.s, z3.s\n"
- "sqadd z18.s, z18.s, z2.s\n"
- "sqadd z19.s, z19.s, z1.s\n"
- "sqadd z24.s, z24.s, z30.s\n"
- "sqadd z25.s, z25.s, z29.s\n"
- "sqadd z26.s, z26.s, z28.s\n"
- "sqadd z27.s, z27.s, z23.s\n"
- "41:" // Height 3: no shift correction
- "add x20, %x[qp], %[c_offset]\n"
- ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "ld1rw { z29.s }, p2/Z, [x20]\n"
- ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z31.s, z31.s, z29.s\n"
- ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z20.s, z20.s, z29.s\n"
- "add z21.s, z21.s, z29.s\n"
- ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
- ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z22.s, z22.s, z29.s\n"
- "add z16.s, z16.s, z29.s\n"
- ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "add x20, %x[qp], %[maxval]\n"
- "add z17.s, z17.s, z29.s\n"
- "add z18.s, z18.s, z29.s\n"
- "ld1rw { z28.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z29.s\n"
- "add z24.s, z24.s, z29.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z29.s\n"
- "add z26.s, z26.s, z29.s\n"
- "ld1rw { z23.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z29.s\n"
- "smin z31.s, p2/M, z31.s, z28.s\n"
- "smin z20.s, p2/M, z20.s, z28.s\n"
- "smin z21.s, p2/M, z21.s, z28.s\n"
- "smin z22.s, p2/M, z22.s, z28.s\n"
- "smin z16.s, p2/M, z16.s, z28.s\n"
- "smin z17.s, p2/M, z17.s, z28.s\n"
- "smin z18.s, p2/M, z18.s, z28.s\n"
- "smin z19.s, p2/M, z19.s, z28.s\n"
- "smin z24.s, p2/M, z24.s, z28.s\n"
- "smin z25.s, p2/M, z25.s, z28.s\n"
- "smin z26.s, p2/M, z26.s, z28.s\n"
- "smin z27.s, p2/M, z27.s, z28.s\n"
- "smax z31.s, p2/M, z31.s, z23.s\n"
- "smax z20.s, p2/M, z20.s, z23.s\n"
- "smax z21.s, p2/M, z21.s, z23.s\n"
- "smax z22.s, p2/M, z22.s, z23.s\n"
- "smax z16.s, p2/M, z16.s, z23.s\n"
- "smax z17.s, p2/M, z17.s, z23.s\n"
- "smax z18.s, p2/M, z18.s, z23.s\n"
- "smax z19.s, p2/M, z19.s, z23.s\n"
- "uzp1 z31.h, z31.h, z20.h\n"
- "smax z24.s, p2/M, z24.s, z23.s\n"
- "smax z25.s, p2/M, z25.s, z23.s\n"
- "uzp1 z20.h, z21.h, z22.h\n"
- "smax z26.s, p2/M, z26.s, z23.s\n"
- "smax z27.s, p2/M, z27.s, z23.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "uzp1 z18.h, z18.h, z19.h\n"
- "uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z31.b, z31.b, z20.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "uzp1 z16.b, z16.b, z18.b\n"
- "st1b { z31.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "uzp1 z24.b, z24.b, z17.b\n"
- "st1b { z16.b }, p1, [x23]\n"
- "st1b { z24.b }, p1, [x22]\n"
- "42:" // Height 3: Writeback done
- "decw x9, ALL, MUL #4\n"
- "cmp x9, XZR\n"
- "bgt 30b\n"
- "b 58f\n"
- "43:" // Height 4
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "mov x20, #0x4\n"
- "mov x10, %x[col_bias]\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov z13.s, #0x0\n"
- "mov z14.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x27\n"
- "mov z15.b, #0x1\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "44:" // Height 4: Column loop
- "mov x20, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "whilelt p1.b, x20, x9\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "mov z24.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "mov z27.s, #0x0\n"
- "mov z28.s, #0x0\n"
- "mov z29.s, #0x0\n"
- "mov z30.s, #0x0\n"
- "mov z31.s, #0x0\n"
- "45:" // Height 4: setup done
- "mov x26, #0x0\n"
- "46:" // Height 4: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w25, [x20, x26, LSL #0x2]\n"
- "tbz %x[flags], #3, 47f\n"
- "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x24, [x20, #0x0]\n"
- "ldr x23, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "ldr x21, [x20, #0x18]\n"
- "cbnz x26, 48f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "add x21, x21, x20\n"
- "b 48f\n"
- "47:" // Height 4: setup direct input
- "mov x24, %x[input_ptr]\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "add x21, x22, x21\n"
- "48:" // Height 4: input setup done
- "cmp x25, #0x10\n"
- "ble 51f\n"
- "49:" // Height 4: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "ld1rqb { z6.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "add x21, x21, #0x10\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z6.d\n"
- "trn2 z3.d, z3.d, z6.d\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45859810 // usmmla z16.s, z0.b, z5.b\n"
- ".inst 0x45849814 // usmmla z20.s, z0.b, z4.b\n"
- ".inst 0x45879811 // usmmla z17.s, z0.b, z7.b\n"
- ".inst 0x45899815 // usmmla z21.s, z0.b, z9.b\n"
- ".inst 0x45889812 // usmmla z18.s, z0.b, z8.b\n"
- ".inst 0x45859858 // usmmla z24.s, z2.b, z5.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- ".inst 0x4584985c // usmmla z28.s, z2.b, z4.b\n"
- ".inst 0x45879859 // usmmla z25.s, z2.b, z7.b\n"
- ".inst 0x4589985d // usmmla z29.s, z2.b, z9.b\n"
- ".inst 0x4588985a // usmmla z26.s, z2.b, z8.b\n"
- ".inst 0x458a9816 // usmmla z22.s, z0.b, z10.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x458a985e // usmmla z30.s, z2.b, z10.b\n"
- ".inst 0x45869813 // usmmla z19.s, z0.b, z6.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x4586985b // usmmla z27.s, z2.b, z6.b\n"
- ".inst 0x45859817 // usmmla z23.s, z0.b, z5.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x4585985f // usmmla z31.s, z2.b, z5.b\n"
- ".inst 0x45849830 // usmmla z16.s, z1.b, z4.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
- ".inst 0x45849878 // usmmla z24.s, z3.b, z4.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x458a9834 // usmmla z20.s, z1.b, z10.b\n"
- ".inst 0x458a987c // usmmla z28.s, z3.b, z10.b\n"
- ".inst 0x45899831 // usmmla z17.s, z1.b, z9.b\n"
- ".inst 0x45899879 // usmmla z25.s, z3.b, z9.b\n"
- ".inst 0x45889835 // usmmla z21.s, z1.b, z8.b\n"
- ".inst 0x4588987d // usmmla z29.s, z3.b, z8.b\n"
- ".inst 0x45879832 // usmmla z18.s, z1.b, z7.b\n"
- ".inst 0x4587987a // usmmla z26.s, z3.b, z7.b\n"
- ".inst 0x45869836 // usmmla z22.s, z1.b, z6.b\n"
- ".inst 0x4586987e // usmmla z30.s, z3.b, z6.b\n"
- ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n"
- ".inst 0x4585987b // usmmla z27.s, z3.b, z5.b\n"
- ".inst 0x45849837 // usmmla z23.s, z1.b, z4.b\n"
- ".inst 0x4584987f // usmmla z31.s, z3.b, z4.b\n"
- "tbnz %x[flags], #31, 50f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z13.s, z2.b, z15.b\n"
- "udot z11.s, z1.b, z15.b\n"
- "udot z13.s, z3.b, z15.b\n"
- "50:" // Height 4: Multiply loop: unique 7: skip row sum
- "sub x25, x25, #0x10\n"
- "cmp x25, #0x10\n"
- "bgt 49b\n"
- "51:" // Height 4: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x25\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x25, x25, #0x8\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "ld1rqb { z5.b }, p0/Z, [x21]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z5.d\n"
- "trn2 z3.d, z3.d, z5.d\n"
- "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45869810 // usmmla z16.s, z0.b, z6.b\n"
- ".inst 0x45849814 // usmmla z20.s, z0.b, z4.b\n"
- ".inst 0x45879811 // usmmla z17.s, z0.b, z7.b\n"
- ".inst 0x45899815 // usmmla z21.s, z0.b, z9.b\n"
- ".inst 0x45889812 // usmmla z18.s, z0.b, z8.b\n"
- ".inst 0x45869858 // usmmla z24.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x4584985c // usmmla z28.s, z2.b, z4.b\n"
- "addvl x28, x28, #8\n"
- ".inst 0x45879859 // usmmla z25.s, z2.b, z7.b\n"
- ".inst 0x4589985d // usmmla z29.s, z2.b, z9.b\n"
- ".inst 0x4588985a // usmmla z26.s, z2.b, z8.b\n"
- ".inst 0x458a9816 // usmmla z22.s, z0.b, z10.b\n"
- ".inst 0x458a985e // usmmla z30.s, z2.b, z10.b\n"
- ".inst 0x45859813 // usmmla z19.s, z0.b, z5.b\n"
- ".inst 0x4585985b // usmmla z27.s, z2.b, z5.b\n"
- ".inst 0x45869817 // usmmla z23.s, z0.b, z6.b\n"
- ".inst 0x4586985f // usmmla z31.s, z2.b, z6.b\n"
- "ble 52f\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45849830 // usmmla z16.s, z1.b, z4.b\n"
- ".inst 0x45849878 // usmmla z24.s, z3.b, z4.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x458a9834 // usmmla z20.s, z1.b, z10.b\n"
- ".inst 0x458a987c // usmmla z28.s, z3.b, z10.b\n"
- ".inst 0x45899831 // usmmla z17.s, z1.b, z9.b\n"
- ".inst 0x45899879 // usmmla z25.s, z3.b, z9.b\n"
- "addvl x28, x28, #8\n"
- ".inst 0x45889835 // usmmla z21.s, z1.b, z8.b\n"
- ".inst 0x4588987d // usmmla z29.s, z3.b, z8.b\n"
- ".inst 0x45879832 // usmmla z18.s, z1.b, z7.b\n"
- ".inst 0x4587987a // usmmla z26.s, z3.b, z7.b\n"
- ".inst 0x45869836 // usmmla z22.s, z1.b, z6.b\n"
- ".inst 0x4586987e // usmmla z30.s, z3.b, z6.b\n"
- ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n"
- ".inst 0x4585987b // usmmla z27.s, z3.b, z5.b\n"
- ".inst 0x45849837 // usmmla z23.s, z1.b, z4.b\n"
- ".inst 0x4584987f // usmmla z31.s, z3.b, z4.b\n"
- "52:" // Height 4: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 53f\n"
- "udot z11.s, z0.b, z15.b\n"
- "udot z13.s, z2.b, z15.b\n"
- "udot z11.s, z1.b, z15.b\n"
- "udot z13.s, z3.b, z15.b\n"
- "53:" // Height 4: Multiply loop: unique 8: skip row sum
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x26, x26, #0x1\n"
- "cmp x26, x20\n"
- "bne 46b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z0.d, z16.d, z20.d\n"
- "uzp2 z16.d, z16.d, z20.d\n"
- "uzp1 z20.d, z17.d, z21.d\n"
- "uzp2 z17.d, z17.d, z21.d\n"
- "uzp1 z21.d, z18.d, z22.d\n"
- "uzp2 z18.d, z18.d, z22.d\n"
- "add x23, x27, x20\n"
- "add x22, x23, x20\n"
- "uzp1 z22.d, z19.d, z23.d\n"
- "uzp2 z19.d, z19.d, z23.d\n"
- "add x21, x22, x20\n"
- "uzp1 z23.d, z24.d, z28.d\n"
- "uzp2 z24.d, z24.d, z28.d\n"
- "uzp1 z28.d, z25.d, z29.d\n"
- "uzp2 z25.d, z25.d, z29.d\n"
- "uzp1 z29.d, z26.d, z30.d\n"
- "uzp2 z26.d, z26.d, z30.d\n"
- "uzp1 z30.d, z27.d, z31.d\n"
- "uzp2 z27.d, z27.d, z31.d\n"
- "mov z31.d, z0.d\n"
- "tbnz %x[flags], #31, 54f\n"
- "add x20, %x[qp], %[b_offset]\n"
- ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
- ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- "neg z0.s, p2/M, z0.s\n"
- "mov z12.s, z11.s[3]\n"
- "mov z11.s, z11.s[0]\n"
- "mov z14.s, z13.s[3]\n"
- "mov z13.s, z13.s[0]\n"
- "mul z11.s, p2/M, z11.s, z0.s\n"
- "mul z12.s, p2/M, z12.s, z0.s\n"
- "mul z13.s, p2/M, z13.s, z0.s\n"
- "mul z14.s, p2/M, z14.s, z0.s\n"
- "54:" // Height 4: skip row sum fixup
- "add z31.s, z31.s, z11.s\n"
- "add z20.s, z20.s, z11.s\n"
- "ld1w { z4.s }, p2/Z, [x10]\n"
- "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add z21.s, z21.s, z11.s\n"
- "add z22.s, z22.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
- "add z16.s, z16.s, z12.s\n"
- "add z17.s, z17.s, z12.s\n"
- "add x20, %x[qp], %[per_layer_mul]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
- "add z18.s, z18.s, z12.s\n"
- "add z19.s, z19.s, z12.s\n"
- "ld1rw { z1.s }, p2/Z, [x20]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "add z23.s, z23.s, z13.s\n"
- "add z28.s, z28.s, z13.s\n"
- "addvl x10, x10, #4\n"
- "add z29.s, z29.s, z13.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z24.s, z24.s, z14.s\n"
- "add z25.s, z25.s, z14.s\n"
- "add z26.s, z26.s, z14.s\n"
- "add z27.s, z27.s, z14.s\n"
- "add z31.s, z31.s, z4.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z3.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z0.s\n"
- "add z18.s, z18.s, z3.s\n"
- "add z19.s, z19.s, z2.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z28.s, z28.s, z0.s\n"
- "add z29.s, z29.s, z3.s\n"
- "add z30.s, z30.s, z2.s\n"
- "add z24.s, z24.s, z4.s\n"
- "add z25.s, z25.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z26.s, z26.s, z3.s\n"
- "add z27.s, z27.s, z2.s\n"
- ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n"
- ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n"
- ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n"
- ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n"
- ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n"
- ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n"
- ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n"
- ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n"
- ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
- ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
- ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n"
- ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n"
- ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n"
- ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n"
- ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
- ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
- "tbz %x[flags], #5, 55f\n"
- "and z2.d, z31.d, z0.d\n"
- "and z1.d, z20.d, z0.d\n"
- "and z7.d, z21.d, z0.d\n"
- "and z6.d, z22.d, z0.d\n"
- "and z5.d, z16.d, z0.d\n"
- "and z4.d, z17.d, z0.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
- "and z3.d, z18.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z31.s, z31.s, z2.s\n"
- "sqadd z20.s, z20.s, z1.s\n"
- "and z2.d, z19.d, z0.d\n"
- "and z1.d, z23.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z21.s, z21.s, z7.s\n"
- "sqadd z22.s, z22.s, z6.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
- "sqadd z16.s, z16.s, z5.s\n"
- "sqadd z17.s, z17.s, z4.s\n"
- "sqadd z18.s, z18.s, z3.s\n"
- "and z7.d, z28.d, z0.d\n"
- "sqadd z19.s, z19.s, z2.s\n"
- "sqadd z23.s, z23.s, z1.s\n"
- "and z6.d, z29.d, z0.d\n"
- "and z5.d, z30.d, z0.d\n"
- "and z4.d, z24.d, z0.d\n"
- "and z3.d, z25.d, z0.d\n"
- "and z2.d, z26.d, z0.d\n"
- "and z1.d, z27.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z3.s, z3.s, #0x1f\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z1.s, z1.s, #0x1f\n"
- "sqadd z28.s, z28.s, z7.s\n"
- "sqadd z29.s, z29.s, z6.s\n"
- "sqadd z30.s, z30.s, z5.s\n"
- "sqadd z24.s, z24.s, z4.s\n"
- "sqadd z25.s, z25.s, z3.s\n"
- "sqadd z26.s, z26.s, z2.s\n"
- "sqadd z27.s, z27.s, z1.s\n"
- "55:" // Height 4: no shift correction
- "add x20, %x[qp], %[c_offset]\n"
- ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "ld1rw { z2.s }, p2/Z, [x20]\n"
- ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z31.s, z31.s, z2.s\n"
- ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z20.s, z20.s, z2.s\n"
- "add z21.s, z21.s, z2.s\n"
- ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
- ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z16.s, z16.s, z2.s\n"
- ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
- ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z17.s, z17.s, z2.s\n"
- "add z18.s, z18.s, z2.s\n"
- ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
- ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z19.s, z19.s, z2.s\n"
- "add z23.s, z23.s, z2.s\n"
- ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "add x20, %x[qp], %[maxval]\n"
- "add z28.s, z28.s, z2.s\n"
- "add z29.s, z29.s, z2.s\n"
- "ld1rw { z1.s }, p2/Z, [x20]\n"
- "add z30.s, z30.s, z2.s\n"
- "add z24.s, z24.s, z2.s\n"
- "add x20, %x[qp], %[minval]\n"
- "add z25.s, z25.s, z2.s\n"
- "add z26.s, z26.s, z2.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z2.s\n"
- "smin z31.s, p2/M, z31.s, z1.s\n"
- "smin z20.s, p2/M, z20.s, z1.s\n"
- "smin z21.s, p2/M, z21.s, z1.s\n"
- "smin z22.s, p2/M, z22.s, z1.s\n"
- "smin z16.s, p2/M, z16.s, z1.s\n"
- "smin z17.s, p2/M, z17.s, z1.s\n"
- "smin z18.s, p2/M, z18.s, z1.s\n"
- "smin z19.s, p2/M, z19.s, z1.s\n"
- "smin z23.s, p2/M, z23.s, z1.s\n"
- "smin z28.s, p2/M, z28.s, z1.s\n"
- "smin z29.s, p2/M, z29.s, z1.s\n"
- "smin z30.s, p2/M, z30.s, z1.s\n"
- "smin z24.s, p2/M, z24.s, z1.s\n"
- "smin z25.s, p2/M, z25.s, z1.s\n"
- "smin z26.s, p2/M, z26.s, z1.s\n"
- "smin z27.s, p2/M, z27.s, z1.s\n"
- "smax z31.s, p2/M, z31.s, z0.s\n"
- "smax z20.s, p2/M, z20.s, z0.s\n"
- "smax z21.s, p2/M, z21.s, z0.s\n"
- "smax z22.s, p2/M, z22.s, z0.s\n"
- "smax z16.s, p2/M, z16.s, z0.s\n"
- "smax z17.s, p2/M, z17.s, z0.s\n"
- "smax z18.s, p2/M, z18.s, z0.s\n"
- "smax z19.s, p2/M, z19.s, z0.s\n"
- "uzp1 z31.h, z31.h, z20.h\n"
- "smax z23.s, p2/M, z23.s, z0.s\n"
- "smax z28.s, p2/M, z28.s, z0.s\n"
- "uzp1 z20.h, z21.h, z22.h\n"
- "smax z29.s, p2/M, z29.s, z0.s\n"
- "smax z30.s, p2/M, z30.s, z0.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "smax z24.s, p2/M, z24.s, z0.s\n"
- "smax z25.s, p2/M, z25.s, z0.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "smax z26.s, p2/M, z26.s, z0.s\n"
- "smax z27.s, p2/M, z27.s, z0.s\n"
- "uzp1 z23.h, z23.h, z28.h\n"
- "uzp1 z31.b, z31.b, z20.b\n"
- "uzp1 z18.h, z29.h, z30.h\n"
- "uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "uzp1 z17.h, z26.h, z27.h\n"
- "st1b { z31.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "uzp1 z23.b, z23.b, z18.b\n"
- "uzp1 z24.b, z24.b, z17.b\n"
- "st1b { z16.b }, p1, [x23]\n"
- "st1b { z23.b }, p1, [x22]\n"
- "st1b { z24.b }, p1, [x21]\n"
- "56:" // Height 4: Writeback done
- "decw x9, ALL, MUL #4\n"
- "cmp x9, XZR\n"
- "bgt 44b\n"
- "subs %x[M], %x[M], #0x4\n"
- "beq 58f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 57f\n"
- "add x21, x21, #0x4\n"
- "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "b 1b\n"
- "57:" // Update direct input
- "mov x20, #0x4\n"
- "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
- "b 1b\n"
- "58:" // Exit
- : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8s32_mmla_6x4VL.hpp
deleted file mode 100644
index 7a8ee8ecb8..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8s32_mmla_6x4VL.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../std_transforms_sve.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- unsigned int, const unsigned int *, \
- IndirectInputArg<uint8_t>, \
- size_t, size_t, \
- const int8_t *, \
- IndirectOutputArg<int32_t>, \
- const int32_t *, Activation, bool
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void sve_hybrid_u8s8s32_mmla_6x4VL( ARGLIST );
-
-class cls_sve_hybrid_u8s8s32_mmla_6x4VL
-{
-public:
- typedef uint8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
- typedef int32_t result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 6;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<int32_t>() * 4;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 8;
- }
-
- static constexpr bool supports_accumulate()
- {
- return true;
- }
-
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 8, 8> transforms = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
- if (std::is_same<T, uint32_t>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 54.45 };
- case CPUModel::A510:
- return { 24.22 };
- case CPUModel::V1:
- return { 105.16 };
- }
- }
-
- if (std::is_same<T, uint8_t>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 54.90, 15.69, 0.62 };
- case CPUModel::A510:
- return { 26.80, 3.89, 0.47 };
- case CPUModel::V1:
- return { 75.14, 15.87, 0.83 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=sve_hybrid_u8s8s32_mmla_6x4VL;
- cls_sve_hybrid_u8s8s32_mmla_6x4VL(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8s32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8s32_mmla_6x4VL/generic.cpp
deleted file mode 100644
index 14299e80d6..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8s32_mmla_6x4VL/generic.cpp
+++ /dev/null
@@ -1,1675 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-
-namespace arm_gemm {
-
-void sve_hybrid_u8s8s32_mmla_6x4VL (
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
- size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
- const int32_t *, Activation, bool accumulate
-)
-{
- struct KernelArgs {
- unsigned int num_strings = {};
- const unsigned int *string_lengths = {};
- size_t N = {};
- const int8_t *B_ptr = {};
- size_t output_offset = {};
- size_t input_initial_col = {};
- size_t input_offset = {};
- void *output_ptr = {};
- } ka;
-
- unsigned long flags=0;
- void *input_ptr;
-
- if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
- ka.output_offset=output_arg.indirect.offset;
- flags |= 0x4;
- } else {
- ka.output_ptr=(void *)(output_arg.direct.base);
- ka.output_offset=output_arg.direct.stride;
- }
-
- if (A_arg.is_indirect) {
- input_ptr=(void *)(A_arg.indirect.ptr);
- ka.input_offset=A_arg.indirect.start_row;
- ka.input_initial_col=A_arg.indirect.start_col;
- flags |= 0x8;
- } else {
- assert(num_strings==1);
- input_ptr=(void *)(A_arg.direct.base);
- ka.input_offset=A_arg.direct.stride;
- }
- if (accumulate) {
- flags |= 0x1;
- }
- ka.num_strings = num_strings;
- ka.string_lengths = string_lengths;
- ka.N = N;
- ka.B_ptr = B_ptr;
- __asm__ __volatile__(
- "ptrue p5.b\n"
- "1:" // Row loop
- "cmp %x[M], #0x6\n"
- "bge 56f\n"
- "cmp %x[M], #0x4\n"
- "bgt 45f\n"
- "beq 34f\n"
- "cmp %x[M], #0x2\n"
- "bgt 23f\n"
- "beq 12f\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "2:" // Height 1: Column loop
- "mov x20, #0x0\n"
- "whilelt p4.s, x20, x11\n"
- "incw x20\n"
- "whilelt p3.s, x20, x11\n"
- "incw x20\n"
- "whilelt p2.s, x20, x11\n"
- "incw x20\n"
- "whilelt p1.s, x20, x11\n"
- "tbz %x[flags], #0, 3f\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z19.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "zip1 z9.d, z19.d, z13.d\n"
- "zip2 z13.d, z19.d, z13.d\n"
- "zip1 z10.d, z17.d, z14.d\n"
- "zip2 z14.d, z17.d, z14.d\n"
- "zip1 z11.d, z16.d, z15.d\n"
- "zip2 z15.d, z16.d, z15.d\n"
- "b 4f\n"
- "3:" // Height 1: no accumulate
- "mov z8.s, #0x0\n"
- "mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "mov z13.s, #0x0\n"
- "mov z14.s, #0x0\n"
- "mov z15.s, #0x0\n"
- "4:" // Height 1: setup done
- "mov x28, #0x0\n"
- "5:" // Height 1: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 6f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "cbnz x28, 7f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "b 7f\n"
- "6:" // Height 1: setup direct input
- "mov x26, %x[input_ptr]\n"
- "7:" // Height 1: input setup done
- "cmp x27, #0x10\n"
- "ble 9f\n"
- "8:" // Height 1: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z16.b }, p5/Z, [x10]\n"
- "ld1b { z17.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z19.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "trn1 z18.d, z19.d, z20.d\n"
- "trn2 z19.d, z19.d, z20.d\n"
- ".inst 0x45909a48 // usmmla z8.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45919a4c // usmmla z12.s, z18.b, z17.b\n"
- "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45909a49 // usmmla z9.s, z18.b, z16.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45949a4d // usmmla z13.s, z18.b, z20.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45819a4a // usmmla z10.s, z18.b, z1.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45909a4e // usmmla z14.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x45919a4b // usmmla z11.s, z18.b, z17.b\n"
- ".inst 0x45909a4f // usmmla z15.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45909a68 // usmmla z8.s, z19.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45919a6c // usmmla z12.s, z19.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45909a69 // usmmla z9.s, z19.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45919a6d // usmmla z13.s, z19.b, z17.b\n"
- "ld1b { z3.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45909a6a // usmmla z10.s, z19.b, z16.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45839a6e // usmmla z14.s, z19.b, z3.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45919a6b // usmmla z11.s, z19.b, z17.b\n"
- ".inst 0x45909a6f // usmmla z15.s, z19.b, z16.b\n"
- "bgt 8b\n"
- "9:" // Height 1: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "trn1 z18.d, z1.d, z19.d\n"
- ".inst 0x45919a48 // usmmla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45909a4c // usmmla z12.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z19.d\n"
- ".inst 0x45919a49 // usmmla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45909a4d // usmmla z13.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45919a4a // usmmla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45909a4e // usmmla z14.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x45919a4b // usmmla z11.s, z18.b, z17.b\n"
- ".inst 0x45909a4f // usmmla z15.s, z18.b, z16.b\n"
- "ble 10f\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45919828 // usmmla z8.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4590982c // usmmla z12.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45919829 // usmmla z9.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4590982d // usmmla z13.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4591982a // usmmla z10.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4590982e // usmmla z14.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x4591982b // usmmla z11.s, z1.b, z17.b\n"
- ".inst 0x4590982f // usmmla z15.s, z1.b, z16.b\n"
- "10:" // Height 1: Multiply loop: multiply skip
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 5b\n"
- "uzp1 z8.d, z8.d, z12.d\n"
- "uzp1 z9.d, z9.d, z13.d\n"
- "uzp1 z10.d, z10.d, z14.d\n"
- "uzp1 z11.d, z11.d, z15.d\n"
- "st1w { z8.s }, p4, [x9]\n"
- "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "11:" // Height 1: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
- "bgt 2b\n"
- "b 68f\n"
- "12:" // Height 2
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "13:" // Height 2: Column loop
- "mov x20, #0x0\n"
- "whilelt p4.s, x20, x11\n"
- "incw x20\n"
- "whilelt p3.s, x20, x11\n"
- "incw x20\n"
- "whilelt p2.s, x20, x11\n"
- "incw x20\n"
- "whilelt p1.s, x20, x11\n"
- "tbz %x[flags], #0, 14f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z18.s }, p4/Z, [x9]\n"
- "ld1w { z24.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x20]\n"
- "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z8.d, z18.d, z12.d\n"
- "zip2 z12.d, z18.d, z12.d\n"
- "zip1 z9.d, z24.d, z13.d\n"
- "zip2 z13.d, z24.d, z13.d\n"
- "zip1 z10.d, z17.d, z14.d\n"
- "zip2 z14.d, z17.d, z14.d\n"
- "zip1 z11.d, z16.d, z15.d\n"
- "zip2 z15.d, z16.d, z15.d\n"
- "b 15f\n"
- "14:" // Height 2: no accumulate
- "mov z8.s, #0x0\n"
- "mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "mov z13.s, #0x0\n"
- "mov z14.s, #0x0\n"
- "mov z15.s, #0x0\n"
- "15:" // Height 2: setup done
- "mov x28, #0x0\n"
- "16:" // Height 2: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 17f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "cbnz x28, 18f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "b 18f\n"
- "17:" // Height 2: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "18:" // Height 2: input setup done
- "cmp x27, #0x10\n"
- "ble 20f\n"
- "19:" // Height 2: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z19.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "trn1 z18.d, z19.d, z25.d\n"
- "trn2 z19.d, z19.d, z25.d\n"
- ".inst 0x45919a48 // usmmla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45909a4c // usmmla z12.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45919a49 // usmmla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45909a4d // usmmla z13.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45919a4a // usmmla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45909a4e // usmmla z14.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x45919a4b // usmmla z11.s, z18.b, z17.b\n"
- ".inst 0x45909a4f // usmmla z15.s, z18.b, z16.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45919a68 // usmmla z8.s, z19.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45909a6c // usmmla z12.s, z19.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45919a69 // usmmla z9.s, z19.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45909a6d // usmmla z13.s, z19.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45919a6a // usmmla z10.s, z19.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45909a6e // usmmla z14.s, z19.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45919a6b // usmmla z11.s, z19.b, z17.b\n"
- ".inst 0x45909a6f // usmmla z15.s, z19.b, z16.b\n"
- "bgt 19b\n"
- "20:" // Height 2: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z19.b }, p0/Z, [x25]\n"
- "trn1 z18.d, z1.d, z19.d\n"
- ".inst 0x45919a48 // usmmla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45909a4c // usmmla z12.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z19.d\n"
- ".inst 0x45919a49 // usmmla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45909a4d // usmmla z13.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45919a4a // usmmla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45909a4e // usmmla z14.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x45919a4b // usmmla z11.s, z18.b, z17.b\n"
- ".inst 0x45909a4f // usmmla z15.s, z18.b, z16.b\n"
- "ble 21f\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45919828 // usmmla z8.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4590982c // usmmla z12.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45919829 // usmmla z9.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4590982d // usmmla z13.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4591982a // usmmla z10.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4590982e // usmmla z14.s, z1.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x4591982b // usmmla z11.s, z1.b, z17.b\n"
- ".inst 0x4590982f // usmmla z15.s, z1.b, z16.b\n"
- "21:" // Height 2: Multiply loop: multiply skip
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 16b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z17.d, z8.d, z12.d\n"
- "uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z16.d, z9.d, z13.d\n"
- "uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z12.d, z10.d, z14.d\n"
- "uzp2 z10.d, z10.d, z14.d\n"
- "add x20, x9, x20, LSL #2\n"
- "uzp1 z26.d, z11.d, z15.d\n"
- "uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z17.s }, p4, [x9]\n"
- "st1w { z16.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z12.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z26.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x20]\n"
- "st1w { z9.s }, p3, [x20, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x20, #3, MUL VL]\n"
- "22:" // Height 2: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
- "bgt 13b\n"
- "b 68f\n"
- "23:" // Height 3
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "24:" // Height 3: Column loop
- "mov x20, #0x0\n"
- "whilelt p4.s, x20, x11\n"
- "incw x20\n"
- "whilelt p3.s, x20, x11\n"
- "incw x20\n"
- "whilelt p2.s, x20, x11\n"
- "incw x20\n"
- "whilelt p1.s, x20, x11\n"
- "tbz %x[flags], #0, 25f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p4/Z, [x9]\n"
- "ld1w { z26.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x21]\n"
- "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x20]\n"
- "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
- "zip1 z8.d, z24.d, z12.d\n"
- "zip2 z12.d, z24.d, z12.d\n"
- "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z9.d, z26.d, z13.d\n"
- "zip2 z13.d, z26.d, z13.d\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "zip1 z11.d, z16.d, z15.d\n"
- "zip2 z15.d, z16.d, z15.d\n"
- "zip1 z16.d, z17.d, z20.d\n"
- "zip2 z20.d, z17.d, z20.d\n"
- "zip1 z17.d, z18.d, z21.d\n"
- "zip2 z21.d, z18.d, z21.d\n"
- "zip1 z18.d, z19.d, z22.d\n"
- "zip2 z22.d, z19.d, z22.d\n"
- "zip1 z19.d, z24.d, z23.d\n"
- "zip2 z23.d, z24.d, z23.d\n"
- "b 26f\n"
- "25:" // Height 3: no accumulate
- "mov z8.s, #0x0\n"
- "mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "mov z13.s, #0x0\n"
- "mov z14.s, #0x0\n"
- "mov z15.s, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "26:" // Height 3: setup done
- "mov x28, #0x0\n"
- "27:" // Height 3: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 28f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "cbnz x28, 29f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "b 29f\n"
- "28:" // Height 3: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "29:" // Height 3: input setup done
- "cmp x27, #0x10\n"
- "ble 31f\n"
- "30:" // Height 3: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z27.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z24.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqb { z26.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "trn1 z6.d, z27.d, z24.d\n"
- "trn2 z27.d, z27.d, z24.d\n"
- "trn1 z30.d, z26.d, z29.d\n"
- "trn2 z26.d, z26.d, z29.d\n"
- ".inst 0x459998c8 // usmmla z8.s, z6.b, z25.b\n"
- ".inst 0x459c98cc // usmmla z12.s, z6.b, z28.b\n"
- ".inst 0x45999bd0 // usmmla z16.s, z30.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x459c9bd4 // usmmla z20.s, z30.b, z28.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x459998c9 // usmmla z9.s, z6.b, z25.b\n"
- ".inst 0x45999bd1 // usmmla z17.s, z30.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x459898cd // usmmla z13.s, z6.b, z24.b\n"
- ".inst 0x45989bd5 // usmmla z21.s, z30.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x459998ca // usmmla z10.s, z6.b, z25.b\n"
- ".inst 0x45999bd2 // usmmla z18.s, z30.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x459898ce // usmmla z14.s, z6.b, z24.b\n"
- ".inst 0x45989bd6 // usmmla z22.s, z30.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x459998cb // usmmla z11.s, z6.b, z25.b\n"
- ".inst 0x45999bd3 // usmmla z19.s, z30.b, z25.b\n"
- ".inst 0x459898cf // usmmla z15.s, z6.b, z24.b\n"
- ".inst 0x45989bd7 // usmmla z23.s, z30.b, z24.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45999b68 // usmmla z8.s, z27.b, z25.b\n"
- ".inst 0x45999b50 // usmmla z16.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45989b6c // usmmla z12.s, z27.b, z24.b\n"
- ".inst 0x45989b54 // usmmla z20.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45999b69 // usmmla z9.s, z27.b, z25.b\n"
- ".inst 0x45999b51 // usmmla z17.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45989b6d // usmmla z13.s, z27.b, z24.b\n"
- ".inst 0x45989b55 // usmmla z21.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45999b6a // usmmla z10.s, z27.b, z25.b\n"
- ".inst 0x45999b52 // usmmla z18.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45989b6e // usmmla z14.s, z27.b, z24.b\n"
- ".inst 0x45989b56 // usmmla z22.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45999b6b // usmmla z11.s, z27.b, z25.b\n"
- ".inst 0x45999b53 // usmmla z19.s, z26.b, z25.b\n"
- ".inst 0x45989b6f // usmmla z15.s, z27.b, z24.b\n"
- ".inst 0x45989b57 // usmmla z23.s, z26.b, z24.b\n"
- "bgt 30b\n"
- "31:" // Height 3: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z24.b }, p0/Z, [x25]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "trn1 z27.d, z1.d, z24.d\n"
- "trn2 z1.d, z1.d, z24.d\n"
- "trn1 z26.d, z3.d, z29.d\n"
- ".inst 0x45999b68 // usmmla z8.s, z27.b, z25.b\n"
- ".inst 0x459c9b6c // usmmla z12.s, z27.b, z28.b\n"
- "trn2 z3.d, z3.d, z29.d\n"
- ".inst 0x45999b50 // usmmla z16.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x459c9b54 // usmmla z20.s, z26.b, z28.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45999b69 // usmmla z9.s, z27.b, z25.b\n"
- ".inst 0x45999b51 // usmmla z17.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45989b6d // usmmla z13.s, z27.b, z24.b\n"
- ".inst 0x45989b55 // usmmla z21.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45999b6a // usmmla z10.s, z27.b, z25.b\n"
- ".inst 0x45999b52 // usmmla z18.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45989b6e // usmmla z14.s, z27.b, z24.b\n"
- ".inst 0x45989b56 // usmmla z22.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x45999b6b // usmmla z11.s, z27.b, z25.b\n"
- ".inst 0x45999b53 // usmmla z19.s, z26.b, z25.b\n"
- ".inst 0x45989b6f // usmmla z15.s, z27.b, z24.b\n"
- ".inst 0x45989b57 // usmmla z23.s, z26.b, z24.b\n"
- "ble 32f\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45999828 // usmmla z8.s, z1.b, z25.b\n"
- ".inst 0x45999870 // usmmla z16.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4598982c // usmmla z12.s, z1.b, z24.b\n"
- ".inst 0x45989874 // usmmla z20.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45999829 // usmmla z9.s, z1.b, z25.b\n"
- ".inst 0x45999871 // usmmla z17.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4598982d // usmmla z13.s, z1.b, z24.b\n"
- ".inst 0x45989875 // usmmla z21.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4599982a // usmmla z10.s, z1.b, z25.b\n"
- ".inst 0x45999872 // usmmla z18.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4598982e // usmmla z14.s, z1.b, z24.b\n"
- ".inst 0x45989876 // usmmla z22.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x4599982b // usmmla z11.s, z1.b, z25.b\n"
- ".inst 0x45999873 // usmmla z19.s, z3.b, z25.b\n"
- ".inst 0x4598982f // usmmla z15.s, z1.b, z24.b\n"
- ".inst 0x45989877 // usmmla z23.s, z3.b, z24.b\n"
- "32:" // Height 3: Multiply loop: multiply skip
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 27b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z27.d, z8.d, z12.d\n"
- "uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z26.d, z9.d, z13.d\n"
- "uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z25.d, z10.d, z14.d\n"
- "uzp2 z10.d, z10.d, z14.d\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
- "uzp1 z24.d, z11.d, z15.d\n"
- "uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z27.s }, p4, [x9]\n"
- "uzp1 z16.d, z16.d, z20.d\n"
- "uzp1 z17.d, z17.d, z21.d\n"
- "st1w { z26.s }, p3, [x9, #1, MUL VL]\n"
- "uzp1 z18.d, z18.d, z22.d\n"
- "uzp1 z19.d, z19.d, z23.d\n"
- "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x21]\n"
- "st1w { z9.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x21, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x20]\n"
- "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
- "33:" // Height 3: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
- "bgt 24b\n"
- "b 68f\n"
- "34:" // Height 4
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "35:" // Height 4: Column loop
- "mov x20, #0x0\n"
- "whilelt p4.s, x20, x11\n"
- "incw x20\n"
- "whilelt p3.s, x20, x11\n"
- "incw x20\n"
- "whilelt p2.s, x20, x11\n"
- "incw x20\n"
- "whilelt p1.s, x20, x11\n"
- "tbz %x[flags], #0, 36f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x22]\n"
- "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x21]\n"
- "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x20]\n"
- "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "zip1 z11.d, z16.d, z15.d\n"
- "zip2 z15.d, z16.d, z15.d\n"
- "zip1 z16.d, z17.d, z20.d\n"
- "zip2 z20.d, z17.d, z20.d\n"
- "zip1 z17.d, z18.d, z21.d\n"
- "zip2 z21.d, z18.d, z21.d\n"
- "zip1 z18.d, z19.d, z22.d\n"
- "zip2 z22.d, z19.d, z22.d\n"
- "zip1 z19.d, z24.d, z23.d\n"
- "zip2 z23.d, z24.d, z23.d\n"
- "b 37f\n"
- "36:" // Height 4: no accumulate
- "mov z8.s, #0x0\n"
- "mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "mov z13.s, #0x0\n"
- "mov z14.s, #0x0\n"
- "mov z15.s, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "37:" // Height 4: setup done
- "mov x28, #0x0\n"
- "38:" // Height 4: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 39f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "ldr x23, [x20, #0x18]\n"
- "cbnz x28, 40f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "b 40f\n"
- "39:" // Height 4: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "add x23, x24, x21\n"
- "40:" // Height 4: input setup done
- "cmp x27, #0x10\n"
- "ble 42f\n"
- "41:" // Height 4: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z31.b }, p5/Z, [x10]\n"
- "ld1b { z30.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z29.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqb { z28.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- "trn1 z27.d, z29.d, z25.d\n"
- "trn2 z29.d, z29.d, z25.d\n"
- "trn1 z26.d, z28.d, z24.d\n"
- "trn2 z28.d, z28.d, z24.d\n"
- ".inst 0x459f9b68 // usmmla z8.s, z27.b, z31.b\n"
- ".inst 0x459e9b6c // usmmla z12.s, z27.b, z30.b\n"
- ".inst 0x459f9b50 // usmmla z16.s, z26.b, z31.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x459e9b54 // usmmla z20.s, z26.b, z30.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45999b69 // usmmla z9.s, z27.b, z25.b\n"
- ".inst 0x45999b51 // usmmla z17.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45989b6d // usmmla z13.s, z27.b, z24.b\n"
- ".inst 0x45989b55 // usmmla z21.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45999b6a // usmmla z10.s, z27.b, z25.b\n"
- ".inst 0x45999b52 // usmmla z18.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45989b6e // usmmla z14.s, z27.b, z24.b\n"
- ".inst 0x45989b56 // usmmla z22.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x45999b6b // usmmla z11.s, z27.b, z25.b\n"
- ".inst 0x45999b53 // usmmla z19.s, z26.b, z25.b\n"
- ".inst 0x45989b6f // usmmla z15.s, z27.b, z24.b\n"
- ".inst 0x45989b57 // usmmla z23.s, z26.b, z24.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45999ba8 // usmmla z8.s, z29.b, z25.b\n"
- ".inst 0x45999b90 // usmmla z16.s, z28.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45989bac // usmmla z12.s, z29.b, z24.b\n"
- ".inst 0x45989b94 // usmmla z20.s, z28.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45999ba9 // usmmla z9.s, z29.b, z25.b\n"
- ".inst 0x45999b91 // usmmla z17.s, z28.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45989bad // usmmla z13.s, z29.b, z24.b\n"
- ".inst 0x45989b95 // usmmla z21.s, z28.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45999baa // usmmla z10.s, z29.b, z25.b\n"
- ".inst 0x45999b92 // usmmla z18.s, z28.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45989bae // usmmla z14.s, z29.b, z24.b\n"
- ".inst 0x45989b96 // usmmla z22.s, z28.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45999bab // usmmla z11.s, z29.b, z25.b\n"
- ".inst 0x45999b93 // usmmla z19.s, z28.b, z25.b\n"
- ".inst 0x45989baf // usmmla z15.s, z29.b, z24.b\n"
- ".inst 0x45989b97 // usmmla z23.s, z28.b, z24.b\n"
- "bgt 41b\n"
- "42:" // Height 4: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z29.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
- "trn1 z27.d, z1.d, z25.d\n"
- "trn2 z1.d, z1.d, z25.d\n"
- "trn1 z26.d, z3.d, z24.d\n"
- ".inst 0x459d9b68 // usmmla z8.s, z27.b, z29.b\n"
- ".inst 0x459c9b6c // usmmla z12.s, z27.b, z28.b\n"
- "trn2 z3.d, z3.d, z24.d\n"
- ".inst 0x459d9b50 // usmmla z16.s, z26.b, z29.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x459c9b54 // usmmla z20.s, z26.b, z28.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45999b69 // usmmla z9.s, z27.b, z25.b\n"
- ".inst 0x45999b51 // usmmla z17.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45989b6d // usmmla z13.s, z27.b, z24.b\n"
- ".inst 0x45989b55 // usmmla z21.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45999b6a // usmmla z10.s, z27.b, z25.b\n"
- ".inst 0x45999b52 // usmmla z18.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45989b6e // usmmla z14.s, z27.b, z24.b\n"
- ".inst 0x45989b56 // usmmla z22.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x45999b6b // usmmla z11.s, z27.b, z25.b\n"
- ".inst 0x45999b53 // usmmla z19.s, z26.b, z25.b\n"
- ".inst 0x45989b6f // usmmla z15.s, z27.b, z24.b\n"
- ".inst 0x45989b57 // usmmla z23.s, z26.b, z24.b\n"
- "ble 43f\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45999828 // usmmla z8.s, z1.b, z25.b\n"
- ".inst 0x45999870 // usmmla z16.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4598982c // usmmla z12.s, z1.b, z24.b\n"
- ".inst 0x45989874 // usmmla z20.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45999829 // usmmla z9.s, z1.b, z25.b\n"
- ".inst 0x45999871 // usmmla z17.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4598982d // usmmla z13.s, z1.b, z24.b\n"
- ".inst 0x45989875 // usmmla z21.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4599982a // usmmla z10.s, z1.b, z25.b\n"
- ".inst 0x45999872 // usmmla z18.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4598982e // usmmla z14.s, z1.b, z24.b\n"
- ".inst 0x45989876 // usmmla z22.s, z3.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x4599982b // usmmla z11.s, z1.b, z25.b\n"
- ".inst 0x45999873 // usmmla z19.s, z3.b, z25.b\n"
- ".inst 0x4598982f // usmmla z15.s, z1.b, z24.b\n"
- ".inst 0x45989877 // usmmla z23.s, z3.b, z24.b\n"
- "43:" // Height 4: Multiply loop: multiply skip
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 38b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z25.d, z8.d, z12.d\n"
- "uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z24.d, z9.d, z13.d\n"
- "uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z27.d, z10.d, z14.d\n"
- "uzp2 z10.d, z10.d, z14.d\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
- "uzp1 z26.d, z11.d, z15.d\n"
- "uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z25.s }, p4, [x9]\n"
- "uzp1 z25.d, z16.d, z20.d\n"
- "uzp2 z16.d, z16.d, z20.d\n"
- "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
- "uzp1 z24.d, z17.d, z21.d\n"
- "uzp2 z17.d, z17.d, z21.d\n"
- "st1w { z27.s }, p2, [x9, #2, MUL VL]\n"
- "uzp1 z21.d, z18.d, z22.d\n"
- "uzp2 z18.d, z18.d, z22.d\n"
- "st1w { z26.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "uzp1 z20.d, z19.d, z23.d\n"
- "uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z8.s }, p4, [x22]\n"
- "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z25.s }, p4, [x21]\n"
- "st1w { z24.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z20.s }, p1, [x21, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x20]\n"
- "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
- "44:" // Height 4: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
- "bgt 35b\n"
- "b 68f\n"
- "45:" // Height 5
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "46:" // Height 5: Column loop
- "mov x20, #0x0\n"
- "whilelt p4.s, x20, x11\n"
- "incw x20\n"
- "whilelt p3.s, x20, x11\n"
- "incw x20\n"
- "whilelt p2.s, x20, x11\n"
- "incw x20\n"
- "whilelt p1.s, x20, x11\n"
- "tbz %x[flags], #0, 47f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x23, x9, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x23]\n"
- "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
- "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x22]\n"
- "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x21]\n"
- "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x20]\n"
- "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
- "zip1 z11.d, z16.d, z15.d\n"
- "zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z16.d, z17.d, z20.d\n"
- "zip2 z20.d, z17.d, z20.d\n"
- "zip1 z17.d, z18.d, z21.d\n"
- "zip2 z21.d, z18.d, z21.d\n"
- "zip1 z18.d, z19.d, z22.d\n"
- "zip2 z22.d, z19.d, z22.d\n"
- "zip1 z19.d, z24.d, z23.d\n"
- "zip2 z23.d, z24.d, z23.d\n"
- "zip1 z24.d, z25.d, z28.d\n"
- "zip2 z28.d, z25.d, z28.d\n"
- "zip1 z25.d, z26.d, z29.d\n"
- "zip2 z29.d, z26.d, z29.d\n"
- "zip1 z26.d, z27.d, z30.d\n"
- "zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z0.d, z31.d\n"
- "zip2 z31.d, z0.d, z31.d\n"
- "b 48f\n"
- "47:" // Height 5: no accumulate
- "mov z8.s, #0x0\n"
- "mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "mov z13.s, #0x0\n"
- "mov z14.s, #0x0\n"
- "mov z15.s, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "mov z24.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "mov z27.s, #0x0\n"
- "mov z28.s, #0x0\n"
- "mov z29.s, #0x0\n"
- "mov z30.s, #0x0\n"
- "mov z31.s, #0x0\n"
- "48:" // Height 5: setup done
- "mov x28, #0x0\n"
- "49:" // Height 5: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 50f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "ldr x23, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x28, 51f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "b 51f\n"
- "50:" // Height 5: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "51:" // Height 5: input setup done
- "cmp x27, #0x10\n"
- "ble 53f\n"
- "52:" // Height 5: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p5/Z, [x10]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z6.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqb { z7.b }, p0/Z, [x24]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
- "trn1 z3.d, z7.d, z2.d\n"
- "trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
- "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45819888 // usmmla z8.s, z4.b, z1.b\n"
- ".inst 0x45819870 // usmmla z16.s, z3.b, z1.b\n"
- ".inst 0x45819858 // usmmla z24.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4580988c // usmmla z12.s, z4.b, z0.b\n"
- ".inst 0x45809874 // usmmla z20.s, z3.b, z0.b\n"
- ".inst 0x4580985c // usmmla z28.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45819889 // usmmla z9.s, z4.b, z1.b\n"
- ".inst 0x45819871 // usmmla z17.s, z3.b, z1.b\n"
- ".inst 0x45819859 // usmmla z25.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4580988d // usmmla z13.s, z4.b, z0.b\n"
- ".inst 0x45809875 // usmmla z21.s, z3.b, z0.b\n"
- ".inst 0x4580985d // usmmla z29.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4581988a // usmmla z10.s, z4.b, z1.b\n"
- ".inst 0x45819872 // usmmla z18.s, z3.b, z1.b\n"
- ".inst 0x4581985a // usmmla z26.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4580988e // usmmla z14.s, z4.b, z0.b\n"
- ".inst 0x45809876 // usmmla z22.s, z3.b, z0.b\n"
- ".inst 0x4580985e // usmmla z30.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x4581988b // usmmla z11.s, z4.b, z1.b\n"
- ".inst 0x45819873 // usmmla z19.s, z3.b, z1.b\n"
- ".inst 0x4581985b // usmmla z27.s, z2.b, z1.b\n"
- ".inst 0x4580988f // usmmla z15.s, z4.b, z0.b\n"
- ".inst 0x45809877 // usmmla z23.s, z3.b, z0.b\n"
- ".inst 0x4580985f // usmmla z31.s, z2.b, z0.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x458198c8 // usmmla z8.s, z6.b, z1.b\n"
- ".inst 0x458198f0 // usmmla z16.s, z7.b, z1.b\n"
- ".inst 0x458198b8 // usmmla z24.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x458098cc // usmmla z12.s, z6.b, z0.b\n"
- ".inst 0x458098f4 // usmmla z20.s, z7.b, z0.b\n"
- ".inst 0x458098bc // usmmla z28.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x458198c9 // usmmla z9.s, z6.b, z1.b\n"
- ".inst 0x458198f1 // usmmla z17.s, z7.b, z1.b\n"
- ".inst 0x458198b9 // usmmla z25.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x458098cd // usmmla z13.s, z6.b, z0.b\n"
- ".inst 0x458098f5 // usmmla z21.s, z7.b, z0.b\n"
- ".inst 0x458098bd // usmmla z29.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x458198ca // usmmla z10.s, z6.b, z1.b\n"
- ".inst 0x458198f2 // usmmla z18.s, z7.b, z1.b\n"
- ".inst 0x458198ba // usmmla z26.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x458098ce // usmmla z14.s, z6.b, z0.b\n"
- ".inst 0x458098f6 // usmmla z22.s, z7.b, z0.b\n"
- ".inst 0x458098be // usmmla z30.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x458198cb // usmmla z11.s, z6.b, z1.b\n"
- ".inst 0x458198f3 // usmmla z19.s, z7.b, z1.b\n"
- ".inst 0x458198bb // usmmla z27.s, z5.b, z1.b\n"
- ".inst 0x458098cf // usmmla z15.s, z6.b, z0.b\n"
- ".inst 0x458098f7 // usmmla z23.s, z7.b, z0.b\n"
- ".inst 0x458098bf // usmmla z31.s, z5.b, z0.b\n"
- "bgt 52b\n"
- "53:" // Height 5: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z2.b }, p5/Z, [x10]\n"
- "subs x27, x27, #0x8\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z6.b }, p0/Z, [x25]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
- "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x458298e8 // usmmla z8.s, z7.b, z2.b\n"
- ".inst 0x458298d0 // usmmla z16.s, z6.b, z2.b\n"
- ".inst 0x45829898 // usmmla z24.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x458098ec // usmmla z12.s, z7.b, z0.b\n"
- ".inst 0x458098d4 // usmmla z20.s, z6.b, z0.b\n"
- ".inst 0x4580989c // usmmla z28.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x458298e9 // usmmla z9.s, z7.b, z2.b\n"
- ".inst 0x458298d1 // usmmla z17.s, z6.b, z2.b\n"
- ".inst 0x45829899 // usmmla z25.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x458098ed // usmmla z13.s, z7.b, z0.b\n"
- ".inst 0x458098d5 // usmmla z21.s, z6.b, z0.b\n"
- ".inst 0x4580989d // usmmla z29.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x458298ea // usmmla z10.s, z7.b, z2.b\n"
- ".inst 0x458298d2 // usmmla z18.s, z6.b, z2.b\n"
- ".inst 0x4582989a // usmmla z26.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x458098ee // usmmla z14.s, z7.b, z0.b\n"
- ".inst 0x458098d6 // usmmla z22.s, z6.b, z0.b\n"
- ".inst 0x4580989e // usmmla z30.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x458298eb // usmmla z11.s, z7.b, z2.b\n"
- ".inst 0x458298d3 // usmmla z19.s, z6.b, z2.b\n"
- ".inst 0x4582989b // usmmla z27.s, z4.b, z2.b\n"
- ".inst 0x458098ef // usmmla z15.s, z7.b, z0.b\n"
- ".inst 0x458098d7 // usmmla z23.s, z6.b, z0.b\n"
- ".inst 0x4580989f // usmmla z31.s, z4.b, z0.b\n"
- "ble 54f\n"
- "ld1b { z2.b }, p5/Z, [x10]\n"
- "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45829828 // usmmla z8.s, z1.b, z2.b\n"
- ".inst 0x45829870 // usmmla z16.s, z3.b, z2.b\n"
- ".inst 0x458298b8 // usmmla z24.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4580982c // usmmla z12.s, z1.b, z0.b\n"
- ".inst 0x45809874 // usmmla z20.s, z3.b, z0.b\n"
- ".inst 0x458098bc // usmmla z28.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45829829 // usmmla z9.s, z1.b, z2.b\n"
- ".inst 0x45829871 // usmmla z17.s, z3.b, z2.b\n"
- ".inst 0x458298b9 // usmmla z25.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4580982d // usmmla z13.s, z1.b, z0.b\n"
- ".inst 0x45809875 // usmmla z21.s, z3.b, z0.b\n"
- ".inst 0x458098bd // usmmla z29.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4582982a // usmmla z10.s, z1.b, z2.b\n"
- ".inst 0x45829872 // usmmla z18.s, z3.b, z2.b\n"
- ".inst 0x458298ba // usmmla z26.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4580982e // usmmla z14.s, z1.b, z0.b\n"
- ".inst 0x45809876 // usmmla z22.s, z3.b, z0.b\n"
- ".inst 0x458098be // usmmla z30.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x4582982b // usmmla z11.s, z1.b, z2.b\n"
- ".inst 0x45829873 // usmmla z19.s, z3.b, z2.b\n"
- ".inst 0x458298bb // usmmla z27.s, z5.b, z2.b\n"
- ".inst 0x4580982f // usmmla z15.s, z1.b, z0.b\n"
- ".inst 0x45809877 // usmmla z23.s, z3.b, z0.b\n"
- ".inst 0x458098bf // usmmla z31.s, z5.b, z0.b\n"
- "54:" // Height 5: Multiply loop: multiply skip
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 49b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z1.d, z8.d, z12.d\n"
- "uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z0.d, z9.d, z13.d\n"
- "uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z3.d, z10.d, z14.d\n"
- "uzp2 z10.d, z10.d, z14.d\n"
- "add x23, x9, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "uzp1 z2.d, z11.d, z15.d\n"
- "uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z1.s }, p4, [x9]\n"
- "add x20, x21, x20, LSL #2\n"
- "uzp1 z1.d, z16.d, z20.d\n"
- "uzp2 z16.d, z16.d, z20.d\n"
- "st1w { z0.s }, p3, [x9, #1, MUL VL]\n"
- "uzp1 z0.d, z17.d, z21.d\n"
- "uzp2 z17.d, z17.d, z21.d\n"
- "st1w { z3.s }, p2, [x9, #2, MUL VL]\n"
- "uzp1 z21.d, z18.d, z22.d\n"
- "uzp2 z18.d, z18.d, z22.d\n"
- "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "uzp1 z20.d, z19.d, z23.d\n"
- "uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z8.s }, p4, [x23]\n"
- "uzp1 z24.d, z24.d, z28.d\n"
- "uzp1 z25.d, z25.d, z29.d\n"
- "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
- "uzp1 z26.d, z26.d, z30.d\n"
- "uzp1 z27.d, z27.d, z31.d\n"
- "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z1.s }, p4, [x22]\n"
- "st1w { z0.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x21]\n"
- "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x20]\n"
- "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
- "55:" // Height 5: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
- "bgt 46b\n"
- "b 68f\n"
- "56:" // Height 6
- "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "mov x20, #0x18\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
- "57:" // Height 6: Column loop
- "mov x20, #0x0\n"
- "whilelt p4.s, x20, x11\n"
- "incw x20\n"
- "whilelt p3.s, x20, x11\n"
- "incw x20\n"
- "whilelt p2.s, x20, x11\n"
- "incw x20\n"
- "whilelt p1.s, x20, x11\n"
- "tbz %x[flags], #0, 58f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x23]\n"
- "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x21]\n"
- "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z11.d, z16.d, z15.d\n"
- "zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z16.d, z17.d, z20.d\n"
- "zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z28.s }, p4/Z, [x20]\n"
- "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
- "zip1 z17.d, z18.d, z21.d\n"
- "zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z18.d, z19.d, z22.d\n"
- "zip2 z22.d, z19.d, z22.d\n"
- "zip1 z19.d, z24.d, z23.d\n"
- "zip2 z23.d, z24.d, z23.d\n"
- "zip1 z24.d, z25.d, z28.d\n"
- "zip2 z28.d, z25.d, z28.d\n"
- "zip1 z25.d, z26.d, z29.d\n"
- "zip2 z29.d, z26.d, z29.d\n"
- "zip1 z26.d, z27.d, z30.d\n"
- "zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z0.d, z31.d\n"
- "zip2 z31.d, z0.d, z31.d\n"
- "b 59f\n"
- "58:" // Height 6: no accumulate
- "mov z8.s, #0x0\n"
- "mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "mov z13.s, #0x0\n"
- "mov z14.s, #0x0\n"
- "mov z15.s, #0x0\n"
- "mov z16.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "mov z21.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "mov z23.s, #0x0\n"
- "mov z24.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "mov z27.s, #0x0\n"
- "mov z28.s, #0x0\n"
- "mov z29.s, #0x0\n"
- "mov z30.s, #0x0\n"
- "mov z31.s, #0x0\n"
- "59:" // Height 6: setup done
- "mov x28, #0x0\n"
- "60:" // Height 6: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 61f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x20, x20, x21, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x25, [x20, #0x8]\n"
- "ldr x24, [x20, #0x10]\n"
- "ldr x23, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "ldr x21, [x20, #0x28]\n"
- "cbnz x28, 62f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x20\n"
- "add x25, x25, x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "add x21, x21, x20\n"
- "b 62f\n"
- "61:" // Height 6: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, x21\n"
- "add x24, x25, x21\n"
- "add x23, x24, x21\n"
- "add x22, x23, x21\n"
- "add x21, x22, x21\n"
- "62:" // Height 6: input setup done
- "cmp x27, #0x10\n"
- "ble 64f\n"
- "63:" // Height 6: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p5/Z, [x10]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z6.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqb { z7.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "ld1rqb { z0.b }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
- "add x21, x21, #0x10\n"
- "trn1 z3.d, z7.d, z2.d\n"
- "trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
- "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45819888 // usmmla z8.s, z4.b, z1.b\n"
- ".inst 0x45819870 // usmmla z16.s, z3.b, z1.b\n"
- ".inst 0x45819858 // usmmla z24.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4580988c // usmmla z12.s, z4.b, z0.b\n"
- ".inst 0x45809874 // usmmla z20.s, z3.b, z0.b\n"
- ".inst 0x4580985c // usmmla z28.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45819889 // usmmla z9.s, z4.b, z1.b\n"
- ".inst 0x45819871 // usmmla z17.s, z3.b, z1.b\n"
- ".inst 0x45819859 // usmmla z25.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4580988d // usmmla z13.s, z4.b, z0.b\n"
- ".inst 0x45809875 // usmmla z21.s, z3.b, z0.b\n"
- ".inst 0x4580985d // usmmla z29.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4581988a // usmmla z10.s, z4.b, z1.b\n"
- ".inst 0x45819872 // usmmla z18.s, z3.b, z1.b\n"
- ".inst 0x4581985a // usmmla z26.s, z2.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4580988e // usmmla z14.s, z4.b, z0.b\n"
- ".inst 0x45809876 // usmmla z22.s, z3.b, z0.b\n"
- ".inst 0x4580985e // usmmla z30.s, z2.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x4581988b // usmmla z11.s, z4.b, z1.b\n"
- ".inst 0x45819873 // usmmla z19.s, z3.b, z1.b\n"
- ".inst 0x4581985b // usmmla z27.s, z2.b, z1.b\n"
- ".inst 0x4580988f // usmmla z15.s, z4.b, z0.b\n"
- ".inst 0x45809877 // usmmla z23.s, z3.b, z0.b\n"
- ".inst 0x4580985f // usmmla z31.s, z2.b, z0.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x458198c8 // usmmla z8.s, z6.b, z1.b\n"
- ".inst 0x458198f0 // usmmla z16.s, z7.b, z1.b\n"
- ".inst 0x458198b8 // usmmla z24.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x458098cc // usmmla z12.s, z6.b, z0.b\n"
- ".inst 0x458098f4 // usmmla z20.s, z7.b, z0.b\n"
- ".inst 0x458098bc // usmmla z28.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x458198c9 // usmmla z9.s, z6.b, z1.b\n"
- ".inst 0x458198f1 // usmmla z17.s, z7.b, z1.b\n"
- ".inst 0x458198b9 // usmmla z25.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x458098cd // usmmla z13.s, z6.b, z0.b\n"
- ".inst 0x458098f5 // usmmla z21.s, z7.b, z0.b\n"
- ".inst 0x458098bd // usmmla z29.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x458198ca // usmmla z10.s, z6.b, z1.b\n"
- ".inst 0x458198f2 // usmmla z18.s, z7.b, z1.b\n"
- ".inst 0x458198ba // usmmla z26.s, z5.b, z1.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x458098ce // usmmla z14.s, z6.b, z0.b\n"
- ".inst 0x458098f6 // usmmla z22.s, z7.b, z0.b\n"
- ".inst 0x458098be // usmmla z30.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x458198cb // usmmla z11.s, z6.b, z1.b\n"
- ".inst 0x458198f3 // usmmla z19.s, z7.b, z1.b\n"
- ".inst 0x458198bb // usmmla z27.s, z5.b, z1.b\n"
- ".inst 0x458098cf // usmmla z15.s, z6.b, z0.b\n"
- ".inst 0x458098f7 // usmmla z23.s, z7.b, z0.b\n"
- ".inst 0x458098bf // usmmla z31.s, z5.b, z0.b\n"
- "bgt 63b\n"
- "64:" // Height 6: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z2.b }, p5/Z, [x10]\n"
- "subs x27, x27, #0x8\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z6.b }, p0/Z, [x25]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "ld1rqb { z0.b }, p0/Z, [x21]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
- "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x458298e8 // usmmla z8.s, z7.b, z2.b\n"
- ".inst 0x458298d0 // usmmla z16.s, z6.b, z2.b\n"
- ".inst 0x45829898 // usmmla z24.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x458098ec // usmmla z12.s, z7.b, z0.b\n"
- ".inst 0x458098d4 // usmmla z20.s, z6.b, z0.b\n"
- ".inst 0x4580989c // usmmla z28.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x458298e9 // usmmla z9.s, z7.b, z2.b\n"
- ".inst 0x458298d1 // usmmla z17.s, z6.b, z2.b\n"
- ".inst 0x45829899 // usmmla z25.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x458098ed // usmmla z13.s, z7.b, z0.b\n"
- ".inst 0x458098d5 // usmmla z21.s, z6.b, z0.b\n"
- ".inst 0x4580989d // usmmla z29.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x458298ea // usmmla z10.s, z7.b, z2.b\n"
- ".inst 0x458298d2 // usmmla z18.s, z6.b, z2.b\n"
- ".inst 0x4582989a // usmmla z26.s, z4.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x458098ee // usmmla z14.s, z7.b, z0.b\n"
- ".inst 0x458098d6 // usmmla z22.s, z6.b, z0.b\n"
- ".inst 0x4580989e // usmmla z30.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x458298eb // usmmla z11.s, z7.b, z2.b\n"
- ".inst 0x458298d3 // usmmla z19.s, z6.b, z2.b\n"
- ".inst 0x4582989b // usmmla z27.s, z4.b, z2.b\n"
- ".inst 0x458098ef // usmmla z15.s, z7.b, z0.b\n"
- ".inst 0x458098d7 // usmmla z23.s, z6.b, z0.b\n"
- ".inst 0x4580989f // usmmla z31.s, z4.b, z0.b\n"
- "ble 65f\n"
- "ld1b { z2.b }, p5/Z, [x10]\n"
- "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45829828 // usmmla z8.s, z1.b, z2.b\n"
- ".inst 0x45829870 // usmmla z16.s, z3.b, z2.b\n"
- ".inst 0x458298b8 // usmmla z24.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x4580982c // usmmla z12.s, z1.b, z0.b\n"
- ".inst 0x45809874 // usmmla z20.s, z3.b, z0.b\n"
- ".inst 0x458098bc // usmmla z28.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45829829 // usmmla z9.s, z1.b, z2.b\n"
- ".inst 0x45829871 // usmmla z17.s, z3.b, z2.b\n"
- ".inst 0x458298b9 // usmmla z25.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4580982d // usmmla z13.s, z1.b, z0.b\n"
- ".inst 0x45809875 // usmmla z21.s, z3.b, z0.b\n"
- ".inst 0x458098bd // usmmla z29.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4582982a // usmmla z10.s, z1.b, z2.b\n"
- ".inst 0x45829872 // usmmla z18.s, z3.b, z2.b\n"
- ".inst 0x458298ba // usmmla z26.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4580982e // usmmla z14.s, z1.b, z0.b\n"
- ".inst 0x45809876 // usmmla z22.s, z3.b, z0.b\n"
- ".inst 0x458098be // usmmla z30.s, z5.b, z0.b\n"
- "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
- ".inst 0x4582982b // usmmla z11.s, z1.b, z2.b\n"
- ".inst 0x45829873 // usmmla z19.s, z3.b, z2.b\n"
- ".inst 0x458298bb // usmmla z27.s, z5.b, z2.b\n"
- ".inst 0x4580982f // usmmla z15.s, z1.b, z0.b\n"
- ".inst 0x45809877 // usmmla z23.s, z3.b, z0.b\n"
- ".inst 0x458098bf // usmmla z31.s, z5.b, z0.b\n"
- "65:" // Height 6: Multiply loop: multiply skip
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
- "bne 60b\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z0.d, z8.d, z12.d\n"
- "uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z12.d, z9.d, z13.d\n"
- "uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z13.d, z10.d, z14.d\n"
- "uzp2 z10.d, z10.d, z14.d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "uzp1 z14.d, z11.d, z15.d\n"
- "uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z0.s }, p4, [x9]\n"
- "add x21, x22, x20, LSL #2\n"
- "uzp1 z15.d, z16.d, z20.d\n"
- "uzp2 z16.d, z16.d, z20.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
- "uzp1 z20.d, z17.d, z21.d\n"
- "uzp2 z17.d, z17.d, z21.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
- "uzp1 z21.d, z18.d, z22.d\n"
- "uzp2 z18.d, z18.d, z22.d\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "uzp1 z22.d, z19.d, z23.d\n"
- "uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z8.s }, p4, [x24]\n"
- "uzp1 z23.d, z24.d, z28.d\n"
- "uzp2 z24.d, z24.d, z28.d\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
- "uzp1 z28.d, z25.d, z29.d\n"
- "uzp2 z25.d, z25.d, z29.d\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
- "uzp1 z29.d, z26.d, z30.d\n"
- "uzp2 z26.d, z26.d, z30.d\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
- "uzp1 z30.d, z27.d, z31.d\n"
- "uzp2 z27.d, z27.d, z31.d\n"
- "st1w { z15.s }, p4, [x23]\n"
- "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z23.s }, p4, [x21]\n"
- "st1w { z28.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z29.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z30.s }, p1, [x21, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x20]\n"
- "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
- "66:" // Height 6: Writeback done
- "decw x11, ALL, MUL #4\n"
- "cmp x11, XZR\n"
- "bgt 57b\n"
- "subs %x[M], %x[M], #0x6\n"
- "beq 68f\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 67f\n"
- "add x21, x21, #0x6\n"
- "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "b 1b\n"
- "67:" // Update direct input
- "mov x20, #0x6\n"
- "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
- "b 1b\n"
- "68:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
index 12e99fb526..920fca738c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -71,7 +71,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 4> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
index 3f074fad7d..6a0aeb79b4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
@@ -44,18 +44,18 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -89,7 +89,7 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"beq 11f\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -114,8 +114,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -134,14 +134,14 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"ble 9f\n"
"8:" // Height 1: Multiply loop: Main loop
"udot z8.s, z6.b, z0.b\n"
- "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x26, x26, #0x4\n"
- "subs x27, x27, #0x4\n"
"udot z10.s, z17.b, z0.b\n"
"udot z11.s, z16.b, z0.b\n"
+ "subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -149,14 +149,14 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"9:" // Height 1: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"udot z8.s, z6.b, z0.b\n"
- "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
- "addvl x10, x10, #4\n"
"cmp x28, x20\n"
"udot z10.s, z17.b, z0.b\n"
"udot z11.s, z16.b, z0.b\n"
+ "addvl x10, x10, #4\n"
"bne 5b\n"
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -171,7 +171,7 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"11:" // Height 2
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"12:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -183,11 +183,11 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 13f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x20]\n"
"ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
@@ -206,8 +206,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"15:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -242,8 +242,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z10.s, z17.b, z0.b\n"
"udot z14.s, z17.b, z1.b\n"
"udot z11.s, z16.b, z0.b\n"
- "ld1rw { z0.s }, p4/Z, [x26]\n"
"udot z15.s, z16.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -257,18 +257,18 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z13.s, z7.b, z1.b\n"
"ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
- "addvl x10, x10, #4\n"
"cmp x28, x20\n"
"udot z10.s, z17.b, z0.b\n"
"udot z14.s, z17.b, z1.b\n"
+ "addvl x10, x10, #4\n"
"udot z11.s, z16.b, z0.b\n"
"udot z15.s, z16.b, z1.b\n"
"bne 15b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p3, [x20]\n"
@@ -283,7 +283,7 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"21:" // Height 3
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"22:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -295,12 +295,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 23f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x21]\n"
"ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
@@ -327,8 +327,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"25:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 26f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -359,8 +359,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"add x26, x26, #0x4\n"
"subs x27, x27, #0x4\n"
"udot z16.s, z6.b, z2.b\n"
- "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x4\n"
"udot z13.s, z7.b, z1.b\n"
"udot z17.s, z7.b, z2.b\n"
@@ -372,11 +372,11 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z18.s, z21.b, z2.b\n"
"udot z11.s, z20.b, z0.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
"udot z15.s, z20.b, z1.b\n"
- "ld1rw { z1.s }, p4/Z, [x25]\n"
"udot z19.s, z20.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
- "ld1b { z6.b }, p4/Z, [x10]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 28b\n"
"29:" // Height 3: Multiply loop: Main loop skip
@@ -385,13 +385,13 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z12.s, z6.b, z1.b\n"
"add x28, x28, #0x1\n"
"udot z16.s, z6.b, z2.b\n"
- "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
"udot z13.s, z7.b, z1.b\n"
"udot z17.s, z7.b, z2.b\n"
"ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "cmp x28, x20\n"
"udot z10.s, z21.b, z0.b\n"
"udot z14.s, z21.b, z1.b\n"
"udot z18.s, z21.b, z2.b\n"
@@ -400,11 +400,11 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z19.s, z20.b, z2.b\n"
"bne 25b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p3, [x21]\n"
@@ -423,7 +423,7 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"31:" // Height 4
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"32:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -435,13 +435,13 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x22]\n"
"ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
@@ -476,8 +476,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"35:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -527,6 +527,7 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z14.s, z25.b, z1.b\n"
"udot z18.s, z25.b, z2.b\n"
"udot z22.s, z25.b, z3.b\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
"udot z11.s, z24.b, z0.b\n"
"udot z15.s, z24.b, z1.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -535,7 +536,6 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z23.s, z24.b, z3.b\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
- "ld1b { z6.b }, p4/Z, [x10]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 38b\n"
"39:" // Height 4: Multiply loop: Main loop skip
@@ -546,15 +546,15 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z16.s, z6.b, z2.b\n"
"udot z20.s, z6.b, z3.b\n"
"ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
"udot z9.s, z7.b, z0.b\n"
"udot z13.s, z7.b, z1.b\n"
"udot z17.s, z7.b, z2.b\n"
"udot z21.s, z7.b, z3.b\n"
"ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
- "cmp x28, x20\n"
+ "addvl x10, x10, #4\n"
"udot z10.s, z25.b, z0.b\n"
"udot z14.s, z25.b, z1.b\n"
- "addvl x10, x10, #4\n"
"udot z18.s, z25.b, z2.b\n"
"udot z22.s, z25.b, z3.b\n"
"udot z11.s, z24.b, z0.b\n"
@@ -563,12 +563,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z23.s, z24.b, z3.b\n"
"bne 35b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p3, [x9]\n"
- "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"add x22, x9, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
"add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p3, [x22]\n"
@@ -591,7 +591,7 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"41:" // Height 5
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"42:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -603,16 +603,16 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x23, x9, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x23]\n"
"ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
"ld1w { z16.s }, p3/Z, [x22]\n"
@@ -653,8 +653,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"45:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -697,8 +697,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
"udot z24.s, z6.b, z4.b\n"
- "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x23, x23, #0x4\n"
"udot z13.s, z7.b, z1.b\n"
"udot z17.s, z7.b, z2.b\n"
@@ -716,12 +716,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
"udot z15.s, z28.b, z1.b\n"
- "ld1rw { z1.s }, p4/Z, [x25]\n"
"udot z19.s, z28.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"udot z23.s, z28.b, z3.b\n"
- "ld1rw { z3.s }, p4/Z, [x23]\n"
"udot z27.s, z28.b, z4.b\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1rw { z4.s }, p4/Z, [x22]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
"bgt 48b\n"
@@ -732,12 +732,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"udot z16.s, z6.b, z2.b\n"
"udot z20.s, z6.b, z3.b\n"
+ "cmp x28, x20\n"
"udot z24.s, z6.b, z4.b\n"
- "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"udot z13.s, z7.b, z1.b\n"
"udot z17.s, z7.b, z2.b\n"
- "cmp x28, x20\n"
"udot z21.s, z7.b, z3.b\n"
"udot z25.s, z7.b, z4.b\n"
"ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
@@ -754,15 +754,15 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z27.s, z28.b, z4.b\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p3, [x9]\n"
- "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z12.s }, p3, [x23]\n"
"st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
"st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
@@ -786,12 +786,11 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"b 62f\n"
"51:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"52:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p3.s, x20, x11\n"
@@ -803,17 +802,17 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 53f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"ld1w { z12.s }, p3/Z, [x24]\n"
"ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p3/Z, [x23]\n"
@@ -862,8 +861,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"mov x28, #0x0\n"
"55:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 56f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -951,12 +950,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"udot z16.s, z6.b, z2.b\n"
"udot z20.s, z6.b, z3.b\n"
+ "cmp x28, x20\n"
"udot z24.s, z6.b, z4.b\n"
"udot z28.s, z6.b, z5.b\n"
"ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b\n"
"udot z13.s, z7.b, z1.b\n"
- "cmp x28, x20\n"
"udot z17.s, z7.b, z2.b\n"
"udot z21.s, z7.b, z3.b\n"
"udot z25.s, z7.b, z4.b\n"
@@ -977,17 +976,17 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"udot z31.s, z7.b, z5.b\n"
"bne 55b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p3, [x9]\n"
- "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "add x21, x22, x20, LSL #2\n"
"st1w { z12.s }, p3, [x24]\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
"st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
"st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
@@ -1023,8 +1022,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"62:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
index f2dee3c0bb..03b41dabe4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -44,18 +44,18 @@ void sve_hybrid_u8u32_dot_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -89,7 +89,7 @@ void sve_hybrid_u8u32_dot_6x4VL (
"beq 12f\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -114,8 +114,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov x28, #0x0\n"
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -131,89 +131,89 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 9f\n"
"8:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "udot z8.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "udot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z10.s, z16.b, z0.b[0]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"udot z11.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "udot z8.s, z16.b, z0.b[1]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
- "udot z8.s, z17.b, z0.b[1]\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"udot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z10.s, z16.b, z0.b[1]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "udot z10.s, z17.b, z0.b[1]\n"
"udot z11.s, z16.b, z0.b[1]\n"
"ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
"udot z8.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
"udot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
"udot z10.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
"udot z11.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
"udot z8.s, z17.b, z0.b[3]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
"udot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
"udot z10.s, z17.b, z0.b[3]\n"
"udot z11.s, z16.b, z0.b[3]\n"
+ "add x26, x26, #0x10\n"
"bgt 8b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "udot z8.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "udot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"udot z10.s, z17.b, z0.b[0]\n"
"udot z11.s, z16.b, z0.b[0]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z17.b, z0.b[1]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x4\n"
"udot z10.s, z17.b, z0.b[1]\n"
"udot z11.s, z16.b, z0.b[1]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z17.b, z0.b[2]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x4\n"
"udot z10.s, z17.b, z0.b[2]\n"
"udot z11.s, z16.b, z0.b[2]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z8.s, z17.b, z0.b[3]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"udot z10.s, z17.b, z0.b[3]\n"
"udot z11.s, z16.b, z0.b[3]\n"
+ "addvl x10, x10, #4\n"
"10:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -232,7 +232,7 @@ void sve_hybrid_u8u32_dot_6x4VL (
"12:" // Height 2
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"13:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -244,11 +244,11 @@ void sve_hybrid_u8u32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 14f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x20]\n"
"ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
@@ -267,8 +267,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov x28, #0x0\n"
"16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -287,38 +287,38 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 20f\n"
"19:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z8.s, z17.b, z1.b[0]\n"
"udot z12.s, z17.b, z0.b[0]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z16.b, z1.b[0]\n"
"udot z13.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
"udot z10.s, z17.b, z1.b[0]\n"
"udot z14.s, z17.b, z0.b[0]\n"
"ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "cmp x27, #0x10\n"
"udot z11.s, z16.b, z1.b[0]\n"
"udot z15.s, z16.b, z0.b[0]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
"udot z8.s, z17.b, z1.b[1]\n"
"udot z12.s, z17.b, z0.b[1]\n"
"ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z16.b, z1.b[1]\n"
"udot z13.s, z16.b, z0.b[1]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"udot z10.s, z17.b, z1.b[1]\n"
"udot z14.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
"udot z11.s, z16.b, z1.b[1]\n"
"udot z15.s, z16.b, z0.b[1]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
"udot z8.s, z17.b, z1.b[2]\n"
"udot z12.s, z17.b, z0.b[2]\n"
@@ -345,50 +345,50 @@ void sve_hybrid_u8u32_dot_6x4VL (
"bgt 19b\n"
"20:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z8.s, z17.b, z0.b[0]\n"
"udot z12.s, z17.b, z1.b[0]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z16.b, z0.b[0]\n"
"udot z13.s, z16.b, z1.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"udot z10.s, z17.b, z0.b[0]\n"
"udot z14.s, z17.b, z1.b[0]\n"
+ "addvl x10, x10, #4\n"
"udot z11.s, z16.b, z0.b[0]\n"
"udot z15.s, z16.b, z1.b[0]\n"
"ble 21f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z17.b, z0.b[1]\n"
"udot z12.s, z17.b, z1.b[1]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z16.b, z0.b[1]\n"
"udot z13.s, z16.b, z1.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x4\n"
"udot z10.s, z17.b, z0.b[1]\n"
"udot z14.s, z17.b, z1.b[1]\n"
+ "addvl x10, x10, #4\n"
"udot z11.s, z16.b, z0.b[1]\n"
"udot z15.s, z16.b, z1.b[1]\n"
"ble 21f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z17.b, z0.b[2]\n"
"udot z12.s, z17.b, z1.b[2]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z16.b, z0.b[2]\n"
"udot z13.s, z16.b, z1.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x4\n"
"udot z10.s, z17.b, z0.b[2]\n"
"udot z14.s, z17.b, z1.b[2]\n"
+ "addvl x10, x10, #4\n"
"udot z11.s, z16.b, z0.b[2]\n"
"udot z15.s, z16.b, z1.b[2]\n"
"ble 21f\n"
@@ -396,13 +396,13 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z8.s, z17.b, z0.b[3]\n"
"udot z12.s, z17.b, z1.b[3]\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z16.b, z0.b[3]\n"
"udot z13.s, z16.b, z1.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
"udot z10.s, z17.b, z0.b[3]\n"
"udot z14.s, z17.b, z1.b[3]\n"
+ "addvl x10, x10, #4\n"
"udot z11.s, z16.b, z0.b[3]\n"
"udot z15.s, z16.b, z1.b[3]\n"
"21:" // Height 2: Multiply loop: multiply skip
@@ -411,10 +411,10 @@ void sve_hybrid_u8u32_dot_6x4VL (
"cmp x28, x20\n"
"bne 16b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p4, [x20]\n"
@@ -429,7 +429,7 @@ void sve_hybrid_u8u32_dot_6x4VL (
"23:" // Height 3
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"24:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -441,12 +441,12 @@ void sve_hybrid_u8u32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 25f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x21]\n"
"ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
@@ -473,8 +473,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov x28, #0x0\n"
"27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 28f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -496,37 +496,37 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 31f\n"
"30:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z21.b }, p5/Z, [x10]\n"
- "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
"udot z8.s, z21.b, z2.b[0]\n"
"udot z12.s, z21.b, z1.b[0]\n"
- "udot z9.s, z20.b, z2.b[0]\n"
- "udot z13.s, z20.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z16.s, z21.b, z0.b[0]\n"
+ "udot z9.s, z20.b, z2.b[0]\n"
"ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[0]\n"
"udot z17.s, z20.b, z0.b[0]\n"
"ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x27, #0x10\n"
"udot z10.s, z21.b, z2.b[0]\n"
"udot z14.s, z21.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z18.s, z21.b, z0.b[0]\n"
- "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
"udot z11.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
"udot z15.s, z20.b, z1.b[0]\n"
"udot z19.s, z20.b, z0.b[0]\n"
"ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n"
"udot z8.s, z21.b, z2.b[1]\n"
"udot z12.s, z21.b, z1.b[1]\n"
"udot z16.s, z21.b, z0.b[1]\n"
- "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
"udot z9.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
"udot z13.s, z20.b, z1.b[1]\n"
"udot z17.s, z20.b, z0.b[1]\n"
"ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n"
@@ -535,31 +535,31 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z14.s, z21.b, z1.b[1]\n"
"udot z18.s, z21.b, z0.b[1]\n"
"udot z11.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
"udot z15.s, z20.b, z1.b[1]\n"
"udot z19.s, z20.b, z0.b[1]\n"
- "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n"
"udot z8.s, z21.b, z2.b[2]\n"
"udot z12.s, z21.b, z1.b[2]\n"
"udot z16.s, z21.b, z0.b[2]\n"
- "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
"udot z9.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
"udot z13.s, z20.b, z1.b[2]\n"
"udot z17.s, z20.b, z0.b[2]\n"
"ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n"
"udot z10.s, z21.b, z2.b[2]\n"
"udot z14.s, z21.b, z1.b[2]\n"
"udot z18.s, z21.b, z0.b[2]\n"
- "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
"udot z11.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
"udot z15.s, z20.b, z1.b[2]\n"
"udot z19.s, z20.b, z0.b[2]\n"
"ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n"
"udot z8.s, z21.b, z2.b[3]\n"
"udot z12.s, z21.b, z1.b[3]\n"
"udot z16.s, z21.b, z0.b[3]\n"
- "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
"udot z9.s, z20.b, z2.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
"udot z13.s, z20.b, z1.b[3]\n"
"udot z17.s, z20.b, z0.b[3]\n"
"ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n"
@@ -572,18 +572,18 @@ void sve_hybrid_u8u32_dot_6x4VL (
"bgt 30b\n"
"31:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z21.b }, p5/Z, [x10]\n"
- "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
"udot z8.s, z21.b, z0.b[0]\n"
"udot z12.s, z21.b, z1.b[0]\n"
- "udot z9.s, z20.b, z0.b[0]\n"
- "udot z13.s, z20.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z16.s, z21.b, z2.b[0]\n"
+ "udot z9.s, z20.b, z0.b[0]\n"
"ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[0]\n"
"udot z17.s, z20.b, z2.b[0]\n"
"ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -596,12 +596,12 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 32f\n"
"ld1b { z21.b }, p5/Z, [x10]\n"
"ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z21.b, z0.b[1]\n"
"udot z12.s, z21.b, z1.b[1]\n"
"udot z16.s, z21.b, z2.b[1]\n"
- "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"udot z13.s, z20.b, z1.b[1]\n"
"udot z17.s, z20.b, z2.b[1]\n"
"ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -615,12 +615,12 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 32f\n"
"ld1b { z21.b }, p5/Z, [x10]\n"
"ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z21.b, z0.b[2]\n"
"udot z12.s, z21.b, z1.b[2]\n"
"udot z16.s, z21.b, z2.b[2]\n"
- "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"udot z13.s, z20.b, z1.b[2]\n"
"udot z17.s, z20.b, z2.b[2]\n"
"ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -637,8 +637,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z8.s, z21.b, z0.b[3]\n"
"udot z12.s, z21.b, z1.b[3]\n"
"udot z16.s, z21.b, z2.b[3]\n"
- "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z13.s, z20.b, z1.b[3]\n"
"udot z17.s, z20.b, z2.b[3]\n"
"ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -655,11 +655,11 @@ void sve_hybrid_u8u32_dot_6x4VL (
"cmp x28, x20\n"
"bne 27b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p4, [x21]\n"
@@ -678,7 +678,7 @@ void sve_hybrid_u8u32_dot_6x4VL (
"34:" // Height 4
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"35:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -690,13 +690,13 @@ void sve_hybrid_u8u32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x22]\n"
"ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
@@ -731,8 +731,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov x28, #0x0\n"
"38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 39f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -757,25 +757,25 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 42f\n"
"41:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z8.s, z25.b, z3.b[0]\n"
"udot z12.s, z25.b, z2.b[0]\n"
- "udot z9.s, z24.b, z3.b[0]\n"
- "udot z13.s, z24.b, z2.b[0]\n"
"udot z16.s, z25.b, z1.b[0]\n"
"udot z20.s, z25.b, z0.b[0]\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ "udot z9.s, z24.b, z3.b[0]\n"
+ "udot z13.s, z24.b, z2.b[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"udot z17.s, z24.b, z1.b[0]\n"
"udot z21.s, z24.b, z0.b[0]\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -804,9 +804,9 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z14.s, z25.b, z2.b[1]\n"
"udot z18.s, z25.b, z1.b[1]\n"
"udot z22.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
"udot z11.s, z24.b, z3.b[1]\n"
"udot z15.s, z24.b, z2.b[1]\n"
- "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
"udot z19.s, z24.b, z1.b[1]\n"
"udot z23.s, z24.b, z0.b[1]\n"
"ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
@@ -851,20 +851,20 @@ void sve_hybrid_u8u32_dot_6x4VL (
"bgt 41b\n"
"42:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z8.s, z25.b, z0.b[0]\n"
"udot z12.s, z25.b, z1.b[0]\n"
- "udot z9.s, z24.b, z0.b[0]\n"
- "udot z13.s, z24.b, z1.b[0]\n"
"udot z16.s, z25.b, z2.b[0]\n"
"udot z20.s, z25.b, z3.b[0]\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z24.b, z0.b[0]\n"
+ "udot z13.s, z24.b, z1.b[0]\n"
"udot z17.s, z24.b, z2.b[0]\n"
"udot z21.s, z24.b, z3.b[0]\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -880,12 +880,12 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 43f\n"
"ld1b { z25.b }, p5/Z, [x10]\n"
"ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z25.b, z0.b[1]\n"
"udot z12.s, z25.b, z1.b[1]\n"
"udot z16.s, z25.b, z2.b[1]\n"
"udot z20.s, z25.b, z3.b[1]\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"udot z9.s, z24.b, z0.b[1]\n"
"udot z13.s, z24.b, z1.b[1]\n"
"udot z17.s, z24.b, z2.b[1]\n"
@@ -903,12 +903,12 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 43f\n"
"ld1b { z25.b }, p5/Z, [x10]\n"
"ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z25.b, z0.b[2]\n"
"udot z12.s, z25.b, z1.b[2]\n"
"udot z16.s, z25.b, z2.b[2]\n"
"udot z20.s, z25.b, z3.b[2]\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
"udot z9.s, z24.b, z0.b[2]\n"
"udot z13.s, z24.b, z1.b[2]\n"
"udot z17.s, z24.b, z2.b[2]\n"
@@ -951,12 +951,12 @@ void sve_hybrid_u8u32_dot_6x4VL (
"cmp x28, x20\n"
"bne 38b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x9]\n"
- "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"add x22, x9, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
"add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z12.s }, p4, [x22]\n"
@@ -979,7 +979,7 @@ void sve_hybrid_u8u32_dot_6x4VL (
"45:" // Height 5
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"46:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -991,16 +991,16 @@ void sve_hybrid_u8u32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x23, x9, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x22]\n"
@@ -1041,8 +1041,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov x28, #0x0\n"
"49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1070,29 +1070,29 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 53f\n"
"52:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z29.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z4.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"ld1rqb { z0.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
"udot z8.s, z29.b, z4.b[0]\n"
"udot z12.s, z29.b, z3.b[0]\n"
- "udot z9.s, z28.b, z4.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z16.s, z29.b, z2.b[0]\n"
"udot z20.s, z29.b, z1.b[0]\n"
+ "add x25, x25, #0x10\n"
"udot z24.s, z29.b, z0.b[0]\n"
- "udot z13.s, z28.b, z3.b[0]\n"
+ "udot z9.s, z28.b, z4.b[0]\n"
"ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z28.b, z3.b[0]\n"
"udot z17.s, z28.b, z2.b[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"udot z21.s, z28.b, z1.b[0]\n"
"udot z25.s, z28.b, z0.b[0]\n"
"ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1101,8 +1101,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z18.s, z29.b, z2.b[0]\n"
"udot z22.s, z29.b, z1.b[0]\n"
"udot z26.s, z29.b, z0.b[0]\n"
- "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
"udot z11.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
"udot z15.s, z28.b, z3.b[0]\n"
"udot z19.s, z28.b, z2.b[0]\n"
"udot z23.s, z28.b, z1.b[0]\n"
@@ -1113,8 +1113,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z16.s, z29.b, z2.b[1]\n"
"udot z20.s, z29.b, z1.b[1]\n"
"udot z24.s, z29.b, z0.b[1]\n"
- "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
"udot z9.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
"udot z13.s, z28.b, z3.b[1]\n"
"udot z17.s, z28.b, z2.b[1]\n"
"udot z21.s, z28.b, z1.b[1]\n"
@@ -1127,8 +1127,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z22.s, z29.b, z1.b[1]\n"
"udot z26.s, z29.b, z0.b[1]\n"
"udot z11.s, z28.b, z4.b[1]\n"
- "udot z15.s, z28.b, z3.b[1]\n"
"ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z15.s, z28.b, z3.b[1]\n"
"udot z19.s, z28.b, z2.b[1]\n"
"udot z23.s, z28.b, z1.b[1]\n"
"udot z27.s, z28.b, z0.b[1]\n"
@@ -1138,8 +1138,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z16.s, z29.b, z2.b[2]\n"
"udot z20.s, z29.b, z1.b[2]\n"
"udot z24.s, z29.b, z0.b[2]\n"
- "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
"udot z9.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
"udot z13.s, z28.b, z3.b[2]\n"
"udot z17.s, z28.b, z2.b[2]\n"
"udot z21.s, z28.b, z1.b[2]\n"
@@ -1150,8 +1150,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z18.s, z29.b, z2.b[2]\n"
"udot z22.s, z29.b, z1.b[2]\n"
"udot z26.s, z29.b, z0.b[2]\n"
- "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
"udot z11.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
"udot z15.s, z28.b, z3.b[2]\n"
"udot z19.s, z28.b, z2.b[2]\n"
"udot z23.s, z28.b, z1.b[2]\n"
@@ -1162,8 +1162,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z16.s, z29.b, z2.b[3]\n"
"udot z20.s, z29.b, z1.b[3]\n"
"udot z24.s, z29.b, z0.b[3]\n"
- "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
"udot z9.s, z28.b, z4.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
"udot z13.s, z28.b, z3.b[3]\n"
"udot z17.s, z28.b, z2.b[3]\n"
"udot z21.s, z28.b, z1.b[3]\n"
@@ -1182,23 +1182,23 @@ void sve_hybrid_u8u32_dot_6x4VL (
"bgt 52b\n"
"53:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z29.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
"udot z8.s, z29.b, z0.b[0]\n"
"udot z12.s, z29.b, z1.b[0]\n"
- "udot z9.s, z28.b, z0.b[0]\n"
- "udot z13.s, z28.b, z1.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z16.s, z29.b, z2.b[0]\n"
"udot z20.s, z29.b, z3.b[0]\n"
"udot z24.s, z29.b, z4.b[0]\n"
- "udot z17.s, z28.b, z2.b[0]\n"
+ "udot z9.s, z28.b, z0.b[0]\n"
"ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z28.b, z1.b[0]\n"
+ "udot z17.s, z28.b, z2.b[0]\n"
"udot z21.s, z28.b, z3.b[0]\n"
"udot z25.s, z28.b, z4.b[0]\n"
"ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1216,21 +1216,21 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 54f\n"
"ld1b { z29.b }, p5/Z, [x10]\n"
"ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z29.b, z0.b[1]\n"
"udot z12.s, z29.b, z1.b[1]\n"
"udot z16.s, z29.b, z2.b[1]\n"
"udot z20.s, z29.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
"udot z24.s, z29.b, z4.b[1]\n"
- "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z28.b, z0.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z13.s, z28.b, z1.b[1]\n"
"udot z17.s, z28.b, z2.b[1]\n"
"udot z21.s, z28.b, z3.b[1]\n"
"udot z25.s, z28.b, z4.b[1]\n"
"ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z29.b, z0.b[1]\n"
"addvl x10, x10, #4\n"
+ "udot z10.s, z29.b, z0.b[1]\n"
"udot z14.s, z29.b, z1.b[1]\n"
"udot z18.s, z29.b, z2.b[1]\n"
"udot z22.s, z29.b, z3.b[1]\n"
@@ -1243,21 +1243,21 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 54f\n"
"ld1b { z29.b }, p5/Z, [x10]\n"
"ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z29.b, z0.b[2]\n"
"udot z12.s, z29.b, z1.b[2]\n"
"udot z16.s, z29.b, z2.b[2]\n"
"udot z20.s, z29.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
"udot z24.s, z29.b, z4.b[2]\n"
- "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z28.b, z0.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z13.s, z28.b, z1.b[2]\n"
"udot z17.s, z28.b, z2.b[2]\n"
"udot z21.s, z28.b, z3.b[2]\n"
"udot z25.s, z28.b, z4.b[2]\n"
"ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z29.b, z0.b[2]\n"
"addvl x10, x10, #4\n"
+ "udot z10.s, z29.b, z0.b[2]\n"
"udot z14.s, z29.b, z1.b[2]\n"
"udot z18.s, z29.b, z2.b[2]\n"
"udot z22.s, z29.b, z3.b[2]\n"
@@ -1275,8 +1275,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z16.s, z29.b, z2.b[3]\n"
"udot z20.s, z29.b, z3.b[3]\n"
"udot z24.s, z29.b, z4.b[3]\n"
- "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z28.b, z0.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"udot z13.s, z28.b, z1.b[3]\n"
"udot z17.s, z28.b, z2.b[3]\n"
"udot z21.s, z28.b, z3.b[3]\n"
@@ -1299,15 +1299,15 @@ void sve_hybrid_u8u32_dot_6x4VL (
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x9]\n"
- "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z12.s }, p4, [x23]\n"
"st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
"st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
@@ -1331,12 +1331,11 @@ void sve_hybrid_u8u32_dot_6x4VL (
"b 68f\n"
"56:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"57:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1348,17 +1347,17 @@ void sve_hybrid_u8u32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 58f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x24]\n"
"ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x23]\n"
@@ -1407,8 +1406,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov x28, #0x0\n"
"60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1439,29 +1438,29 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 64f\n"
"63:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p5/Z, [x10]\n"
- "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z7.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
"ld1rqb { z6.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z5.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
"ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
"ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z8.s, z1.b, z7.b[0]\n"
"udot z12.s, z1.b, z6.b[0]\n"
- "add x21, x21, #0x10\n"
"udot z16.s, z1.b, z5.b[0]\n"
"udot z20.s, z1.b, z4.b[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"udot z24.s, z1.b, z3.b[0]\n"
"udot z28.s, z1.b, z2.b[0]\n"
"ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
"udot z9.s, z0.b, z7.b[0]\n"
"udot z13.s, z0.b, z6.b[0]\n"
"udot z17.s, z0.b, z5.b[0]\n"
@@ -1569,24 +1568,24 @@ void sve_hybrid_u8u32_dot_6x4VL (
"bgt 63b\n"
"64:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
"ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
"udot z8.s, z7.b, z0.b[0]\n"
"udot z12.s, z7.b, z1.b[0]\n"
- "udot z9.s, z6.b, z0.b[0]\n"
- "udot z13.s, z6.b, z1.b[0]\n"
"udot z16.s, z7.b, z2.b[0]\n"
"udot z20.s, z7.b, z3.b[0]\n"
"udot z24.s, z7.b, z4.b[0]\n"
"udot z28.s, z7.b, z5.b[0]\n"
"ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[0]\n"
+ "udot z13.s, z6.b, z1.b[0]\n"
"udot z17.s, z6.b, z2.b[0]\n"
"udot z21.s, z6.b, z3.b[0]\n"
"udot z25.s, z6.b, z4.b[0]\n"
@@ -1608,23 +1607,23 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 65f\n"
"ld1b { z7.b }, p5/Z, [x10]\n"
"ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z7.b, z0.b[1]\n"
"udot z12.s, z7.b, z1.b[1]\n"
"udot z16.s, z7.b, z2.b[1]\n"
"udot z20.s, z7.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
"udot z24.s, z7.b, z4.b[1]\n"
"udot z28.s, z7.b, z5.b[1]\n"
- "udot z9.s, z6.b, z0.b[1]\n"
"ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[1]\n"
"udot z13.s, z6.b, z1.b[1]\n"
"udot z17.s, z6.b, z2.b[1]\n"
"udot z21.s, z6.b, z3.b[1]\n"
"udot z25.s, z6.b, z4.b[1]\n"
"udot z29.s, z6.b, z5.b[1]\n"
"ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z7.b, z0.b[1]\n"
"addvl x10, x10, #4\n"
+ "udot z10.s, z7.b, z0.b[1]\n"
"udot z14.s, z7.b, z1.b[1]\n"
"udot z18.s, z7.b, z2.b[1]\n"
"udot z22.s, z7.b, z3.b[1]\n"
@@ -1639,23 +1638,23 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ble 65f\n"
"ld1b { z7.b }, p5/Z, [x10]\n"
"ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
"udot z8.s, z7.b, z0.b[2]\n"
"udot z12.s, z7.b, z1.b[2]\n"
"udot z16.s, z7.b, z2.b[2]\n"
"udot z20.s, z7.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
"udot z24.s, z7.b, z4.b[2]\n"
"udot z28.s, z7.b, z5.b[2]\n"
- "udot z9.s, z6.b, z0.b[2]\n"
"ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[2]\n"
"udot z13.s, z6.b, z1.b[2]\n"
"udot z17.s, z6.b, z2.b[2]\n"
"udot z21.s, z6.b, z3.b[2]\n"
"udot z25.s, z6.b, z4.b[2]\n"
"udot z29.s, z6.b, z5.b[2]\n"
"ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z7.b, z0.b[2]\n"
"addvl x10, x10, #4\n"
+ "udot z10.s, z7.b, z0.b[2]\n"
"udot z14.s, z7.b, z1.b[2]\n"
"udot z18.s, z7.b, z2.b[2]\n"
"udot z22.s, z7.b, z3.b[2]\n"
@@ -1703,17 +1702,17 @@ void sve_hybrid_u8u32_dot_6x4VL (
"cmp x28, x20\n"
"bne 60b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x9]\n"
- "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "add x21, x22, x20, LSL #2\n"
"st1w { z12.s }, p4, [x24]\n"
- "add x20, x21, x20, LSL #2\n"
"st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
"st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
"st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
@@ -1749,8 +1748,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"68:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
index 8437b0ea48..2c4eaaab4a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
@@ -70,7 +70,7 @@ public:
return true;
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 8, 8> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 8> transforms = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
index 898ff3a235..32cb778de4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
@@ -44,18 +44,18 @@ void sve_hybrid_u8u32_mmla_6x4VL (
size_t output_offset = {};
size_t input_initial_col = {};
size_t input_offset = {};
- void *output_ptr = {};
} ka;
unsigned long flags=0;
+ void *output_ptr;
void *input_ptr;
if (output_arg.is_indirect) {
- ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ output_ptr=(void *)(output_arg.indirect.ptr);
ka.output_offset=output_arg.indirect.offset;
flags |= 0x4;
} else {
- ka.output_ptr=(void *)(output_arg.direct.base);
+ output_ptr=(void *)(output_arg.direct.base);
ka.output_offset=output_arg.direct.stride;
}
@@ -89,7 +89,7 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"beq 12f\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -100,14 +100,14 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"incw x20\n"
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 3f\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z19.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "zip1 z9.d, z19.d, z13.d\n"
- "zip2 z13.d, z19.d, z13.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
"zip1 z10.d, z17.d, z14.d\n"
"zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
@@ -126,8 +126,8 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"mov x28, #0x0\n"
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -143,87 +143,87 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ble 9f\n"
"8:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z16.b }, p5/Z, [x10]\n"
- "ld1b { z17.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z19.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "trn1 z18.d, z19.d, z20.d\n"
- "trn2 z19.d, z19.d, z20.d\n"
- ".inst 0x45d09a48 // ummla z8.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45d19a4c // ummla z12.s, z18.b, z17.b\n"
- "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45d09a49 // ummla z9.s, z18.b, z16.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45d49a4d // ummla z13.s, z18.b, z20.b\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c19a4a // ummla z10.s, z18.b, z1.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z17.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45d09a68 // ummla z8.s, z19.b, z16.b\n"
- "ld1b { z16.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45d19a6c // ummla z12.s, z19.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45d09a69 // ummla z9.s, z19.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45d19a88 // ummla z8.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8c // ummla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45d19a89 // ummla z9.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8d // ummla z13.s, z20.b, z16.b\n"
"ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45d19a6d // ummla z13.s, z19.b, z17.b\n"
- "ld1b { z3.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45d09a6a // ummla z10.s, z19.b, z16.b\n"
+ "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45d09a8a // ummla z10.s, z20.b, z16.b\n"
+ ".inst 0x45c79a8e // ummla z14.s, z20.b, z7.b\n"
"ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45c39a6e // ummla z14.s, z19.b, z3.b\n"
"ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45d19a6b // ummla z11.s, z19.b, z17.b\n"
- ".inst 0x45d09a6f // ummla z15.s, z19.b, z16.b\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45d19a8b // ummla z11.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8f // ummla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
"bgt 8b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
"trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z19.d\n"
".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
+ "addvl x10, x10, #8\n"
"ble 10f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45d19828 // ummla z8.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45d0982c // ummla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45d19829 // ummla z9.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45d0982d // ummla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x45d1982a // ummla z10.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45d0982e // ummla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x45d1982b // ummla z11.s, z1.b, z17.b\n"
".inst 0x45d0982f // ummla z15.s, z1.b, z16.b\n"
+ "addvl x10, x10, #8\n"
"10:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -231,9 +231,9 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"bne 5b\n"
"uzp1 z8.d, z8.d, z12.d\n"
"uzp1 z9.d, z9.d, z13.d\n"
+ "st1w { z8.s }, p4, [x9]\n"
"uzp1 z10.d, z10.d, z14.d\n"
"uzp1 z11.d, z11.d, z15.d\n"
- "st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
@@ -246,7 +246,7 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"12:" // Height 2
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"13:" // Height 2: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -258,19 +258,19 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 14f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z18.s }, p4/Z, [x9]\n"
- "ld1w { z24.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "add x20, x9, x20, LSL #2\n"
"ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z18.d, z12.d\n"
+ "zip2 z12.d, z18.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z2.d, z13.d\n"
+ "zip2 z13.d, z2.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z8.d, z18.d, z12.d\n"
- "zip2 z12.d, z18.d, z12.d\n"
- "zip1 z9.d, z24.d, z13.d\n"
- "zip2 z13.d, z24.d, z13.d\n"
"zip1 z10.d, z17.d, z14.d\n"
"zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
@@ -289,8 +289,8 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"mov x28, #0x0\n"
"16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -309,109 +309,109 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ble 20f\n"
"19:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z19.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "trn1 z18.d, z19.d, z25.d\n"
- "trn2 z19.d, z19.d, z25.d\n"
".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
"ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45d19a68 // ummla z8.s, z19.b, z17.b\n"
+ ".inst 0x45d19a88 // ummla z8.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8c // ummla z12.s, z20.b, z16.b\n"
"ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45d09a6c // ummla z12.s, z19.b, z16.b\n"
"ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45d19a69 // ummla z9.s, z19.b, z17.b\n"
+ ".inst 0x45d19a89 // ummla z9.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8d // ummla z13.s, z20.b, z16.b\n"
"ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45d09a6d // ummla z13.s, z19.b, z16.b\n"
"ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45d19a6a // ummla z10.s, z19.b, z17.b\n"
+ ".inst 0x45d19a8a // ummla z10.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8e // ummla z14.s, z20.b, z16.b\n"
"ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45d09a6e // ummla z14.s, z19.b, z16.b\n"
"ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45d19a6b // ummla z11.s, z19.b, z17.b\n"
- ".inst 0x45d09a6f // ummla z15.s, z19.b, z16.b\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45d19a8b // ummla z11.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8f // ummla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"bgt 19b\n"
"20:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z17.b }, p5/Z, [x10]\n"
- "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
"ld1rqb { z19.b }, p0/Z, [x25]\n"
"trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z19.d\n"
".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
+ "addvl x10, x10, #8\n"
"ble 21f\n"
"ld1b { z17.b }, p5/Z, [x10]\n"
"ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45d19828 // ummla z8.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45d0982c // ummla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45d19829 // ummla z9.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45d0982d // ummla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x45d1982a // ummla z10.s, z1.b, z17.b\n"
- "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45d0982e // ummla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x45d1982b // ummla z11.s, z1.b, z17.b\n"
".inst 0x45d0982f // ummla z15.s, z1.b, z16.b\n"
+ "addvl x10, x10, #8\n"
"21:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 16b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z17.d, z8.d, z12.d\n"
+ "add x20, x9, x20, LSL #2\n"
+ "uzp1 z16.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z16.d, z9.d, z13.d\n"
+ "uzp1 z17.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z12.d, z10.d, z14.d\n"
+ "st1w { z16.s }, p4, [x9]\n"
+ "uzp1 z16.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x20, x9, x20, LSL #2\n"
- "uzp1 z26.d, z11.d, z15.d\n"
+ "st1w { z17.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z2.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z17.s }, p4, [x9]\n"
- "st1w { z16.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z12.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z26.s }, p1, [x9, #3, MUL VL]\n"
+ "st1w { z16.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"st1w { z8.s }, p4, [x20]\n"
"st1w { z9.s }, p3, [x20, #1, MUL VL]\n"
@@ -425,7 +425,7 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"23:" // Height 3
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"24:" // Height 3: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -437,28 +437,28 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 25f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p4/Z, [x9]\n"
- "ld1w { z26.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x21, x9, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
- "zip1 z8.d, z24.d, z12.d\n"
- "zip2 z12.d, z24.d, z12.d\n"
- "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z9.d, z26.d, z13.d\n"
- "zip2 z13.d, z26.d, z13.d\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
@@ -489,8 +489,8 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"mov x28, #0x0\n"
"27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 28f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -512,92 +512,92 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ble 31f\n"
"30:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z27.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
"ld1rqb { z24.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqb { z26.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "trn1 z6.d, z27.d, z24.d\n"
- "trn2 z27.d, z27.d, z24.d\n"
- "trn1 z30.d, z26.d, z29.d\n"
- "trn2 z26.d, z26.d, z29.d\n"
- ".inst 0x45d998c8 // ummla z8.s, z6.b, z25.b\n"
- ".inst 0x45dc98cc // ummla z12.s, z6.b, z28.b\n"
- ".inst 0x45d99bd0 // ummla z16.s, z30.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45dc9bd4 // ummla z20.s, z30.b, z28.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45d998c9 // ummla z9.s, z6.b, z25.b\n"
- ".inst 0x45d99bd1 // ummla z17.s, z30.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45d898cd // ummla z13.s, z6.b, z24.b\n"
- ".inst 0x45d89bd5 // ummla z21.s, z30.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45d998ca // ummla z10.s, z6.b, z25.b\n"
- ".inst 0x45d99bd2 // ummla z18.s, z30.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45d898ce // ummla z14.s, z6.b, z24.b\n"
- ".inst 0x45d89bd6 // ummla z22.s, z30.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- ".inst 0x45d998cb // ummla z11.s, z6.b, z25.b\n"
- ".inst 0x45d99bd3 // ummla z19.s, z30.b, z25.b\n"
- ".inst 0x45d898cf // ummla z15.s, z6.b, z24.b\n"
- ".inst 0x45d89bd7 // ummla z23.s, z30.b, z24.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45d99b68 // ummla z8.s, z27.b, z25.b\n"
".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x45d89b6c // ummla z12.s, z27.b, z24.b\n"
".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45d99b69 // ummla z9.s, z27.b, z25.b\n"
".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
".inst 0x45d89b6d // ummla z13.s, z27.b, z24.b\n"
".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x45d99b6a // ummla z10.s, z27.b, z25.b\n"
".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "cmp x27, #0x10\n"
".inst 0x45d89b6e // ummla z14.s, z27.b, z24.b\n"
".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
- "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
".inst 0x45d99b6b // ummla z11.s, z27.b, z25.b\n"
".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
".inst 0x45d89b6f // ummla z15.s, z27.b, z24.b\n"
".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45d99bc8 // ummla z8.s, z30.b, z25.b\n"
+ ".inst 0x45d99b90 // ummla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45d89bcc // ummla z12.s, z30.b, z24.b\n"
+ ".inst 0x45d89b94 // ummla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45d99bc9 // ummla z9.s, z30.b, z25.b\n"
+ ".inst 0x45d99b91 // ummla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45d89bcd // ummla z13.s, z30.b, z24.b\n"
+ ".inst 0x45d89b95 // ummla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45d99bca // ummla z10.s, z30.b, z25.b\n"
+ ".inst 0x45d99b92 // ummla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45d89bce // ummla z14.s, z30.b, z24.b\n"
+ ".inst 0x45d89b96 // ummla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45d99bcb // ummla z11.s, z30.b, z25.b\n"
+ ".inst 0x45d99b93 // ummla z19.s, z28.b, z25.b\n"
+ ".inst 0x45d89bcf // ummla z15.s, z30.b, z24.b\n"
+ ".inst 0x45d89b97 // ummla z23.s, z28.b, z24.b\n"
"bgt 30b\n"
"31:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z25.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
"ld1rqb { z24.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
"trn1 z27.d, z1.d, z24.d\n"
"trn2 z1.d, z1.d, z24.d\n"
- "trn1 z26.d, z3.d, z29.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45d99b68 // ummla z8.s, z27.b, z25.b\n"
- ".inst 0x45dc9b6c // ummla z12.s, z27.b, z28.b\n"
- "trn2 z3.d, z3.d, z29.d\n"
".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89b6c // ummla z12.s, z27.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45dc9b54 // ummla z20.s, z26.b, z28.b\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45d99b69 // ummla z9.s, z27.b, z25.b\n"
".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
".inst 0x45d89b6d // ummla z13.s, z27.b, z24.b\n"
".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
".inst 0x45d99b6a // ummla z10.s, z27.b, z25.b\n"
".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
@@ -614,9 +614,9 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45d99828 // ummla z8.s, z1.b, z25.b\n"
".inst 0x45d99870 // ummla z16.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45d8982c // ummla z12.s, z1.b, z24.b\n"
".inst 0x45d89874 // ummla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45d99829 // ummla z9.s, z1.b, z25.b\n"
".inst 0x45d99871 // ummla z17.s, z3.b, z25.b\n"
@@ -641,26 +641,26 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 27b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z27.d, z8.d, z12.d\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp1 z25.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z26.d, z9.d, z13.d\n"
+ "uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x21, x9, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
"uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z27.s }, p4, [x9]\n"
"uzp1 z16.d, z16.d, z20.d\n"
- "uzp1 z17.d, z17.d, z21.d\n"
- "st1w { z26.s }, p3, [x9, #1, MUL VL]\n"
- "uzp1 z18.d, z18.d, z22.d\n"
- "uzp1 z19.d, z19.d, z23.d\n"
- "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
"st1w { z8.s }, p4, [x21]\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
"st1w { z9.s }, p3, [x21, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x21, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x21, #3, MUL VL]\n"
@@ -676,7 +676,7 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"34:" // Height 4
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"35:" // Height 4: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -688,37 +688,37 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x22, x9, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x20]\n"
- "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
@@ -745,8 +745,8 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"mov x28, #0x0\n"
"38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 39f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -771,114 +771,114 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ble 42f\n"
"41:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z31.b }, p5/Z, [x10]\n"
- "ld1b { z30.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z29.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
"ld1rqb { z28.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
- "add x23, x23, #0x10\n"
- "trn1 z27.d, z29.d, z25.d\n"
- "trn2 z29.d, z29.d, z25.d\n"
- "trn1 z26.d, z28.d, z24.d\n"
- "trn2 z28.d, z28.d, z24.d\n"
- ".inst 0x45df9b68 // ummla z8.s, z27.b, z31.b\n"
- ".inst 0x45de9b6c // ummla z12.s, z27.b, z30.b\n"
- ".inst 0x45df9b50 // ummla z16.s, z26.b, z31.b\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99ba8 // ummla z8.s, z29.b, z25.b\n"
+ ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89bac // ummla z12.s, z29.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45de9b54 // ummla z20.s, z26.b, z30.b\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45d99b69 // ummla z9.s, z27.b, z25.b\n"
+ ".inst 0x45d99ba9 // ummla z9.s, z29.b, z25.b\n"
".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45d89b6d // ummla z13.s, z27.b, z24.b\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x45d89bad // ummla z13.s, z29.b, z24.b\n"
".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45d99b6a // ummla z10.s, z27.b, z25.b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45d99baa // ummla z10.s, z29.b, z25.b\n"
".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45d89b6e // ummla z14.s, z27.b, z24.b\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45d89bae // ummla z14.s, z29.b, z24.b\n"
".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x45d99b6b // ummla z11.s, z27.b, z25.b\n"
+ ".inst 0x45d99bab // ummla z11.s, z29.b, z25.b\n"
".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
- ".inst 0x45d89b6f // ummla z15.s, z27.b, z24.b\n"
- ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
"ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45d89baf // ummla z15.s, z29.b, z24.b\n"
+ ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45d99ba8 // ummla z8.s, z29.b, z25.b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45d99bc8 // ummla z8.s, z30.b, z25.b\n"
".inst 0x45d99b90 // ummla z16.s, z28.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45d89bac // ummla z12.s, z29.b, z24.b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45d89bcc // ummla z12.s, z30.b, z24.b\n"
".inst 0x45d89b94 // ummla z20.s, z28.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45d99ba9 // ummla z9.s, z29.b, z25.b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45d99bc9 // ummla z9.s, z30.b, z25.b\n"
".inst 0x45d99b91 // ummla z17.s, z28.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45d89bad // ummla z13.s, z29.b, z24.b\n"
+ ".inst 0x45d89bcd // ummla z13.s, z30.b, z24.b\n"
".inst 0x45d89b95 // ummla z21.s, z28.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45d99baa // ummla z10.s, z29.b, z25.b\n"
+ ".inst 0x45d99bca // ummla z10.s, z30.b, z25.b\n"
".inst 0x45d99b92 // ummla z18.s, z28.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45d89bae // ummla z14.s, z29.b, z24.b\n"
+ ".inst 0x45d89bce // ummla z14.s, z30.b, z24.b\n"
".inst 0x45d89b96 // ummla z22.s, z28.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45d99bab // ummla z11.s, z29.b, z25.b\n"
+ ".inst 0x45d99bcb // ummla z11.s, z30.b, z25.b\n"
".inst 0x45d99b93 // ummla z19.s, z28.b, z25.b\n"
- ".inst 0x45d89baf // ummla z15.s, z29.b, z24.b\n"
+ ".inst 0x45d89bcf // ummla z15.s, z30.b, z24.b\n"
".inst 0x45d89b97 // ummla z23.s, z28.b, z24.b\n"
"bgt 41b\n"
"42:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z29.b }, p5/Z, [x10]\n"
- "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z25.b }, p0/Z, [x25]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z24.b }, p0/Z, [x23]\n"
- "trn1 z27.d, z1.d, z25.d\n"
- "trn2 z1.d, z1.d, z25.d\n"
- "trn1 z26.d, z3.d, z24.d\n"
- ".inst 0x45dd9b68 // ummla z8.s, z27.b, z29.b\n"
- ".inst 0x45dc9b6c // ummla z12.s, z27.b, z28.b\n"
- "trn2 z3.d, z3.d, z24.d\n"
- ".inst 0x45dd9b50 // ummla z16.s, z26.b, z29.b\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99b88 // ummla z8.s, z28.b, z25.b\n"
+ ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89b8c // ummla z12.s, z28.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
"ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45dc9b54 // ummla z20.s, z26.b, z28.b\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45d99b69 // ummla z9.s, z27.b, z25.b\n"
+ ".inst 0x45d99b89 // ummla z9.s, z28.b, z25.b\n"
".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45d89b6d // ummla z13.s, z27.b, z24.b\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45d89b8d // ummla z13.s, z28.b, z24.b\n"
".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45d99b6a // ummla z10.s, z27.b, z25.b\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x45d99b8a // ummla z10.s, z28.b, z25.b\n"
".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
"ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45d89b6e // ummla z14.s, z27.b, z24.b\n"
+ ".inst 0x45d89b8e // ummla z14.s, z28.b, z24.b\n"
".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
"ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x45d99b6b // ummla z11.s, z27.b, z25.b\n"
+ ".inst 0x45d99b8b // ummla z11.s, z28.b, z25.b\n"
".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
- ".inst 0x45d89b6f // ummla z15.s, z27.b, z24.b\n"
+ ".inst 0x45d89b8f // ummla z15.s, z28.b, z24.b\n"
".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
"ble 43f\n"
"ld1b { z25.b }, p5/Z, [x10]\n"
"ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45d99828 // ummla z8.s, z1.b, z25.b\n"
".inst 0x45d99870 // ummla z16.s, z3.b, z25.b\n"
- "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45d8982c // ummla z12.s, z1.b, z24.b\n"
".inst 0x45d89874 // ummla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45d99829 // ummla z9.s, z1.b, z25.b\n"
".inst 0x45d99871 // ummla z17.s, z3.b, z25.b\n"
@@ -903,33 +903,33 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 38b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp1 z25.d, z8.d, z12.d\n"
+ "add x20, x21, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z27.d, z10.d, z14.d\n"
+ "uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x22, x9, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
- "add x20, x21, x20, LSL #2\n"
- "uzp1 z26.d, z11.d, z15.d\n"
+ "uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z25.s }, p4, [x9]\n"
"uzp1 z25.d, z16.d, z20.d\n"
+ "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
"uzp1 z24.d, z17.d, z21.d\n"
+ "st1w { z8.s }, p4, [x22]\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "st1w { z27.s }, p2, [x9, #2, MUL VL]\n"
"uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "st1w { z26.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"uzp1 z20.d, z19.d, z23.d\n"
- "uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z8.s }, p4, [x22]\n"
- "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x22, #2, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
"st1w { z11.s }, p1, [x22, #3, MUL VL]\n"
"st1w { z25.s }, p4, [x21]\n"
"st1w { z24.s }, p3, [x21, #1, MUL VL]\n"
@@ -947,7 +947,7 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"45:" // Height 5
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"46:" // Height 5: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -959,46 +959,46 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
- "ld1w { z20.s }, p4/Z, [x21]\n"
- "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x20]\n"
- "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z19.d, z24.d, z23.d\n"
"zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z24.d, z25.d, z28.d\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
@@ -1037,8 +1037,8 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"mov x28, #0x0\n"
"49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1066,103 +1066,102 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ble 53f\n"
"52:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p5/Z, [x10]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
"ld1rqb { z6.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
"ld1rqb { z7.b }, p0/Z, [x24]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
"trn1 z3.d, z7.d, z2.d\n"
"trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
"ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c19888 // ummla z8.s, z4.b, z1.b\n"
+ ".inst 0x45c198a8 // ummla z8.s, z5.b, z1.b\n"
".inst 0x45c19870 // ummla z16.s, z3.b, z1.b\n"
".inst 0x45c19858 // ummla z24.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45c0988c // ummla z12.s, z4.b, z0.b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45c098ac // ummla z12.s, z5.b, z0.b\n"
".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x45c0985c // ummla z28.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c19889 // ummla z9.s, z4.b, z1.b\n"
+ ".inst 0x45c198a9 // ummla z9.s, z5.b, z1.b\n"
+ "add x25, x25, #0x10\n"
".inst 0x45c19871 // ummla z17.s, z3.b, z1.b\n"
".inst 0x45c19859 // ummla z25.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45c0988d // ummla z13.s, z4.b, z0.b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45c098ad // ummla z13.s, z5.b, z0.b\n"
".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x45c0985d // ummla z29.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c1988a // ummla z10.s, z4.b, z1.b\n"
+ ".inst 0x45c198aa // ummla z10.s, z5.b, z1.b\n"
".inst 0x45c19872 // ummla z18.s, z3.b, z1.b\n"
".inst 0x45c1985a // ummla z26.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c0988e // ummla z14.s, z4.b, z0.b\n"
+ ".inst 0x45c098ae // ummla z14.s, z5.b, z0.b\n"
".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n"
".inst 0x45c0985e // ummla z30.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x45c1988b // ummla z11.s, z4.b, z1.b\n"
+ ".inst 0x45c198ab // ummla z11.s, z5.b, z1.b\n"
".inst 0x45c19873 // ummla z19.s, z3.b, z1.b\n"
".inst 0x45c1985b // ummla z27.s, z2.b, z1.b\n"
- ".inst 0x45c0988f // ummla z15.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x45c098af // ummla z15.s, z5.b, z0.b\n"
".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n"
".inst 0x45c0985f // ummla z31.s, z2.b, z0.b\n"
- "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
"ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x45c198c8 // ummla z8.s, z6.b, z1.b\n"
".inst 0x45c198f0 // ummla z16.s, z7.b, z1.b\n"
- ".inst 0x45c198b8 // ummla z24.s, z5.b, z1.b\n"
+ ".inst 0x45c19898 // ummla z24.s, z4.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x45c098cc // ummla z12.s, z6.b, z0.b\n"
".inst 0x45c098f4 // ummla z20.s, z7.b, z0.b\n"
- ".inst 0x45c098bc // ummla z28.s, z5.b, z0.b\n"
+ ".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x45c198c9 // ummla z9.s, z6.b, z1.b\n"
".inst 0x45c198f1 // ummla z17.s, z7.b, z1.b\n"
- ".inst 0x45c198b9 // ummla z25.s, z5.b, z1.b\n"
+ ".inst 0x45c19899 // ummla z25.s, z4.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x45c098cd // ummla z13.s, z6.b, z0.b\n"
".inst 0x45c098f5 // ummla z21.s, z7.b, z0.b\n"
- ".inst 0x45c098bd // ummla z29.s, z5.b, z0.b\n"
+ ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x45c198ca // ummla z10.s, z6.b, z1.b\n"
".inst 0x45c198f2 // ummla z18.s, z7.b, z1.b\n"
- ".inst 0x45c198ba // ummla z26.s, z5.b, z1.b\n"
+ ".inst 0x45c1989a // ummla z26.s, z4.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x45c098ce // ummla z14.s, z6.b, z0.b\n"
".inst 0x45c098f6 // ummla z22.s, z7.b, z0.b\n"
- ".inst 0x45c098be // ummla z30.s, z5.b, z0.b\n"
+ ".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
".inst 0x45c198cb // ummla z11.s, z6.b, z1.b\n"
".inst 0x45c198f3 // ummla z19.s, z7.b, z1.b\n"
- ".inst 0x45c198bb // ummla z27.s, z5.b, z1.b\n"
+ ".inst 0x45c1989b // ummla z27.s, z4.b, z1.b\n"
".inst 0x45c098cf // ummla z15.s, z6.b, z0.b\n"
".inst 0x45c098f7 // ummla z23.s, z7.b, z0.b\n"
- ".inst 0x45c098bf // ummla z31.s, z5.b, z0.b\n"
+ ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n"
"bgt 52b\n"
"53:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z2.b }, p5/Z, [x10]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z6.b }, p0/Z, [x25]\n"
+ "ld1rqb { z4.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
"trn1 z4.d, z5.d, z0.d\n"
"trn2 z5.d, z5.d, z0.d\n"
"ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
@@ -1170,6 +1169,7 @@ void sve_hybrid_u8u32_mmla_6x4VL (
".inst 0x45c298d0 // ummla z16.s, z6.b, z2.b\n"
".inst 0x45c29898 // ummla z24.s, z4.b, z2.b\n"
"ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n"
".inst 0x45c098d4 // ummla z20.s, z6.b, z0.b\n"
".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n"
@@ -1190,8 +1190,8 @@ void sve_hybrid_u8u32_mmla_6x4VL (
".inst 0x45c098d6 // ummla z22.s, z6.b, z0.b\n"
".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x45c298eb // ummla z11.s, z7.b, z2.b\n"
+ "addvl x10, x10, #8\n"
".inst 0x45c298d3 // ummla z19.s, z6.b, z2.b\n"
".inst 0x45c2989b // ummla z27.s, z4.b, z2.b\n"
".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n"
@@ -1203,24 +1203,24 @@ void sve_hybrid_u8u32_mmla_6x4VL (
".inst 0x45c29828 // ummla z8.s, z1.b, z2.b\n"
".inst 0x45c29870 // ummla z16.s, z3.b, z2.b\n"
".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45c0982c // ummla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n"
".inst 0x45c098bc // ummla z28.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45c29829 // ummla z9.s, z1.b, z2.b\n"
".inst 0x45c29871 // ummla z17.s, z3.b, z2.b\n"
".inst 0x45c298b9 // ummla z25.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45c0982d // ummla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n"
".inst 0x45c098bd // ummla z29.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x45c2982a // ummla z10.s, z1.b, z2.b\n"
".inst 0x45c29872 // ummla z18.s, z3.b, z2.b\n"
".inst 0x45c298ba // ummla z26.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45c0982e // ummla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n"
".inst 0x45c098be // ummla z30.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
@@ -1237,39 +1237,39 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z1.d, z8.d, z12.d\n"
- "uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z0.d, z9.d, z13.d\n"
- "uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z3.d, z10.d, z14.d\n"
- "uzp2 z10.d, z10.d, z14.d\n"
"add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "uzp1 z2.d, z8.d, z12.d\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z1.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z0.d, z10.d, z14.d\n"
+ "st1w { z2.s }, p4, [x9]\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
"uzp1 z2.d, z11.d, z15.d\n"
+ "st1w { z1.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z1.s }, p4, [x9]\n"
- "add x20, x21, x20, LSL #2\n"
"uzp1 z1.d, z16.d, z20.d\n"
+ "st1w { z0.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "st1w { z0.s }, p3, [x9, #1, MUL VL]\n"
"uzp1 z0.d, z17.d, z21.d\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "st1w { z3.s }, p2, [x9, #2, MUL VL]\n"
"uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"uzp1 z20.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z8.s }, p4, [x23]\n"
"uzp1 z24.d, z24.d, z28.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
"uzp1 z25.d, z25.d, z29.d\n"
- "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
"uzp1 z26.d, z26.d, z30.d\n"
- "uzp1 z27.d, z27.d, z31.d\n"
- "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
"st1w { z1.s }, p4, [x22]\n"
"st1w { z0.s }, p3, [x22, #1, MUL VL]\n"
"st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
@@ -1289,12 +1289,11 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"b 68f\n"
"56:" // Height 6
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
"mov x20, #0x18\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "madd x20, x21, x20, x9\n"
- "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"57:" // Height 6: Column loop
"mov x20, #0x0\n"
"whilelt p4.s, x20, x11\n"
@@ -1306,54 +1305,54 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 58f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z20.s }, p4/Z, [x9]\n"
- "ld1w { z22.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
"add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
"ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
- "zip1 z8.d, z20.d, z12.d\n"
- "zip2 z12.d, z20.d, z12.d\n"
"ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "zip1 z9.d, z22.d, z13.d\n"
- "zip2 z13.d, z22.d, z13.d\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "zip1 z10.d, z25.d, z14.d\n"
- "zip2 z14.d, z25.d, z14.d\n"
- "ld1w { z25.s }, p4/Z, [x21]\n"
- "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
- "zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z28.s }, p4/Z, [x20]\n"
- "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
"zip2 z23.d, z24.d, z23.d\n"
"zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
@@ -1389,8 +1388,8 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"mov x28, #0x0\n"
"60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
"add x20, x20, x21, LSL #3\n"
@@ -1421,113 +1420,113 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ble 64f\n"
"63:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1b { z1.b }, p5/Z, [x10]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "ld1rqb { z6.b }, p0/Z, [x26]\n"
- "add x26, x26, #0x10\n"
- "ld1rqb { z3.b }, p0/Z, [x25]\n"
- "add x25, x25, #0x10\n"
- "ld1rqb { z7.b }, p0/Z, [x24]\n"
- "add x24, x24, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
"ld1rqb { z0.b }, p0/Z, [x21]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "trn1 z4.d, z6.d, z3.d\n"
- "trn2 z6.d, z6.d, z3.d\n"
- "add x21, x21, #0x10\n"
- "trn1 z3.d, z7.d, z2.d\n"
- "trn2 z7.d, z7.d, z2.d\n"
- "trn1 z2.d, z5.d, z0.d\n"
- "trn2 z5.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
"ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c19888 // ummla z8.s, z4.b, z1.b\n"
- ".inst 0x45c19870 // ummla z16.s, z3.b, z1.b\n"
+ ".inst 0x45c198c8 // ummla z8.s, z6.b, z1.b\n"
+ ".inst 0x45c19890 // ummla z16.s, z4.b, z1.b\n"
".inst 0x45c19858 // ummla z24.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45c0988c // ummla z12.s, z4.b, z0.b\n"
- ".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45c098cc // ummla z12.s, z6.b, z0.b\n"
+ ".inst 0x45c09894 // ummla z20.s, z4.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x45c0985c // ummla z28.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c19889 // ummla z9.s, z4.b, z1.b\n"
- ".inst 0x45c19871 // ummla z17.s, z3.b, z1.b\n"
+ ".inst 0x45c198c9 // ummla z9.s, z6.b, z1.b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45c19891 // ummla z17.s, z4.b, z1.b\n"
".inst 0x45c19859 // ummla z25.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45c0988d // ummla z13.s, z4.b, z0.b\n"
- ".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45c098cd // ummla z13.s, z6.b, z0.b\n"
+ ".inst 0x45c09895 // ummla z21.s, z4.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x45c0985d // ummla z29.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c1988a // ummla z10.s, z4.b, z1.b\n"
- ".inst 0x45c19872 // ummla z18.s, z3.b, z1.b\n"
+ ".inst 0x45c198ca // ummla z10.s, z6.b, z1.b\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45c19892 // ummla z18.s, z4.b, z1.b\n"
".inst 0x45c1985a // ummla z26.s, z2.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c0988e // ummla z14.s, z4.b, z0.b\n"
- ".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n"
+ ".inst 0x45c098ce // ummla z14.s, z6.b, z0.b\n"
+ ".inst 0x45c09896 // ummla z22.s, z4.b, z0.b\n"
".inst 0x45c0985e // ummla z30.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x45c1988b // ummla z11.s, z4.b, z1.b\n"
- ".inst 0x45c19873 // ummla z19.s, z3.b, z1.b\n"
+ ".inst 0x45c198cb // ummla z11.s, z6.b, z1.b\n"
+ ".inst 0x45c19893 // ummla z19.s, z4.b, z1.b\n"
".inst 0x45c1985b // ummla z27.s, z2.b, z1.b\n"
- ".inst 0x45c0988f // ummla z15.s, z4.b, z0.b\n"
- ".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n"
- ".inst 0x45c0985f // ummla z31.s, z2.b, z0.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x45c098cf // ummla z15.s, z6.b, z0.b\n"
+ ".inst 0x45c09897 // ummla z23.s, z4.b, z0.b\n"
+ ".inst 0x45c0985f // ummla z31.s, z2.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45c198c8 // ummla z8.s, z6.b, z1.b\n"
- ".inst 0x45c198f0 // ummla z16.s, z7.b, z1.b\n"
- ".inst 0x45c198b8 // ummla z24.s, z5.b, z1.b\n"
+ ".inst 0x45c198e8 // ummla z8.s, z7.b, z1.b\n"
+ ".inst 0x45c198b0 // ummla z16.s, z5.b, z1.b\n"
+ ".inst 0x45c19878 // ummla z24.s, z3.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45c098cc // ummla z12.s, z6.b, z0.b\n"
- ".inst 0x45c098f4 // ummla z20.s, z7.b, z0.b\n"
- ".inst 0x45c098bc // ummla z28.s, z5.b, z0.b\n"
+ ".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n"
+ ".inst 0x45c098b4 // ummla z20.s, z5.b, z0.b\n"
+ ".inst 0x45c0987c // ummla z28.s, z3.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45c198c9 // ummla z9.s, z6.b, z1.b\n"
- ".inst 0x45c198f1 // ummla z17.s, z7.b, z1.b\n"
- ".inst 0x45c198b9 // ummla z25.s, z5.b, z1.b\n"
+ ".inst 0x45c198e9 // ummla z9.s, z7.b, z1.b\n"
+ ".inst 0x45c198b1 // ummla z17.s, z5.b, z1.b\n"
+ ".inst 0x45c19879 // ummla z25.s, z3.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45c098cd // ummla z13.s, z6.b, z0.b\n"
- ".inst 0x45c098f5 // ummla z21.s, z7.b, z0.b\n"
- ".inst 0x45c098bd // ummla z29.s, z5.b, z0.b\n"
+ ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n"
+ ".inst 0x45c098b5 // ummla z21.s, z5.b, z0.b\n"
+ ".inst 0x45c0987d // ummla z29.s, z3.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45c198ca // ummla z10.s, z6.b, z1.b\n"
- ".inst 0x45c198f2 // ummla z18.s, z7.b, z1.b\n"
- ".inst 0x45c198ba // ummla z26.s, z5.b, z1.b\n"
+ ".inst 0x45c198ea // ummla z10.s, z7.b, z1.b\n"
+ ".inst 0x45c198b2 // ummla z18.s, z5.b, z1.b\n"
+ ".inst 0x45c1987a // ummla z26.s, z3.b, z1.b\n"
"ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45c098ce // ummla z14.s, z6.b, z0.b\n"
- ".inst 0x45c098f6 // ummla z22.s, z7.b, z0.b\n"
- ".inst 0x45c098be // ummla z30.s, z5.b, z0.b\n"
+ ".inst 0x45c098ee // ummla z14.s, z7.b, z0.b\n"
+ ".inst 0x45c098b6 // ummla z22.s, z5.b, z0.b\n"
+ ".inst 0x45c0987e // ummla z30.s, z3.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45c198cb // ummla z11.s, z6.b, z1.b\n"
- ".inst 0x45c198f3 // ummla z19.s, z7.b, z1.b\n"
- ".inst 0x45c198bb // ummla z27.s, z5.b, z1.b\n"
- ".inst 0x45c098cf // ummla z15.s, z6.b, z0.b\n"
- ".inst 0x45c098f7 // ummla z23.s, z7.b, z0.b\n"
- ".inst 0x45c098bf // ummla z31.s, z5.b, z0.b\n"
+ ".inst 0x45c198eb // ummla z11.s, z7.b, z1.b\n"
+ ".inst 0x45c198b3 // ummla z19.s, z5.b, z1.b\n"
+ ".inst 0x45c1987b // ummla z27.s, z3.b, z1.b\n"
+ ".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n"
+ ".inst 0x45c098b7 // ummla z23.s, z5.b, z0.b\n"
+ ".inst 0x45c0987f // ummla z31.s, z3.b, z0.b\n"
"bgt 63b\n"
"64:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
- "ld1b { z2.b }, p5/Z, [x10]\n"
- "subs x27, x27, #0x8\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z6.b }, p0/Z, [x25]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
"ld1rqb { z0.b }, p0/Z, [x21]\n"
- "trn1 z7.d, z1.d, z6.d\n"
- "trn2 z1.d, z1.d, z6.d\n"
- "trn1 z6.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
"trn1 z4.d, z5.d, z0.d\n"
"trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
"ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x45c298e8 // ummla z8.s, z7.b, z2.b\n"
".inst 0x45c298d0 // ummla z16.s, z6.b, z2.b\n"
".inst 0x45c29898 // ummla z24.s, z4.b, z2.b\n"
"ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n"
".inst 0x45c098d4 // ummla z20.s, z6.b, z0.b\n"
".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n"
@@ -1548,8 +1547,8 @@ void sve_hybrid_u8u32_mmla_6x4VL (
".inst 0x45c098d6 // ummla z22.s, z6.b, z0.b\n"
".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #8\n"
".inst 0x45c298eb // ummla z11.s, z7.b, z2.b\n"
+ "addvl x10, x10, #8\n"
".inst 0x45c298d3 // ummla z19.s, z6.b, z2.b\n"
".inst 0x45c2989b // ummla z27.s, z4.b, z2.b\n"
".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n"
@@ -1561,24 +1560,24 @@ void sve_hybrid_u8u32_mmla_6x4VL (
".inst 0x45c29828 // ummla z8.s, z1.b, z2.b\n"
".inst 0x45c29870 // ummla z16.s, z3.b, z2.b\n"
".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45c0982c // ummla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n"
".inst 0x45c098bc // ummla z28.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x45c29829 // ummla z9.s, z1.b, z2.b\n"
".inst 0x45c29871 // ummla z17.s, z3.b, z2.b\n"
".inst 0x45c298b9 // ummla z25.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45c0982d // ummla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n"
".inst 0x45c098bd // ummla z29.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x45c2982a // ummla z10.s, z1.b, z2.b\n"
".inst 0x45c29872 // ummla z18.s, z3.b, z2.b\n"
".inst 0x45c298ba // ummla z26.s, z5.b, z2.b\n"
- "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45c0982e // ummla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n"
".inst 0x45c098be // ummla z30.s, z5.b, z0.b\n"
"ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
@@ -1595,46 +1594,46 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 60b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"uzp1 z0.d, z8.d, z12.d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
+ "add x20, x21, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
+ "st1w { z0.s }, p4, [x9]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
"uzp1 z14.d, z11.d, z15.d\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z0.s }, p4, [x9]\n"
- "add x21, x22, x20, LSL #2\n"
"uzp1 z15.d, z16.d, z20.d\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
- "add x20, x21, x20, LSL #2\n"
"uzp1 z20.d, z17.d, z21.d\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
"uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z8.s }, p4, [x24]\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"uzp1 z22.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z8.s }, p4, [x24]\n"
"uzp1 z23.d, z24.d, z28.d\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
"uzp2 z24.d, z24.d, z28.d\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
"uzp1 z28.d, z25.d, z29.d\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
"uzp2 z25.d, z25.d, z29.d\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
"uzp1 z29.d, z26.d, z30.d\n"
+ "st1w { z15.s }, p4, [x23]\n"
"uzp2 z26.d, z26.d, z30.d\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
"uzp1 z30.d, z27.d, z31.d\n"
- "uzp2 z27.d, z27.d, z31.d\n"
- "st1w { z15.s }, p4, [x23]\n"
"st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
"st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
"st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
"st1w { z16.s }, p4, [x22]\n"
@@ -1665,8 +1664,8 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"68:" // Exit
- : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
- : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
index f9041edcca..ce3b070052 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
@@ -40,8 +40,7 @@ void sve_interleaved_bf16fp32_dot_8x3VL( ARGLIST );
class cls_sve_interleaved_bf16fp32_dot_8x3VL
{
public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
+ typedef bfloat16 operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -63,8 +62,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 2, 1> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 2, 1, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 2, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 2, 1, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
index 0646fa02eb..a6eefc1006 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
@@ -55,25 +55,25 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
"mov z10.b, #0x0\n"
- "ld1h { z4.h }, p0/Z, [x22]\n"
"mov z11.b, #0x0\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
"mov z12.b, #0x0\n"
- "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
"mov z13.b, #0x0\n"
+ "ld1h { z4.h }, p0/Z, [x22]\n"
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
+ "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
"mov z16.b, #0x0\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
"mov z17.b, #0x0\n"
+ "ld1h { z6.h }, p0/Z, [x22, #2, MUL VL]\n"
"mov z18.b, #0x0\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
"mov z19.b, #0x0\n"
"mov z20.b, #0x0\n"
- "ld1h { z6.h }, p0/Z, [x22, #2, MUL VL]\n"
"mov z21.b, #0x0\n"
"mov z22.b, #0x0\n"
"mov z23.b, #0x0\n"
@@ -151,12 +151,12 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
"ld1h { z6.h }, p0/Z, [x22, #2, MUL VL]\n"
"bge 3b\n"
"4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "addvl x22, x22, #3\n"
".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n"
".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n"
+ "addvl x22, x22, #3\n"
".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n"
".inst 0x6471409a // bfdot z26.s, z4.h, z1.h[2]\n"
@@ -183,13 +183,13 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
"add %x[Apanel], %x[Apanel], #0x20\n"
"ld1h { z2.h }, p0/Z, [x22]\n"
"ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
".inst 0x64644048 // bfdot z8.s, z2.h, z4.h[0]\n"
+ "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n"
".inst 0x646c404b // bfdot z11.s, z2.h, z4.h[1]\n"
".inst 0x6474404e // bfdot z14.s, z2.h, z4.h[2]\n"
".inst 0x647c4051 // bfdot z17.s, z2.h, z4.h[3]\n"
".inst 0x64634054 // bfdot z20.s, z2.h, z3.h[0]\n"
+ "addvl x22, x22, #3\n"
".inst 0x646b4057 // bfdot z23.s, z2.h, z3.h[1]\n"
".inst 0x6473405a // bfdot z26.s, z2.h, z3.h[2]\n"
".inst 0x647b405d // bfdot z29.s, z2.h, z3.h[3]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index 5c0d59609b..42a7dbb37a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -40,8 +40,7 @@ void sve_interleaved_bf16fp32_mmla_8x3VL( ARGLIST );
class cls_sve_interleaved_bf16fp32_mmla_8x3VL
{
public:
- typedef bfloat16 lhs_operand_type;
- typedef bfloat16 rhs_operand_type;
+ typedef bfloat16 operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -63,8 +62,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 4, 2> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 4, 2, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
index 9cde63f9d7..3ffec98c16 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -55,31 +55,31 @@ void sve_interleaved_bf16fp32_mmla_8x3VL(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
- "mov z10.b, #0x0\n"
"ld1h { z4.h }, p0/Z, [x22]\n"
+ "mov z10.b, #0x0\n"
"mov z11.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
"mov z12.b, #0x0\n"
- "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
"mov z13.b, #0x0\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
+ "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
"mov z16.b, #0x0\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
"mov z17.b, #0x0\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
"mov z18.b, #0x0\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
"mov z19.b, #0x0\n"
+ "addvl x22, x22, #2\n"
"mov z20.b, #0x0\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
"mov z21.b, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
"mov z22.b, #0x0\n"
- "addvl x22, x22, #2\n"
"mov z23.b, #0x0\n"
"mov z24.b, #0x0\n"
- "add %x[Apanel], %x[Apanel], #0x30\n"
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
@@ -94,77 +94,77 @@ void sve_interleaved_bf16fp32_mmla_8x3VL(
".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
- "ld1h { z3.h }, p0/Z, [x22]\n"
+ "ld1h { z7.h }, p0/Z, [x22]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
".inst 0x6464e4da // bfmmla z26.s, z6.h, z4.h\n"
".inst 0x6465e4dd // bfmmla z29.s, z6.h, z5.h\n"
- "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x6463e409 // bfmmla z9.s, z0.h, z3.h\n"
+ "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
+ ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
"sub x20, x20, #0x2\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- ".inst 0x6463e42f // bfmmla z15.s, z1.h, z3.h\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
"cmp x20, #0x2\n"
- ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n"
- ".inst 0x6463e455 // bfmmla z21.s, z2.h, z3.h\n"
- ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
- ".inst 0x6463e4db // bfmmla z27.s, z6.h, z3.h\n"
+ ".inst 0x6467e4db // bfmmla z27.s, z6.h, z7.h\n"
+ ".inst 0x6463e4de // bfmmla z30.s, z6.h, z3.h\n"
"ld1h { z3.h }, p0/Z, [x22, #4, MUL VL]\n"
- ".inst 0x6467e4de // bfmmla z30.s, z6.h, z7.h\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
+ ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
+ ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
+ ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
+ ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- ".inst 0x6464e4dc // bfmmla z28.s, z6.h, z4.h\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #48]\n"
- ".inst 0x6465e4df // bfmmla z31.s, z6.h, z5.h\n"
+ ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
+ ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
+ "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x6465e4dc // bfmmla z28.s, z6.h, z5.h\n"
+ ".inst 0x6464e4df // bfmmla z31.s, z6.h, z4.h\n"
+ "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #48]\n"
"ld1rqh { z6.h }, p0/Z, [%x[Apanel], #64]\n"
- "ld1h { z4.h }, p0/Z, [x22, #6, MUL VL]\n"
- "ld1h { z5.h }, p0/Z, [x22, #7, MUL VL]\n"
- "addvl x22, x22, #16\n"
+ "ld1h { z2.h }, p0/Z, [x22, #6, MUL VL]\n"
".inst 0x6463e408 // bfmmla z8.s, z0.h, z3.h\n"
+ "ld1h { z4.h }, p0/Z, [x22, #7, MUL VL]\n"
+ "addvl x22, x22, #16\n"
".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
".inst 0x6463e42e // bfmmla z14.s, z1.h, z3.h\n"
".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6463e454 // bfmmla z20.s, z2.h, z3.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b7 // bfmmla z23.s, z5.h, z7.h\n"
".inst 0x6463e4da // bfmmla z26.s, z6.h, z3.h\n"
- "ld1h { z3.h }, p0/Z, [x22, #-8, MUL VL]\n"
".inst 0x6467e4dd // bfmmla z29.s, z6.h, z7.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #-8, MUL VL]\n"
"ld1h { z7.h }, p0/Z, [x22, #-7, MUL VL]\n"
- ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n"
- ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n"
- ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n"
- ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n"
- ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n"
- ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n"
- ".inst 0x6464e4db // bfmmla z27.s, z6.h, z4.h\n"
+ ".inst 0x6462e409 // bfmmla z9.s, z0.h, z2.h\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6462e42f // bfmmla z15.s, z1.h, z2.h\n"
+ ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
+ ".inst 0x6462e4b5 // bfmmla z21.s, z5.h, z2.h\n"
+ ".inst 0x6464e4b8 // bfmmla z24.s, z5.h, z4.h\n"
+ ".inst 0x6462e4db // bfmmla z27.s, z6.h, z2.h\n"
+ ".inst 0x6464e4de // bfmmla z30.s, z6.h, z4.h\n"
"ld1h { z4.h }, p0/Z, [x22, #-6, MUL VL]\n"
- ".inst 0x6465e4de // bfmmla z30.s, z6.h, z5.h\n"
".inst 0x6463e40a // bfmmla z10.s, z0.h, z3.h\n"
- "ld1h { z5.h }, p0/Z, [x22, #-5, MUL VL]\n"
".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- ".inst 0x6463e430 // bfmmla z16.s, z1.h, z3.h\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x6463e430 // bfmmla z16.s, z1.h, z3.h\n"
".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
- ".inst 0x6463e456 // bfmmla z22.s, z2.h, z3.h\n"
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
+ "ld1h { z5.h }, p0/Z, [x22, #-5, MUL VL]\n"
".inst 0x6463e4dc // bfmmla z28.s, z6.h, z3.h\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
".inst 0x6467e4df // bfmmla z31.s, z6.h, z7.h\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
"addvl x22, x22, #-4\n"
"bge 3b\n"
"4:" // main loop skip
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
@@ -172,54 +172,54 @@ void sve_interleaved_bf16fp32_mmla_8x3VL(
"ld1h { z6.h }, p0/Z, [x22]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6464e4fa // bfmmla z26.s, z7.h, z4.h\n"
+ ".inst 0x6465e4fd // bfmmla z29.s, z7.h, z5.h\n"
"ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
"ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- "addvl x22, x22, #4\n"
- ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
- ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n"
- ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0x6466e4fb // bfmmla z27.s, z7.h, z6.h\n"
+ ".inst 0x6463e4fe // bfmmla z30.s, z7.h, z3.h\n"
".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
- ".inst 0x6465e47c // bfmmla z28.s, z3.h, z5.h\n"
- ".inst 0x6464e47f // bfmmla z31.s, z3.h, z4.h\n"
+ ".inst 0x6465e4fc // bfmmla z28.s, z7.h, z5.h\n"
+ ".inst 0x6464e4ff // bfmmla z31.s, z7.h, z4.h\n"
"cbz x20, 5f\n"
"ld1h { z1.h }, p0/Z, [x22]\n"
"ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
"ld1rqh { z6.h }, p0/Z, [%x[Apanel], #16]\n"
"ld1h { z0.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6460e4eb // bfmmla z11.s, z7.h, z0.h\n"
"ld1rqh { z5.h }, p0/Z, [%x[Apanel], #32]\n"
"ld1rqh { z4.h }, p0/Z, [%x[Apanel], #48]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
- "ld1h { z3.h }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1h { z2.h }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x6460e4eb // bfmmla z11.s, z7.h, z0.h\n"
".inst 0x6461e4ce // bfmmla z14.s, z6.h, z1.h\n"
".inst 0x6460e4d1 // bfmmla z17.s, z6.h, z0.h\n"
".inst 0x6461e4b4 // bfmmla z20.s, z5.h, z1.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #2, MUL VL]\n"
".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
- "ld1h { z1.h }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1h { z2.h }, p0/Z, [x22, #3, MUL VL]\n"
".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z1.h }, p0/Z, [x22, #4, MUL VL]\n"
"ld1h { z0.h }, p0/Z, [x22, #5, MUL VL]\n"
".inst 0x6463e4e9 // bfmmla z9.s, z7.h, z3.h\n"
".inst 0x6462e4ec // bfmmla z12.s, z7.h, z2.h\n"
- ".inst 0x6463e4cf // bfmmla z15.s, z6.h, z3.h\n"
"addvl x22, x22, #6\n"
+ ".inst 0x6463e4cf // bfmmla z15.s, z6.h, z3.h\n"
".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
".inst 0x6463e4b5 // bfmmla z21.s, z5.h, z3.h\n"
".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
".inst 0x6463e49b // bfmmla z27.s, z4.h, z3.h\n"
@@ -233,53 +233,53 @@ void sve_interleaved_bf16fp32_mmla_8x3VL(
".inst 0x6461e49c // bfmmla z28.s, z4.h, z1.h\n"
".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"5:" // multiply loop done
- "uzp1 z2.d, z8.d, z11.d\n"
+ "uzp1 z0.d, z8.d, z11.d\n"
"uzp2 z8.d, z8.d, z11.d\n"
- "subs x23, x23, #0x1\n"
- "uzp1 z1.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z9.d, z12.d\n"
"uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
"uzp1 z0.d, z10.d, z13.d\n"
"uzp2 z10.d, z10.d, z13.d\n"
- "st1w { z2.s }, p0, [%x[Cpanel]]\n"
- "uzp1 z3.d, z14.d, z17.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "uzp1 z0.d, z14.d, z17.d\n"
"uzp2 z14.d, z14.d, z17.d\n"
- "st1w { z1.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "uzp1 z17.d, z15.d, z18.d\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"uzp2 z15.d, z15.d, z18.d\n"
- "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "uzp1 z2.d, z16.d, z19.d\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"uzp2 z16.d, z16.d, z19.d\n"
- "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "uzp1 z1.d, z20.d, z23.d\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
"uzp2 z20.d, z20.d, z23.d\n"
- "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "uzp1 z0.d, z21.d, z24.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
"uzp2 z21.d, z21.d, z24.d\n"
- "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
- "uzp1 z23.d, z22.d, z25.d\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
"uzp2 z22.d, z22.d, z25.d\n"
- "st1w { z3.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
- "uzp1 z19.d, z26.d, z29.d\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
"uzp2 z26.d, z26.d, z29.d\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
- "addvl %x[Cpanel], %x[Cpanel], #16\n"
- "uzp1 z18.d, z27.d, z30.d\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
"uzp2 z27.d, z27.d, z30.d\n"
- "uzp1 z17.d, z28.d, z31.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
"uzp2 z28.d, z28.d, z31.d\n"
- "st1w { z2.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
- "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
- "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
- "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
- "st1w { z1.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
- "st1w { z0.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
- "st1w { z23.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
"st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
"st1w { z21.s }, p0, [%x[Cpanel]]\n"
"st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z19.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z18.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
index 292ac1760e..e04e3d2a1a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
@@ -40,8 +40,7 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx( ARGLIST );
class cls_sve_interleaved_fp16_mla_8x3VL
{
public:
- typedef __fp16 lhs_operand_type;
- typedef __fp16 rhs_operand_type;
+ typedef __fp16 operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)( ARGLIST );
@@ -63,8 +62,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 1, 1> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
index 360c61f0b4..6e19811d72 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
@@ -54,31 +54,31 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
- "mov z10.b, #0x0\n"
"ld1h { z0.h }, p0/Z, [x22]\n"
+ "mov z10.b, #0x0\n"
"mov z11.b, #0x0\n"
- "mov z12.b, #0x0\n"
"ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
+ "mov z12.b, #0x0\n"
"mov z13.b, #0x0\n"
+ "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
"mov z16.b, #0x0\n"
- "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
"mov z17.b, #0x0\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
"mov z18.b, #0x0\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
"mov z19.b, #0x0\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
"mov z20.b, #0x0\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
"mov z21.b, #0x0\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
"mov z22.b, #0x0\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
"mov z23.b, #0x0\n"
"mov z24.b, #0x0\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
@@ -92,7 +92,7 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx(
"fmla z9.h, p0/M, z1.h, z3.h\n"
"sub x20, x20, #0x2\n"
"fmla z10.h, p0/M, z2.h, z3.h\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n"
"fmla z11.h, p0/M, z0.h, z4.h\n"
"fmla z12.h, p0/M, z1.h, z4.h\n"
"fmla z13.h, p0/M, z2.h, z4.h\n"
@@ -101,63 +101,63 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx(
"fmla z15.h, p0/M, z1.h, z5.h\n"
"cmp x20, #0x2\n"
"fmla z16.h, p0/M, z2.h, z5.h\n"
- "ld1rh { z7.h }, p0/Z, [%x[Apanel], #12]\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #12]\n"
"fmla z17.h, p0/M, z0.h, z6.h\n"
"fmla z18.h, p0/M, z1.h, z6.h\n"
"fmla z19.h, p0/M, z2.h, z6.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
- "fmla z20.h, p0/M, z0.h, z3.h\n"
- "fmla z21.h, p0/M, z1.h, z3.h\n"
- "fmla z22.h, p0/M, z2.h, z3.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z0.h, z7.h\n"
+ "fmla z21.h, p0/M, z1.h, z7.h\n"
+ "fmla z22.h, p0/M, z2.h, z7.h\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #16]\n"
"fmla z23.h, p0/M, z0.h, z4.h\n"
"fmla z24.h, p0/M, z1.h, z4.h\n"
"fmla z25.h, p0/M, z2.h, z4.h\n"
"ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n"
- "fmla z26.h, p0/M, z0.h, z7.h\n"
- "fmla z27.h, p0/M, z1.h, z7.h\n"
- "fmla z28.h, p0/M, z2.h, z7.h\n"
+ "fmla z26.h, p0/M, z0.h, z3.h\n"
+ "fmla z27.h, p0/M, z1.h, z3.h\n"
+ "fmla z28.h, p0/M, z2.h, z3.h\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel], #20]\n"
- "fmla z29.h, p0/M, z0.h, z6.h\n"
- "ld1h { z7.h }, p0/Z, [x22, #3, MUL VL]\n"
- "fmla z30.h, p0/M, z1.h, z6.h\n"
- "fmla z31.h, p0/M, z2.h, z6.h\n"
- "ld1h { z6.h }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1h { z2.h }, p0/Z, [x22, #5, MUL VL]\n"
- "addvl x22, x22, #6\n"
+ "fmla z29.h, p0/M, z0.h, z5.h\n"
+ "ld1h { z6.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "fmla z30.h, p0/M, z1.h, z5.h\n"
+ "fmla z31.h, p0/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #5, MUL VL]\n"
+ "fmla z8.h, p0/M, z6.h, z7.h\n"
"ld1rh { z1.h }, p0/Z, [%x[Apanel], #22]\n"
- "fmla z8.h, p0/M, z7.h, z5.h\n"
- "fmla z11.h, p0/M, z7.h, z4.h\n"
- "fmla z9.h, p0/M, z6.h, z5.h\n"
- "fmla z12.h, p0/M, z6.h, z4.h\n"
- "fmla z10.h, p0/M, z2.h, z5.h\n"
- "fmla z13.h, p0/M, z2.h, z4.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #24]\n"
- "fmla z14.h, p0/M, z7.h, z3.h\n"
- "fmla z15.h, p0/M, z6.h, z3.h\n"
+ "fmla z9.h, p0/M, z2.h, z7.h\n"
+ "fmla z10.h, p0/M, z5.h, z7.h\n"
+ "fmla z11.h, p0/M, z6.h, z4.h\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z12.h, p0/M, z2.h, z4.h\n"
+ "fmla z13.h, p0/M, z5.h, z4.h\n"
"ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n"
- "fmla z16.h, p0/M, z2.h, z3.h\n"
- "fmla z17.h, p0/M, z7.h, z1.h\n"
+ "fmla z14.h, p0/M, z6.h, z3.h\n"
+ "fmla z15.h, p0/M, z2.h, z3.h\n"
+ "addvl x22, x22, #6\n"
+ "fmla z16.h, p0/M, z5.h, z3.h\n"
"ld1rh { z0.h }, p0/Z, [%x[Apanel], #28]\n"
- "fmla z18.h, p0/M, z6.h, z1.h\n"
- "fmla z19.h, p0/M, z2.h, z1.h\n"
+ "fmla z17.h, p0/M, z6.h, z1.h\n"
+ "fmla z18.h, p0/M, z2.h, z1.h\n"
+ "fmla z19.h, p0/M, z5.h, z1.h\n"
"ld1rh { z1.h }, p0/Z, [%x[Apanel], #30]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla z20.h, p0/M, z7.h, z5.h\n"
- "fmla z21.h, p0/M, z6.h, z5.h\n"
- "fmla z22.h, p0/M, z2.h, z5.h\n"
- "fmla z23.h, p0/M, z7.h, z4.h\n"
+ "fmla z20.h, p0/M, z6.h, z7.h\n"
+ "fmla z21.h, p0/M, z2.h, z7.h\n"
+ "fmla z22.h, p0/M, z5.h, z7.h\n"
+ "fmla z23.h, p0/M, z6.h, z4.h\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
- "fmla z24.h, p0/M, z6.h, z4.h\n"
- "fmla z25.h, p0/M, z2.h, z4.h\n"
+ "fmla z24.h, p0/M, z2.h, z4.h\n"
+ "fmla z25.h, p0/M, z5.h, z4.h\n"
"ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
- "fmla z26.h, p0/M, z7.h, z0.h\n"
- "fmla z27.h, p0/M, z6.h, z0.h\n"
- "fmla z28.h, p0/M, z2.h, z0.h\n"
- "fmla z29.h, p0/M, z7.h, z1.h\n"
+ "fmla z26.h, p0/M, z6.h, z0.h\n"
+ "fmla z27.h, p0/M, z2.h, z0.h\n"
+ "fmla z28.h, p0/M, z5.h, z0.h\n"
+ "fmla z29.h, p0/M, z6.h, z1.h\n"
"ld1h { z0.h }, p0/Z, [x22]\n"
- "fmla z30.h, p0/M, z6.h, z1.h\n"
- "fmla z31.h, p0/M, z2.h, z1.h\n"
+ "fmla z30.h, p0/M, z2.h, z1.h\n"
+ "fmla z31.h, p0/M, z5.h, z1.h\n"
"ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
"ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
@@ -199,20 +199,19 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx(
"ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
"ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
- "addvl x22, x22, #3\n"
+ "fmla z8.h, p0/M, z6.h, z3.h\n"
"ld1rh { z2.h }, p0/Z, [%x[Apanel], #2]\n"
"ld1rh { z1.h }, p0/Z, [%x[Apanel], #4]\n"
- "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n"
- "fmla z8.h, p0/M, z6.h, z3.h\n"
"fmla z9.h, p0/M, z5.h, z3.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n"
"fmla z10.h, p0/M, z4.h, z3.h\n"
"fmla z11.h, p0/M, z6.h, z2.h\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
"fmla z12.h, p0/M, z5.h, z2.h\n"
"fmla z13.h, p0/M, z4.h, z2.h\n"
- "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
"fmla z14.h, p0/M, z6.h, z1.h\n"
"fmla z15.h, p0/M, z5.h, z1.h\n"
+ "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n"
"fmla z16.h, p0/M, z4.h, z1.h\n"
"fmla z17.h, p0/M, z6.h, z0.h\n"
"ld1rh { z1.h }, p0/Z, [%x[Apanel], #12]\n"
@@ -221,9 +220,10 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx(
"ld1rh { z0.h }, p0/Z, [%x[Apanel], #14]\n"
"fmla z20.h, p0/M, z6.h, z3.h\n"
"fmla z21.h, p0/M, z5.h, z3.h\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
+ "addvl x22, x22, #3\n"
"fmla z22.h, p0/M, z4.h, z3.h\n"
"fmla z23.h, p0/M, z6.h, z2.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
"fmla z24.h, p0/M, z5.h, z2.h\n"
"fmla z25.h, p0/M, z4.h, z2.h\n"
"fmla z26.h, p0/M, z6.h, z1.h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
index 09180c8f36..865d011ac1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
@@ -54,22 +54,22 @@ void sve_interleaved_fp16_mla_8x3VL(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
"mov z10.b, #0x0\n"
- "ld1h { z2.h }, p0/Z, [x22]\n"
"mov z11.b, #0x0\n"
+ "ld1h { z2.h }, p0/Z, [x22]\n"
"mov z12.b, #0x0\n"
- "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
"mov z13.b, #0x0\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
+ "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
"mov z16.b, #0x0\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
"mov z17.b, #0x0\n"
"mov z18.b, #0x0\n"
- "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
"mov z19.b, #0x0\n"
"mov z20.b, #0x0\n"
"mov z21.b, #0x0\n"
@@ -147,12 +147,12 @@ void sve_interleaved_fp16_mla_8x3VL(
"fmla z31.h, z1.h, z7.h[7]\n"
"bge 3b\n"
"4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x10\n"
- "addvl x22, x22, #3\n"
"fmla z8.h, z2.h, z0.h[0]\n"
"fmla z11.h, z2.h, z0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
"fmla z14.h, z2.h, z0.h[2]\n"
"fmla z17.h, z2.h, z0.h[3]\n"
+ "addvl x22, x22, #3\n"
"fmla z20.h, z2.h, z0.h[4]\n"
"fmla z23.h, z2.h, z0.h[5]\n"
"fmla z26.h, z2.h, z0.h[6]\n"
@@ -176,16 +176,16 @@ void sve_interleaved_fp16_mla_8x3VL(
"cbz x20, 5f\n"
"ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
"ld1h { z2.h }, p0/Z, [x22]\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z8.h, z2.h, z3.h[0]\n"
"ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
"ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
- "fmla z8.h, z2.h, z3.h[0]\n"
"fmla z11.h, z2.h, z3.h[1]\n"
"fmla z14.h, z2.h, z3.h[2]\n"
"fmla z17.h, z2.h, z3.h[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
"fmla z20.h, z2.h, z3.h[4]\n"
"fmla z23.h, z2.h, z3.h[5]\n"
+ "addvl x22, x22, #3\n"
"fmla z26.h, z2.h, z3.h[6]\n"
"fmla z29.h, z2.h, z3.h[7]\n"
"fmla z9.h, z1.h, z3.h[0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
index 89d65083f4..000866346f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
@@ -40,8 +40,7 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx( ARGLIST );
class cls_sve_interleaved_fp32_mla_8x3VL
{
public:
- typedef float lhs_operand_type;
- typedef float rhs_operand_type;
+ typedef float operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -63,8 +62,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 1, 1> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
index 0006fddb2a..ee9f58f811 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
@@ -54,31 +54,31 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
- "mov z10.b, #0x0\n"
"ld1w { z0.s }, p0/Z, [x22]\n"
+ "mov z10.b, #0x0\n"
"mov z11.b, #0x0\n"
- "mov z12.b, #0x0\n"
"ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
+ "mov z12.b, #0x0\n"
"mov z13.b, #0x0\n"
+ "ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n"
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
"mov z16.b, #0x0\n"
- "ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n"
"mov z17.b, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
"mov z18.b, #0x0\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
"mov z19.b, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
"mov z20.b, #0x0\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
"mov z21.b, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
"mov z22.b, #0x0\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
"mov z23.b, #0x0\n"
"mov z24.b, #0x0\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
@@ -92,7 +92,7 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx(
"fmla z9.s, p0/M, z1.s, z3.s\n"
"sub x20, x20, #0x2\n"
"fmla z10.s, p0/M, z2.s, z3.s\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
"fmla z11.s, p0/M, z0.s, z4.s\n"
"fmla z12.s, p0/M, z1.s, z4.s\n"
"fmla z13.s, p0/M, z2.s, z4.s\n"
@@ -101,63 +101,63 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx(
"fmla z15.s, p0/M, z1.s, z5.s\n"
"cmp x20, #0x2\n"
"fmla z16.s, p0/M, z2.s, z5.s\n"
- "ld1rw { z7.s }, p0/Z, [%x[Apanel], #24]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
"fmla z17.s, p0/M, z0.s, z6.s\n"
"fmla z18.s, p0/M, z1.s, z6.s\n"
"fmla z19.s, p0/M, z2.s, z6.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "fmla z20.s, p0/M, z0.s, z3.s\n"
- "fmla z21.s, p0/M, z1.s, z3.s\n"
- "fmla z22.s, p0/M, z2.s, z3.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z0.s, z7.s\n"
+ "fmla z21.s, p0/M, z1.s, z7.s\n"
+ "fmla z22.s, p0/M, z2.s, z7.s\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
"fmla z23.s, p0/M, z0.s, z4.s\n"
"fmla z24.s, p0/M, z1.s, z4.s\n"
"fmla z25.s, p0/M, z2.s, z4.s\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
- "fmla z26.s, p0/M, z0.s, z7.s\n"
- "fmla z27.s, p0/M, z1.s, z7.s\n"
- "fmla z28.s, p0/M, z2.s, z7.s\n"
+ "fmla z26.s, p0/M, z0.s, z3.s\n"
+ "fmla z27.s, p0/M, z1.s, z3.s\n"
+ "fmla z28.s, p0/M, z2.s, z3.s\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
- "fmla z29.s, p0/M, z0.s, z6.s\n"
- "ld1w { z7.s }, p0/Z, [x22, #3, MUL VL]\n"
- "fmla z30.s, p0/M, z1.s, z6.s\n"
- "fmla z31.s, p0/M, z2.s, z6.s\n"
- "ld1w { z6.s }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1w { z2.s }, p0/Z, [x22, #5, MUL VL]\n"
- "addvl x22, x22, #6\n"
+ "fmla z29.s, p0/M, z0.s, z5.s\n"
+ "ld1w { z6.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "fmla z30.s, p0/M, z1.s, z5.s\n"
+ "fmla z31.s, p0/M, z2.s, z5.s\n"
+ "ld1w { z2.s }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z5.s }, p0/Z, [x22, #5, MUL VL]\n"
+ "fmla z8.s, p0/M, z6.s, z7.s\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
- "fmla z8.s, p0/M, z7.s, z5.s\n"
- "fmla z11.s, p0/M, z7.s, z4.s\n"
- "fmla z9.s, p0/M, z6.s, z5.s\n"
- "fmla z12.s, p0/M, z6.s, z4.s\n"
- "fmla z10.s, p0/M, z2.s, z5.s\n"
- "fmla z13.s, p0/M, z2.s, z4.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #48]\n"
- "fmla z14.s, p0/M, z7.s, z3.s\n"
- "fmla z15.s, p0/M, z6.s, z3.s\n"
+ "fmla z9.s, p0/M, z2.s, z7.s\n"
+ "fmla z10.s, p0/M, z5.s, z7.s\n"
+ "fmla z11.s, p0/M, z6.s, z4.s\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "fmla z12.s, p0/M, z2.s, z4.s\n"
+ "fmla z13.s, p0/M, z5.s, z4.s\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
- "fmla z16.s, p0/M, z2.s, z3.s\n"
- "fmla z17.s, p0/M, z7.s, z1.s\n"
+ "fmla z14.s, p0/M, z6.s, z3.s\n"
+ "fmla z15.s, p0/M, z2.s, z3.s\n"
+ "addvl x22, x22, #6\n"
+ "fmla z16.s, p0/M, z5.s, z3.s\n"
"ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
- "fmla z18.s, p0/M, z6.s, z1.s\n"
- "fmla z19.s, p0/M, z2.s, z1.s\n"
+ "fmla z17.s, p0/M, z6.s, z1.s\n"
+ "fmla z18.s, p0/M, z2.s, z1.s\n"
+ "fmla z19.s, p0/M, z5.s, z1.s\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- "fmla z20.s, p0/M, z7.s, z5.s\n"
- "fmla z21.s, p0/M, z6.s, z5.s\n"
- "fmla z22.s, p0/M, z2.s, z5.s\n"
- "fmla z23.s, p0/M, z7.s, z4.s\n"
+ "fmla z20.s, p0/M, z6.s, z7.s\n"
+ "fmla z21.s, p0/M, z2.s, z7.s\n"
+ "fmla z22.s, p0/M, z5.s, z7.s\n"
+ "fmla z23.s, p0/M, z6.s, z4.s\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "fmla z24.s, p0/M, z6.s, z4.s\n"
- "fmla z25.s, p0/M, z2.s, z4.s\n"
+ "fmla z24.s, p0/M, z2.s, z4.s\n"
+ "fmla z25.s, p0/M, z5.s, z4.s\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "fmla z26.s, p0/M, z7.s, z0.s\n"
- "fmla z27.s, p0/M, z6.s, z0.s\n"
- "fmla z28.s, p0/M, z2.s, z0.s\n"
- "fmla z29.s, p0/M, z7.s, z1.s\n"
+ "fmla z26.s, p0/M, z6.s, z0.s\n"
+ "fmla z27.s, p0/M, z2.s, z0.s\n"
+ "fmla z28.s, p0/M, z5.s, z0.s\n"
+ "fmla z29.s, p0/M, z6.s, z1.s\n"
"ld1w { z0.s }, p0/Z, [x22]\n"
- "fmla z30.s, p0/M, z6.s, z1.s\n"
- "fmla z31.s, p0/M, z2.s, z1.s\n"
+ "fmla z30.s, p0/M, z2.s, z1.s\n"
+ "fmla z31.s, p0/M, z5.s, z1.s\n"
"ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
"ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
@@ -199,20 +199,19 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx(
"ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
"ld1w { z4.s }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "addvl x22, x22, #3\n"
+ "fmla z8.s, p0/M, z6.s, z3.s\n"
"ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
- "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
- "fmla z8.s, p0/M, z6.s, z3.s\n"
"fmla z9.s, p0/M, z5.s, z3.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
"fmla z10.s, p0/M, z4.s, z3.s\n"
"fmla z11.s, p0/M, z6.s, z2.s\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
"fmla z12.s, p0/M, z5.s, z2.s\n"
"fmla z13.s, p0/M, z4.s, z2.s\n"
- "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
"fmla z14.s, p0/M, z6.s, z1.s\n"
"fmla z15.s, p0/M, z5.s, z1.s\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
"fmla z16.s, p0/M, z4.s, z1.s\n"
"fmla z17.s, p0/M, z6.s, z0.s\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
@@ -221,9 +220,10 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx(
"ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
"fmla z20.s, p0/M, z6.s, z3.s\n"
"fmla z21.s, p0/M, z5.s, z3.s\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
+ "addvl x22, x22, #3\n"
"fmla z22.s, p0/M, z4.s, z3.s\n"
"fmla z23.s, p0/M, z6.s, z2.s\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
"fmla z24.s, p0/M, z5.s, z2.s\n"
"fmla z25.s, p0/M, z4.s, z2.s\n"
"fmla z26.s, p0/M, z6.s, z1.s\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
index 43591e9201..f459fe92c9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
@@ -54,25 +54,25 @@ void sve_interleaved_fp32_mla_8x3VL(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
+ "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
"mov z10.b, #0x0\n"
- "ld1w { z4.s }, p0/Z, [x22]\n"
"mov z11.b, #0x0\n"
+ "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
"mov z12.b, #0x0\n"
- "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
"mov z13.b, #0x0\n"
+ "ld1w { z4.s }, p0/Z, [x22]\n"
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
+ "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
"mov z16.b, #0x0\n"
- "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
"mov z17.b, #0x0\n"
+ "ld1w { z6.s }, p0/Z, [x22, #2, MUL VL]\n"
"mov z18.b, #0x0\n"
- "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
"mov z19.b, #0x0\n"
"mov z20.b, #0x0\n"
- "ld1w { z6.s }, p0/Z, [x22, #2, MUL VL]\n"
"mov z21.b, #0x0\n"
"mov z22.b, #0x0\n"
"mov z23.b, #0x0\n"
@@ -150,12 +150,12 @@ void sve_interleaved_fp32_mla_8x3VL(
"ld1w { z6.s }, p0/Z, [x22, #2, MUL VL]\n"
"bge 3b\n"
"4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "addvl x22, x22, #3\n"
"fmla z8.s, z4.s, z0.s[0]\n"
"fmla z11.s, z4.s, z0.s[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
"fmla z14.s, z4.s, z0.s[2]\n"
"fmla z17.s, z4.s, z0.s[3]\n"
+ "addvl x22, x22, #3\n"
"fmla z20.s, z4.s, z1.s[0]\n"
"fmla z23.s, z4.s, z1.s[1]\n"
"fmla z26.s, z4.s, z1.s[2]\n"
@@ -182,13 +182,13 @@ void sve_interleaved_fp32_mla_8x3VL(
"add %x[Apanel], %x[Apanel], #0x20\n"
"ld1w { z2.s }, p0/Z, [x22]\n"
"ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1w { z0.s }, p0/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
"fmla z8.s, z2.s, z4.s[0]\n"
+ "ld1w { z0.s }, p0/Z, [x22, #2, MUL VL]\n"
"fmla z11.s, z2.s, z4.s[1]\n"
"fmla z14.s, z2.s, z4.s[2]\n"
"fmla z17.s, z2.s, z4.s[3]\n"
"fmla z20.s, z2.s, z3.s[0]\n"
+ "addvl x22, x22, #3\n"
"fmla z23.s, z2.s, z3.s[1]\n"
"fmla z26.s, z2.s, z3.s[2]\n"
"fmla z29.s, z2.s, z3.s[3]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
index 3a0e7f4c20..71e8551b92 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
@@ -35,8 +35,7 @@ void sve_interleaved_fp32_mmla_8x3VL(const float *, const float *, float *, int,
class cls_sve_interleaved_fp32_mmla_8x3VL {
public:
- typedef float lhs_operand_type;
- typedef float rhs_operand_type;
+ typedef float operand_type;
typedef float result_type;
typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
@@ -58,7 +57,7 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 2, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 2, 2> transforms = {};
kern_type kernel=sve_interleaved_fp32_mmla_8x3VL;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index ac731b76ed..ed1faeccd9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -40,8 +40,7 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx( ARGLIST );
class cls_sve_interleaved_s8s32_dot_8x3VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -63,8 +62,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 4, 1> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
index 1b33014f36..0159fe4923 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
@@ -55,31 +55,31 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
"ld1b { z0.b }, p0/Z, [x22]\n"
+ "mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
"ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
+ "mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
+ "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
"mov z16.s, #0x0\n"
- "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
"mov z17.s, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
"mov z18.s, #0x0\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
"mov z19.s, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
"mov z20.s, #0x0\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
"mov z21.s, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
"mov z22.s, #0x0\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
"mov z25.s, #0x0\n"
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
@@ -93,7 +93,7 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx(
"sdot z9.s, z1.b, z3.b\n"
"sub x20, x20, #0x2\n"
"sdot z10.s, z2.b, z3.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
"sdot z11.s, z0.b, z4.b\n"
"sdot z12.s, z1.b, z4.b\n"
"sdot z13.s, z2.b, z4.b\n"
@@ -102,63 +102,63 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx(
"sdot z15.s, z1.b, z5.b\n"
"cmp x20, #0x2\n"
"sdot z16.s, z2.b, z5.b\n"
- "ld1rw { z7.s }, p0/Z, [%x[Apanel], #24]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
"sdot z17.s, z0.b, z6.b\n"
"sdot z18.s, z1.b, z6.b\n"
"sdot z19.s, z2.b, z6.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "sdot z20.s, z0.b, z3.b\n"
- "sdot z21.s, z1.b, z3.b\n"
- "sdot z22.s, z2.b, z3.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+ "sdot z20.s, z0.b, z7.b\n"
+ "sdot z21.s, z1.b, z7.b\n"
+ "sdot z22.s, z2.b, z7.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
"sdot z23.s, z0.b, z4.b\n"
"sdot z24.s, z1.b, z4.b\n"
"sdot z25.s, z2.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
- "sdot z26.s, z0.b, z7.b\n"
- "sdot z27.s, z1.b, z7.b\n"
- "sdot z28.s, z2.b, z7.b\n"
+ "sdot z26.s, z0.b, z3.b\n"
+ "sdot z27.s, z1.b, z3.b\n"
+ "sdot z28.s, z2.b, z3.b\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
- "sdot z29.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #3, MUL VL]\n"
- "sdot z30.s, z1.b, z6.b\n"
- "sdot z31.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
- "addvl x22, x22, #6\n"
+ "sdot z29.s, z0.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x22, #3, MUL VL]\n"
+ "sdot z30.s, z1.b, z5.b\n"
+ "sdot z31.s, z2.b, z5.b\n"
+ "ld1b { z2.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z7.b\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
- "sdot z8.s, z7.b, z5.b\n"
- "sdot z11.s, z7.b, z4.b\n"
- "sdot z9.s, z6.b, z5.b\n"
- "sdot z12.s, z6.b, z4.b\n"
- "sdot z10.s, z2.b, z5.b\n"
- "sdot z13.s, z2.b, z4.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #48]\n"
- "sdot z14.s, z7.b, z3.b\n"
- "sdot z15.s, z6.b, z3.b\n"
+ "sdot z9.s, z2.b, z7.b\n"
+ "sdot z10.s, z5.b, z7.b\n"
+ "sdot z11.s, z6.b, z4.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "sdot z12.s, z2.b, z4.b\n"
+ "sdot z13.s, z5.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
- "sdot z16.s, z2.b, z3.b\n"
- "sdot z17.s, z7.b, z1.b\n"
+ "sdot z14.s, z6.b, z3.b\n"
+ "sdot z15.s, z2.b, z3.b\n"
+ "addvl x22, x22, #6\n"
+ "sdot z16.s, z5.b, z3.b\n"
"ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
- "sdot z18.s, z6.b, z1.b\n"
- "sdot z19.s, z2.b, z1.b\n"
+ "sdot z17.s, z6.b, z1.b\n"
+ "sdot z18.s, z2.b, z1.b\n"
+ "sdot z19.s, z5.b, z1.b\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- "sdot z20.s, z7.b, z5.b\n"
- "sdot z21.s, z6.b, z5.b\n"
- "sdot z22.s, z2.b, z5.b\n"
- "sdot z23.s, z7.b, z4.b\n"
+ "sdot z20.s, z6.b, z7.b\n"
+ "sdot z21.s, z2.b, z7.b\n"
+ "sdot z22.s, z5.b, z7.b\n"
+ "sdot z23.s, z6.b, z4.b\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "sdot z24.s, z6.b, z4.b\n"
- "sdot z25.s, z2.b, z4.b\n"
+ "sdot z24.s, z2.b, z4.b\n"
+ "sdot z25.s, z5.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "sdot z26.s, z7.b, z0.b\n"
- "sdot z27.s, z6.b, z0.b\n"
- "sdot z28.s, z2.b, z0.b\n"
- "sdot z29.s, z7.b, z1.b\n"
+ "sdot z26.s, z6.b, z0.b\n"
+ "sdot z27.s, z2.b, z0.b\n"
+ "sdot z28.s, z5.b, z0.b\n"
+ "sdot z29.s, z6.b, z1.b\n"
"ld1b { z0.b }, p0/Z, [x22]\n"
- "sdot z30.s, z6.b, z1.b\n"
- "sdot z31.s, z2.b, z1.b\n"
+ "sdot z30.s, z2.b, z1.b\n"
+ "sdot z31.s, z5.b, z1.b\n"
"ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
"ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
@@ -200,20 +200,19 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx(
"ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
"ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "addvl x22, x22, #3\n"
+ "sdot z8.s, z6.b, z3.b\n"
"ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
- "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
- "sdot z8.s, z6.b, z3.b\n"
"sdot z9.s, z5.b, z3.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
"sdot z10.s, z4.b, z3.b\n"
"sdot z11.s, z6.b, z2.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
"sdot z12.s, z5.b, z2.b\n"
"sdot z13.s, z4.b, z2.b\n"
- "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
"sdot z14.s, z6.b, z1.b\n"
"sdot z15.s, z5.b, z1.b\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
"sdot z16.s, z4.b, z1.b\n"
"sdot z17.s, z6.b, z0.b\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
@@ -222,9 +221,10 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx(
"ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
"sdot z20.s, z6.b, z3.b\n"
"sdot z21.s, z5.b, z3.b\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
+ "addvl x22, x22, #3\n"
"sdot z22.s, z4.b, z3.b\n"
"sdot z23.s, z6.b, z2.b\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
"sdot z24.s, z5.b, z2.b\n"
"sdot z25.s, z4.b, z2.b\n"
"sdot z26.s, z6.b, z1.b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
index 1ddf171c7e..88a086fac7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
@@ -55,25 +55,25 @@ void sve_interleaved_s8s32_dot_8x3VL(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
"mov z10.s, #0x0\n"
- "ld1b { z4.b }, p0/Z, [x22]\n"
"mov z11.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
"mov z12.s, #0x0\n"
- "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
"mov z13.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
"mov z16.s, #0x0\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
"mov z17.s, #0x0\n"
+ "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
"mov z18.s, #0x0\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
- "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
"mov z21.s, #0x0\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
@@ -151,12 +151,12 @@ void sve_interleaved_s8s32_dot_8x3VL(
"ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
"bge 3b\n"
"4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "addvl x22, x22, #3\n"
"sdot z8.s, z4.b, z0.b[0]\n"
"sdot z11.s, z4.b, z0.b[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
"sdot z14.s, z4.b, z0.b[2]\n"
"sdot z17.s, z4.b, z0.b[3]\n"
+ "addvl x22, x22, #3\n"
"sdot z20.s, z4.b, z1.b[0]\n"
"sdot z23.s, z4.b, z1.b[1]\n"
"sdot z26.s, z4.b, z1.b[2]\n"
@@ -183,13 +183,13 @@ void sve_interleaved_s8s32_dot_8x3VL(
"add %x[Apanel], %x[Apanel], #0x20\n"
"ld1b { z2.b }, p0/Z, [x22]\n"
"ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
"sdot z8.s, z2.b, z4.b[0]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n"
"sdot z11.s, z2.b, z4.b[1]\n"
"sdot z14.s, z2.b, z4.b[2]\n"
"sdot z17.s, z2.b, z4.b[3]\n"
"sdot z20.s, z2.b, z3.b[0]\n"
+ "addvl x22, x22, #3\n"
"sdot z23.s, z2.b, z3.b[1]\n"
"sdot z26.s, z2.b, z3.b[2]\n"
"sdot z29.s, z2.b, z3.b[3]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index 5ba3e51e6f..d86943c9b2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -39,8 +39,7 @@ void sve_interleaved_s8s32_mmla_8x3VL( ARGLIST );
class cls_sve_interleaved_s8s32_mmla_8x3VL
{
public:
- typedef int8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
+ typedef int8_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -62,8 +61,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 8, 2> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
index 261648eebe..afc8038956 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
@@ -55,31 +55,31 @@ void sve_interleaved_s8s32_mmla_8x3VL(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
"ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
"mov z12.s, #0x0\n"
- "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
"mov z13.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
"mov z16.s, #0x0\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
"mov z17.s, #0x0\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
"mov z18.s, #0x0\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
"mov z19.s, #0x0\n"
+ "addvl x22, x22, #2\n"
"mov z20.s, #0x0\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
"mov z21.s, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
"mov z22.s, #0x0\n"
- "addvl x22, x22, #2\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
- "add %x[Apanel], %x[Apanel], #0x30\n"
"mov z25.s, #0x0\n"
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
@@ -94,77 +94,77 @@ void sve_interleaved_s8s32_mmla_8x3VL(
".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
- "ld1b { z3.b }, p0/Z, [x22]\n"
+ "ld1b { z7.b }, p0/Z, [x22]\n"
".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
".inst 0x450498da // smmla z26.s, z6.b, z4.b\n"
".inst 0x450598dd // smmla z29.s, z6.b, z5.b\n"
- "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x45039809 // smmla z9.s, z0.b, z3.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x4503980c // smmla z12.s, z0.b, z3.b\n"
+ ".inst 0x4507982f // smmla z15.s, z1.b, z7.b\n"
+ ".inst 0x45039832 // smmla z18.s, z1.b, z3.b\n"
"sub x20, x20, #0x2\n"
- ".inst 0x4507980c // smmla z12.s, z0.b, z7.b\n"
- ".inst 0x4503982f // smmla z15.s, z1.b, z3.b\n"
+ ".inst 0x45079855 // smmla z21.s, z2.b, z7.b\n"
+ ".inst 0x45039858 // smmla z24.s, z2.b, z3.b\n"
"cmp x20, #0x2\n"
- ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
- ".inst 0x45039855 // smmla z21.s, z2.b, z3.b\n"
- ".inst 0x45079858 // smmla z24.s, z2.b, z7.b\n"
- ".inst 0x450398db // smmla z27.s, z6.b, z3.b\n"
+ ".inst 0x450798db // smmla z27.s, z6.b, z7.b\n"
+ ".inst 0x450398de // smmla z30.s, z6.b, z3.b\n"
"ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n"
- ".inst 0x450798de // smmla z30.s, z6.b, z7.b\n"
- ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x4505980a // smmla z10.s, z0.b, z5.b\n"
+ ".inst 0x4504980d // smmla z13.s, z0.b, z4.b\n"
"ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
+ ".inst 0x45059830 // smmla z16.s, z1.b, z5.b\n"
+ ".inst 0x45049833 // smmla z19.s, z1.b, z4.b\n"
"ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
- ".inst 0x450498dc // smmla z28.s, z6.b, z4.b\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n"
- ".inst 0x450598df // smmla z31.s, z6.b, z5.b\n"
+ ".inst 0x45059856 // smmla z22.s, z2.b, z5.b\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
+ "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x450598dc // smmla z28.s, z6.b, z5.b\n"
+ ".inst 0x450498df // smmla z31.s, z6.b, z4.b\n"
+ "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #48]\n"
"ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n"
- "ld1b { z4.b }, p0/Z, [x22, #6, MUL VL]\n"
- "ld1b { z5.b }, p0/Z, [x22, #7, MUL VL]\n"
- "addvl x22, x22, #16\n"
+ "ld1b { z2.b }, p0/Z, [x22, #6, MUL VL]\n"
".inst 0x45039808 // smmla z8.s, z0.b, z3.b\n"
+ "ld1b { z4.b }, p0/Z, [x22, #7, MUL VL]\n"
+ "addvl x22, x22, #16\n"
".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
".inst 0x4503982e // smmla z14.s, z1.b, z3.b\n"
".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45039854 // smmla z20.s, z2.b, z3.b\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
+ ".inst 0x450398b4 // smmla z20.s, z5.b, z3.b\n"
+ ".inst 0x450798b7 // smmla z23.s, z5.b, z7.b\n"
".inst 0x450398da // smmla z26.s, z6.b, z3.b\n"
- "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
".inst 0x450798dd // smmla z29.s, z6.b, z7.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
"ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n"
- ".inst 0x45049809 // smmla z9.s, z0.b, z4.b\n"
- ".inst 0x4505980c // smmla z12.s, z0.b, z5.b\n"
- ".inst 0x4504982f // smmla z15.s, z1.b, z4.b\n"
- ".inst 0x45059832 // smmla z18.s, z1.b, z5.b\n"
- ".inst 0x45049855 // smmla z21.s, z2.b, z4.b\n"
- ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
- ".inst 0x450498db // smmla z27.s, z6.b, z4.b\n"
+ ".inst 0x45029809 // smmla z9.s, z0.b, z2.b\n"
+ ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
+ ".inst 0x4502982f // smmla z15.s, z1.b, z2.b\n"
+ ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
+ ".inst 0x450298b5 // smmla z21.s, z5.b, z2.b\n"
+ ".inst 0x450498b8 // smmla z24.s, z5.b, z4.b\n"
+ ".inst 0x450298db // smmla z27.s, z6.b, z2.b\n"
+ ".inst 0x450498de // smmla z30.s, z6.b, z4.b\n"
"ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n"
- ".inst 0x450598de // smmla z30.s, z6.b, z5.b\n"
".inst 0x4503980a // smmla z10.s, z0.b, z3.b\n"
- "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
- ".inst 0x45039830 // smmla z16.s, z1.b, z3.b\n"
"ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x45039830 // smmla z16.s, z1.b, z3.b\n"
".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
- ".inst 0x45039856 // smmla z22.s, z2.b, z3.b\n"
"ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
+ ".inst 0x450398b6 // smmla z22.s, z5.b, z3.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
".inst 0x450398dc // smmla z28.s, z6.b, z3.b\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
".inst 0x450798df // smmla z31.s, z6.b, z7.b\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
"addvl x22, x22, #-4\n"
"bge 3b\n"
"4:" // main loop skip
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
@@ -172,54 +172,54 @@ void sve_interleaved_s8s32_mmla_8x3VL(
"ld1b { z6.b }, p0/Z, [x22]\n"
".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x450498fa // smmla z26.s, z7.b, z4.b\n"
+ ".inst 0x450598fd // smmla z29.s, z7.b, z5.b\n"
"ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
"ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
".inst 0x45069809 // smmla z9.s, z0.b, z6.b\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- ".inst 0x4507980c // smmla z12.s, z0.b, z7.b\n"
+ ".inst 0x4503980c // smmla z12.s, z0.b, z3.b\n"
".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- "addvl x22, x22, #4\n"
- ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x45039832 // smmla z18.s, z1.b, z3.b\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- ".inst 0x45079858 // smmla z24.s, z2.b, z7.b\n"
- ".inst 0x4506987b // smmla z27.s, z3.b, z6.b\n"
- ".inst 0x4507987e // smmla z30.s, z3.b, z7.b\n"
+ ".inst 0x45039858 // smmla z24.s, z2.b, z3.b\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0x450698fb // smmla z27.s, z7.b, z6.b\n"
+ ".inst 0x450398fe // smmla z30.s, z7.b, z3.b\n"
".inst 0x4505980a // smmla z10.s, z0.b, z5.b\n"
".inst 0x4504980d // smmla z13.s, z0.b, z4.b\n"
".inst 0x45059830 // smmla z16.s, z1.b, z5.b\n"
".inst 0x45049833 // smmla z19.s, z1.b, z4.b\n"
".inst 0x45059856 // smmla z22.s, z2.b, z5.b\n"
".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
- ".inst 0x4505987c // smmla z28.s, z3.b, z5.b\n"
- ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n"
+ ".inst 0x450598fc // smmla z28.s, z7.b, z5.b\n"
+ ".inst 0x450498ff // smmla z31.s, z7.b, z4.b\n"
"cbz x20, 5f\n"
"ld1b { z1.b }, p0/Z, [x22]\n"
"ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n"
"ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n"
"ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x450098eb // smmla z11.s, z7.b, z0.b\n"
"ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n"
"ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n"
- "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x450098eb // smmla z11.s, z7.b, z0.b\n"
".inst 0x450198ce // smmla z14.s, z6.b, z1.b\n"
".inst 0x450098d1 // smmla z17.s, z6.b, z0.b\n"
".inst 0x450198b4 // smmla z20.s, z5.b, z1.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n"
".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n"
- "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
"ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n"
".inst 0x450398e9 // smmla z9.s, z7.b, z3.b\n"
".inst 0x450298ec // smmla z12.s, z7.b, z2.b\n"
- ".inst 0x450398cf // smmla z15.s, z6.b, z3.b\n"
"addvl x22, x22, #6\n"
+ ".inst 0x450398cf // smmla z15.s, z6.b, z3.b\n"
".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
".inst 0x450398b5 // smmla z21.s, z5.b, z3.b\n"
".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
".inst 0x4503989b // smmla z27.s, z4.b, z3.b\n"
@@ -233,53 +233,53 @@ void sve_interleaved_s8s32_mmla_8x3VL(
".inst 0x4501989c // smmla z28.s, z4.b, z1.b\n"
".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"5:" // multiply loop done
- "uzp1 z2.d, z8.d, z11.d\n"
+ "uzp1 z0.d, z8.d, z11.d\n"
"uzp2 z8.d, z8.d, z11.d\n"
- "subs x23, x23, #0x1\n"
- "uzp1 z1.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z9.d, z12.d\n"
"uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
"uzp1 z0.d, z10.d, z13.d\n"
"uzp2 z10.d, z10.d, z13.d\n"
- "st1w { z2.s }, p0, [%x[Cpanel]]\n"
- "uzp1 z3.d, z14.d, z17.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "uzp1 z0.d, z14.d, z17.d\n"
"uzp2 z14.d, z14.d, z17.d\n"
- "st1w { z1.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "uzp1 z17.d, z15.d, z18.d\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"uzp2 z15.d, z15.d, z18.d\n"
- "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "uzp1 z2.d, z16.d, z19.d\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"uzp2 z16.d, z16.d, z19.d\n"
- "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "uzp1 z1.d, z20.d, z23.d\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
"uzp2 z20.d, z20.d, z23.d\n"
- "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "uzp1 z0.d, z21.d, z24.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
"uzp2 z21.d, z21.d, z24.d\n"
- "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
- "uzp1 z23.d, z22.d, z25.d\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
"uzp2 z22.d, z22.d, z25.d\n"
- "st1w { z3.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
- "uzp1 z19.d, z26.d, z29.d\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
"uzp2 z26.d, z26.d, z29.d\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
- "addvl %x[Cpanel], %x[Cpanel], #16\n"
- "uzp1 z18.d, z27.d, z30.d\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
"uzp2 z27.d, z27.d, z30.d\n"
- "uzp1 z17.d, z28.d, z31.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
"uzp2 z28.d, z28.d, z31.d\n"
- "st1w { z2.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
- "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
- "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
- "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
- "st1w { z1.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
- "st1w { z0.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
- "st1w { z23.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
"st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
"st1w { z21.s }, p0, [%x[Cpanel]]\n"
"st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z19.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z18.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8s8s32_mmla_8x3VL.hpp
deleted file mode 100644
index 072ffee1cc..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8s8s32_mmla_8x3VL.hpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../std_transforms_sve.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- const uint8_t *, const int8_t *, \
- int32_t *, int, int, int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void sve_interleaved_u8s8s32_mmla_8x3VL( ARGLIST );
-
-class cls_sve_interleaved_u8s8s32_mmla_8x3VL
-{
-public:
- typedef uint8_t lhs_operand_type;
- typedef int8_t rhs_operand_type;
- typedef int32_t result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<int32_t>() * 3;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 8;
- }
-
-
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 8, 2> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
-
- if (std::is_same<T, uint32_t>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 61.97, 4.11, 7.93 };
- case CPUModel::A510:
- return { 43.18, 3.57, 2.89 };
- case CPUModel::V1:
- return { 123.47, 5.03, 11.76 };
- }
- }
-
-
- if (std::is_same<T, uint8_t>::value) {
- switch (ci->get_cpu_model()) {
- default:
- return { 62.00, 4.08, 0.51 };
- case CPUModel::A510:
- return { 38.02, 1.85, 0.28 };
- case CPUModel::V1:
- return { 95.28, 7.99, 0.79 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=sve_interleaved_u8s8s32_mmla_8x3VL;
- cls_sve_interleaved_u8s8s32_mmla_8x3VL(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8s8s32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8s8s32_mmla_8x3VL/generic.cpp
deleted file mode 100644
index 50a1713b89..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8s8s32_mmla_8x3VL/generic.cpp
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * Copyright (c) 2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void sve_interleaved_u8s8s32_mmla_8x3VL(
- const uint8_t *Apanel,
- const int8_t *Bpanel,
- int32_t *Cpanel,
- int ablocks,
- int bblocks,
- int K) {
-
- struct KernelArgs {
- size_t K = {};
- const int8_t *Bpanel = {};
- size_t bblocks = {};
- } ka;
-
- ka.K = (K/8) - 1;
- ka.Bpanel = Bpanel;
- ka.bblocks = bblocks;
-
- __asm__ __volatile__(
- "ptrue p0.b\n"
- "1:" // Height loop
- "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "mov x21, %x[Apanel]\n"
- "2:" // Width loop
- "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x21\n"
- "mov z8.s, #0x0\n"
- "mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
- "ld1b { z4.b }, p0/Z, [x22]\n"
- "mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
- "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
- "mov z13.s, #0x0\n"
- "mov z14.s, #0x0\n"
- "mov z15.s, #0x0\n"
- "mov z16.s, #0x0\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
- "mov z17.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
- "mov z19.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
- "mov z21.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "addvl x22, x22, #2\n"
- "mov z23.s, #0x0\n"
- "mov z24.s, #0x0\n"
- "add %x[Apanel], %x[Apanel], #0x30\n"
- "mov z25.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "mov z27.s, #0x0\n"
- "mov z28.s, #0x0\n"
- "mov z29.s, #0x0\n"
- "mov z30.s, #0x0\n"
- "mov z31.s, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- "ld1rqb { z6.b }, p0/Z, [%x[Apanel]]\n"
- ".inst 0x45849808 // usmmla z8.s, z0.b, z4.b\n"
- ".inst 0x4585980b // usmmla z11.s, z0.b, z5.b\n"
- ".inst 0x4584982e // usmmla z14.s, z1.b, z4.b\n"
- ".inst 0x45859831 // usmmla z17.s, z1.b, z5.b\n"
- "ld1b { z3.b }, p0/Z, [x22]\n"
- ".inst 0x45849854 // usmmla z20.s, z2.b, z4.b\n"
- ".inst 0x45859857 // usmmla z23.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x458498da // usmmla z26.s, z6.b, z4.b\n"
- ".inst 0x458598dd // usmmla z29.s, z6.b, z5.b\n"
- "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x45839809 // usmmla z9.s, z0.b, z3.b\n"
- "sub x20, x20, #0x2\n"
- ".inst 0x4587980c // usmmla z12.s, z0.b, z7.b\n"
- ".inst 0x4583982f // usmmla z15.s, z1.b, z3.b\n"
- "cmp x20, #0x2\n"
- ".inst 0x45879832 // usmmla z18.s, z1.b, z7.b\n"
- ".inst 0x45839855 // usmmla z21.s, z2.b, z3.b\n"
- ".inst 0x45879858 // usmmla z24.s, z2.b, z7.b\n"
- ".inst 0x458398db // usmmla z27.s, z6.b, z3.b\n"
- "ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n"
- ".inst 0x458798de // usmmla z30.s, z6.b, z7.b\n"
- ".inst 0x4584980a // usmmla z10.s, z0.b, z4.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x4585980d // usmmla z13.s, z0.b, z5.b\n"
- ".inst 0x45849830 // usmmla z16.s, z1.b, z4.b\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n"
- ".inst 0x45849856 // usmmla z22.s, z2.b, z4.b\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
- ".inst 0x45859859 // usmmla z25.s, z2.b, z5.b\n"
- ".inst 0x458498dc // usmmla z28.s, z6.b, z4.b\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n"
- ".inst 0x458598df // usmmla z31.s, z6.b, z5.b\n"
- "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n"
- "ld1b { z4.b }, p0/Z, [x22, #6, MUL VL]\n"
- "ld1b { z5.b }, p0/Z, [x22, #7, MUL VL]\n"
- "addvl x22, x22, #16\n"
- ".inst 0x45839808 // usmmla z8.s, z0.b, z3.b\n"
- ".inst 0x4587980b // usmmla z11.s, z0.b, z7.b\n"
- ".inst 0x4583982e // usmmla z14.s, z1.b, z3.b\n"
- ".inst 0x45879831 // usmmla z17.s, z1.b, z7.b\n"
- ".inst 0x45839854 // usmmla z20.s, z2.b, z3.b\n"
- ".inst 0x45879857 // usmmla z23.s, z2.b, z7.b\n"
- ".inst 0x458398da // usmmla z26.s, z6.b, z3.b\n"
- "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
- ".inst 0x458798dd // usmmla z29.s, z6.b, z7.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n"
- ".inst 0x45849809 // usmmla z9.s, z0.b, z4.b\n"
- ".inst 0x4585980c // usmmla z12.s, z0.b, z5.b\n"
- ".inst 0x4584982f // usmmla z15.s, z1.b, z4.b\n"
- ".inst 0x45859832 // usmmla z18.s, z1.b, z5.b\n"
- ".inst 0x45849855 // usmmla z21.s, z2.b, z4.b\n"
- ".inst 0x45859858 // usmmla z24.s, z2.b, z5.b\n"
- ".inst 0x458498db // usmmla z27.s, z6.b, z4.b\n"
- "ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n"
- ".inst 0x458598de // usmmla z30.s, z6.b, z5.b\n"
- ".inst 0x4583980a // usmmla z10.s, z0.b, z3.b\n"
- "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
- ".inst 0x4587980d // usmmla z13.s, z0.b, z7.b\n"
- ".inst 0x45839830 // usmmla z16.s, z1.b, z3.b\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
- ".inst 0x45879833 // usmmla z19.s, z1.b, z7.b\n"
- ".inst 0x45839856 // usmmla z22.s, z2.b, z3.b\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
- ".inst 0x45879859 // usmmla z25.s, z2.b, z7.b\n"
- ".inst 0x458398dc // usmmla z28.s, z6.b, z3.b\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
- ".inst 0x458798df // usmmla z31.s, z6.b, z7.b\n"
- "add %x[Apanel], %x[Apanel], #0x80\n"
- "addvl x22, x22, #-4\n"
- "bge 3b\n"
- "4:" // main loop skip
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
- ".inst 0x45849808 // usmmla z8.s, z0.b, z4.b\n"
- ".inst 0x4585980b // usmmla z11.s, z0.b, z5.b\n"
- ".inst 0x4584982e // usmmla z14.s, z1.b, z4.b\n"
- ".inst 0x45859831 // usmmla z17.s, z1.b, z5.b\n"
- "ld1b { z6.b }, p0/Z, [x22]\n"
- ".inst 0x45849854 // usmmla z20.s, z2.b, z4.b\n"
- ".inst 0x45859857 // usmmla z23.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x4584987a // usmmla z26.s, z3.b, z4.b\n"
- ".inst 0x4585987d // usmmla z29.s, z3.b, z5.b\n"
- "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x45869809 // usmmla z9.s, z0.b, z6.b\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- ".inst 0x4587980c // usmmla z12.s, z0.b, z7.b\n"
- ".inst 0x4586982f // usmmla z15.s, z1.b, z6.b\n"
- "addvl x22, x22, #4\n"
- ".inst 0x45879832 // usmmla z18.s, z1.b, z7.b\n"
- ".inst 0x45869855 // usmmla z21.s, z2.b, z6.b\n"
- ".inst 0x45879858 // usmmla z24.s, z2.b, z7.b\n"
- ".inst 0x4586987b // usmmla z27.s, z3.b, z6.b\n"
- ".inst 0x4587987e // usmmla z30.s, z3.b, z7.b\n"
- ".inst 0x4585980a // usmmla z10.s, z0.b, z5.b\n"
- ".inst 0x4584980d // usmmla z13.s, z0.b, z4.b\n"
- ".inst 0x45859830 // usmmla z16.s, z1.b, z5.b\n"
- ".inst 0x45849833 // usmmla z19.s, z1.b, z4.b\n"
- ".inst 0x45859856 // usmmla z22.s, z2.b, z5.b\n"
- ".inst 0x45849859 // usmmla z25.s, z2.b, z4.b\n"
- ".inst 0x4585987c // usmmla z28.s, z3.b, z5.b\n"
- ".inst 0x4584987f // usmmla z31.s, z3.b, z4.b\n"
- "cbz x20, 5f\n"
- "ld1b { z1.b }, p0/Z, [x22]\n"
- "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
- "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n"
- "ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n"
- "ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x458198e8 // usmmla z8.s, z7.b, z1.b\n"
- "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x458098eb // usmmla z11.s, z7.b, z0.b\n"
- ".inst 0x458198ce // usmmla z14.s, z6.b, z1.b\n"
- ".inst 0x458098d1 // usmmla z17.s, z6.b, z0.b\n"
- ".inst 0x458198b4 // usmmla z20.s, z5.b, z1.b\n"
- ".inst 0x458098b7 // usmmla z23.s, z5.b, z0.b\n"
- ".inst 0x4581989a // usmmla z26.s, z4.b, z1.b\n"
- "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
- ".inst 0x4580989d // usmmla z29.s, z4.b, z0.b\n"
- "ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x458398e9 // usmmla z9.s, z7.b, z3.b\n"
- ".inst 0x458298ec // usmmla z12.s, z7.b, z2.b\n"
- ".inst 0x458398cf // usmmla z15.s, z6.b, z3.b\n"
- "addvl x22, x22, #6\n"
- ".inst 0x458298d2 // usmmla z18.s, z6.b, z2.b\n"
- ".inst 0x458398b5 // usmmla z21.s, z5.b, z3.b\n"
- ".inst 0x458298b8 // usmmla z24.s, z5.b, z2.b\n"
- ".inst 0x4583989b // usmmla z27.s, z4.b, z3.b\n"
- ".inst 0x4582989e // usmmla z30.s, z4.b, z2.b\n"
- ".inst 0x458198ea // usmmla z10.s, z7.b, z1.b\n"
- ".inst 0x458098ed // usmmla z13.s, z7.b, z0.b\n"
- ".inst 0x458198d0 // usmmla z16.s, z6.b, z1.b\n"
- ".inst 0x458098d3 // usmmla z19.s, z6.b, z0.b\n"
- ".inst 0x458198b6 // usmmla z22.s, z5.b, z1.b\n"
- ".inst 0x458098b9 // usmmla z25.s, z5.b, z0.b\n"
- ".inst 0x4581989c // usmmla z28.s, z4.b, z1.b\n"
- ".inst 0x4580989f // usmmla z31.s, z4.b, z0.b\n"
- "5:" // multiply loop done
- "uzp1 z2.d, z8.d, z11.d\n"
- "uzp2 z8.d, z8.d, z11.d\n"
- "subs x23, x23, #0x1\n"
- "uzp1 z1.d, z9.d, z12.d\n"
- "uzp2 z9.d, z9.d, z12.d\n"
- "uzp1 z0.d, z10.d, z13.d\n"
- "uzp2 z10.d, z10.d, z13.d\n"
- "st1w { z2.s }, p0, [%x[Cpanel]]\n"
- "uzp1 z3.d, z14.d, z17.d\n"
- "uzp2 z14.d, z14.d, z17.d\n"
- "st1w { z1.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "uzp1 z17.d, z15.d, z18.d\n"
- "uzp2 z15.d, z15.d, z18.d\n"
- "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "uzp1 z2.d, z16.d, z19.d\n"
- "uzp2 z16.d, z16.d, z19.d\n"
- "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "uzp1 z1.d, z20.d, z23.d\n"
- "uzp2 z20.d, z20.d, z23.d\n"
- "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "uzp1 z0.d, z21.d, z24.d\n"
- "uzp2 z21.d, z21.d, z24.d\n"
- "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
- "uzp1 z23.d, z22.d, z25.d\n"
- "uzp2 z22.d, z22.d, z25.d\n"
- "st1w { z3.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
- "uzp1 z19.d, z26.d, z29.d\n"
- "uzp2 z26.d, z26.d, z29.d\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
- "addvl %x[Cpanel], %x[Cpanel], #16\n"
- "uzp1 z18.d, z27.d, z30.d\n"
- "uzp2 z27.d, z27.d, z30.d\n"
- "uzp1 z17.d, z28.d, z31.d\n"
- "uzp2 z28.d, z28.d, z31.d\n"
- "st1w { z2.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
- "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
- "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
- "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
- "st1w { z1.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
- "st1w { z0.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
- "st1w { z23.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
- "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
- "st1w { z21.s }, p0, [%x[Cpanel]]\n"
- "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z19.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z18.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
- "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
- "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
- "addvl %x[Cpanel], %x[Cpanel], #8\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index bcb3279adb..171c810c8f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -40,8 +40,7 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx( ARGLIST );
class cls_sve_interleaved_u8u32_dot_8x3VL
{
public:
- typedef uint8_t lhs_operand_type;
- typedef uint8_t rhs_operand_type;
+ typedef uint8_t operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -63,8 +62,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 4, 1> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
index 171cf38fa6..e2151ef41a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
@@ -55,31 +55,31 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
"ld1b { z0.b }, p0/Z, [x22]\n"
+ "mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
- "mov z12.s, #0x0\n"
"ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
+ "mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
+ "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
"mov z16.s, #0x0\n"
- "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
"mov z17.s, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
"mov z18.s, #0x0\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
"mov z19.s, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
"mov z20.s, #0x0\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
"mov z21.s, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
"mov z22.s, #0x0\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
"mov z25.s, #0x0\n"
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
@@ -93,7 +93,7 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx(
"udot z9.s, z1.b, z3.b\n"
"sub x20, x20, #0x2\n"
"udot z10.s, z2.b, z3.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
"udot z11.s, z0.b, z4.b\n"
"udot z12.s, z1.b, z4.b\n"
"udot z13.s, z2.b, z4.b\n"
@@ -102,63 +102,63 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx(
"udot z15.s, z1.b, z5.b\n"
"cmp x20, #0x2\n"
"udot z16.s, z2.b, z5.b\n"
- "ld1rw { z7.s }, p0/Z, [%x[Apanel], #24]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
"udot z17.s, z0.b, z6.b\n"
"udot z18.s, z1.b, z6.b\n"
"udot z19.s, z2.b, z6.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "udot z20.s, z0.b, z3.b\n"
- "udot z21.s, z1.b, z3.b\n"
- "udot z22.s, z2.b, z3.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+ "udot z20.s, z0.b, z7.b\n"
+ "udot z21.s, z1.b, z7.b\n"
+ "udot z22.s, z2.b, z7.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
"udot z23.s, z0.b, z4.b\n"
"udot z24.s, z1.b, z4.b\n"
"udot z25.s, z2.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
- "udot z26.s, z0.b, z7.b\n"
- "udot z27.s, z1.b, z7.b\n"
- "udot z28.s, z2.b, z7.b\n"
+ "udot z26.s, z0.b, z3.b\n"
+ "udot z27.s, z1.b, z3.b\n"
+ "udot z28.s, z2.b, z3.b\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
- "udot z29.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #3, MUL VL]\n"
- "udot z30.s, z1.b, z6.b\n"
- "udot z31.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
- "addvl x22, x22, #6\n"
+ "udot z29.s, z0.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x22, #3, MUL VL]\n"
+ "udot z30.s, z1.b, z5.b\n"
+ "udot z31.s, z2.b, z5.b\n"
+ "ld1b { z2.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z7.b\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
- "udot z8.s, z7.b, z5.b\n"
- "udot z11.s, z7.b, z4.b\n"
- "udot z9.s, z6.b, z5.b\n"
- "udot z12.s, z6.b, z4.b\n"
- "udot z10.s, z2.b, z5.b\n"
- "udot z13.s, z2.b, z4.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #48]\n"
- "udot z14.s, z7.b, z3.b\n"
- "udot z15.s, z6.b, z3.b\n"
+ "udot z9.s, z2.b, z7.b\n"
+ "udot z10.s, z5.b, z7.b\n"
+ "udot z11.s, z6.b, z4.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "udot z12.s, z2.b, z4.b\n"
+ "udot z13.s, z5.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
- "udot z16.s, z2.b, z3.b\n"
- "udot z17.s, z7.b, z1.b\n"
+ "udot z14.s, z6.b, z3.b\n"
+ "udot z15.s, z2.b, z3.b\n"
+ "addvl x22, x22, #6\n"
+ "udot z16.s, z5.b, z3.b\n"
"ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
- "udot z18.s, z6.b, z1.b\n"
- "udot z19.s, z2.b, z1.b\n"
+ "udot z17.s, z6.b, z1.b\n"
+ "udot z18.s, z2.b, z1.b\n"
+ "udot z19.s, z5.b, z1.b\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- "udot z20.s, z7.b, z5.b\n"
- "udot z21.s, z6.b, z5.b\n"
- "udot z22.s, z2.b, z5.b\n"
- "udot z23.s, z7.b, z4.b\n"
+ "udot z20.s, z6.b, z7.b\n"
+ "udot z21.s, z2.b, z7.b\n"
+ "udot z22.s, z5.b, z7.b\n"
+ "udot z23.s, z6.b, z4.b\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "udot z24.s, z6.b, z4.b\n"
- "udot z25.s, z2.b, z4.b\n"
+ "udot z24.s, z2.b, z4.b\n"
+ "udot z25.s, z5.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "udot z26.s, z7.b, z0.b\n"
- "udot z27.s, z6.b, z0.b\n"
- "udot z28.s, z2.b, z0.b\n"
- "udot z29.s, z7.b, z1.b\n"
+ "udot z26.s, z6.b, z0.b\n"
+ "udot z27.s, z2.b, z0.b\n"
+ "udot z28.s, z5.b, z0.b\n"
+ "udot z29.s, z6.b, z1.b\n"
"ld1b { z0.b }, p0/Z, [x22]\n"
- "udot z30.s, z6.b, z1.b\n"
- "udot z31.s, z2.b, z1.b\n"
+ "udot z30.s, z2.b, z1.b\n"
+ "udot z31.s, z5.b, z1.b\n"
"ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
"ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
@@ -200,20 +200,19 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx(
"ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
"ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "addvl x22, x22, #3\n"
+ "udot z8.s, z6.b, z3.b\n"
"ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
- "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
- "udot z8.s, z6.b, z3.b\n"
"udot z9.s, z5.b, z3.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
"udot z10.s, z4.b, z3.b\n"
"udot z11.s, z6.b, z2.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
"udot z12.s, z5.b, z2.b\n"
"udot z13.s, z4.b, z2.b\n"
- "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
"udot z14.s, z6.b, z1.b\n"
"udot z15.s, z5.b, z1.b\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
"udot z16.s, z4.b, z1.b\n"
"udot z17.s, z6.b, z0.b\n"
"ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
@@ -222,9 +221,10 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx(
"ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
"udot z20.s, z6.b, z3.b\n"
"udot z21.s, z5.b, z3.b\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
+ "addvl x22, x22, #3\n"
"udot z22.s, z4.b, z3.b\n"
"udot z23.s, z6.b, z2.b\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
"udot z24.s, z5.b, z2.b\n"
"udot z25.s, z4.b, z2.b\n"
"udot z26.s, z6.b, z1.b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
index dc31a73c13..eaa3ad2428 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
@@ -55,25 +55,25 @@ void sve_interleaved_u8u32_dot_8x3VL(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
"mov z10.s, #0x0\n"
- "ld1b { z4.b }, p0/Z, [x22]\n"
"mov z11.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
"mov z12.s, #0x0\n"
- "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
"mov z13.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
"mov z16.s, #0x0\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
"mov z17.s, #0x0\n"
+ "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
"mov z18.s, #0x0\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
- "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
"mov z21.s, #0x0\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
@@ -151,12 +151,12 @@ void sve_interleaved_u8u32_dot_8x3VL(
"ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
"bge 3b\n"
"4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "addvl x22, x22, #3\n"
"udot z8.s, z4.b, z0.b[0]\n"
"udot z11.s, z4.b, z0.b[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
"udot z14.s, z4.b, z0.b[2]\n"
"udot z17.s, z4.b, z0.b[3]\n"
+ "addvl x22, x22, #3\n"
"udot z20.s, z4.b, z1.b[0]\n"
"udot z23.s, z4.b, z1.b[1]\n"
"udot z26.s, z4.b, z1.b[2]\n"
@@ -183,13 +183,13 @@ void sve_interleaved_u8u32_dot_8x3VL(
"add %x[Apanel], %x[Apanel], #0x20\n"
"ld1b { z2.b }, p0/Z, [x22]\n"
"ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
"udot z8.s, z2.b, z4.b[0]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n"
"udot z11.s, z2.b, z4.b[1]\n"
"udot z14.s, z2.b, z4.b[2]\n"
"udot z17.s, z2.b, z4.b[3]\n"
"udot z20.s, z2.b, z3.b[0]\n"
+ "addvl x22, x22, #3\n"
"udot z23.s, z2.b, z3.b[1]\n"
"udot z26.s, z2.b, z3.b[2]\n"
"udot z29.s, z2.b, z3.b[3]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index f6a526d879..3bbf2bbfe4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -39,8 +39,7 @@ void sve_interleaved_u8u32_mmla_8x3VL( ARGLIST );
class cls_sve_interleaved_u8u32_mmla_8x3VL
{
public:
- typedef uint8_t lhs_operand_type;
- typedef uint8_t rhs_operand_type;
+ typedef uint8_t operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -62,8 +61,8 @@ public:
}
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 8, 2> transforms = {};
- StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
index e5389a771d..c66026104d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
@@ -55,31 +55,31 @@ void sve_interleaved_u8u32_mmla_8x3VL(
"2:" // Width loop
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
- "mov z10.s, #0x0\n"
"ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
"mov z12.s, #0x0\n"
- "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
- "cmp x20, #0x2\n"
"mov z13.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
"mov z16.s, #0x0\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
"mov z17.s, #0x0\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
"mov z18.s, #0x0\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
"mov z19.s, #0x0\n"
+ "addvl x22, x22, #2\n"
"mov z20.s, #0x0\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
"mov z21.s, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
"mov z22.s, #0x0\n"
- "addvl x22, x22, #2\n"
"mov z23.s, #0x0\n"
"mov z24.s, #0x0\n"
- "add %x[Apanel], %x[Apanel], #0x30\n"
"mov z25.s, #0x0\n"
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
@@ -94,77 +94,77 @@ void sve_interleaved_u8u32_mmla_8x3VL(
".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
- "ld1b { z3.b }, p0/Z, [x22]\n"
+ "ld1b { z7.b }, p0/Z, [x22]\n"
".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
".inst 0x45c498da // ummla z26.s, z6.b, z4.b\n"
".inst 0x45c598dd // ummla z29.s, z6.b, z5.b\n"
- "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x45c39809 // ummla z9.s, z0.b, z3.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c3980c // ummla z12.s, z0.b, z3.b\n"
+ ".inst 0x45c7982f // ummla z15.s, z1.b, z7.b\n"
+ ".inst 0x45c39832 // ummla z18.s, z1.b, z3.b\n"
"sub x20, x20, #0x2\n"
- ".inst 0x45c7980c // ummla z12.s, z0.b, z7.b\n"
- ".inst 0x45c3982f // ummla z15.s, z1.b, z3.b\n"
+ ".inst 0x45c79855 // ummla z21.s, z2.b, z7.b\n"
+ ".inst 0x45c39858 // ummla z24.s, z2.b, z3.b\n"
"cmp x20, #0x2\n"
- ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
- ".inst 0x45c39855 // ummla z21.s, z2.b, z3.b\n"
- ".inst 0x45c79858 // ummla z24.s, z2.b, z7.b\n"
- ".inst 0x45c398db // ummla z27.s, z6.b, z3.b\n"
+ ".inst 0x45c798db // ummla z27.s, z6.b, z7.b\n"
+ ".inst 0x45c398de // ummla z30.s, z6.b, z3.b\n"
"ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n"
- ".inst 0x45c798de // ummla z30.s, z6.b, z7.b\n"
- ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c5980a // ummla z10.s, z0.b, z5.b\n"
+ ".inst 0x45c4980d // ummla z13.s, z0.b, z4.b\n"
"ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
+ ".inst 0x45c59830 // ummla z16.s, z1.b, z5.b\n"
+ ".inst 0x45c49833 // ummla z19.s, z1.b, z4.b\n"
"ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
- ".inst 0x45c498dc // ummla z28.s, z6.b, z4.b\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n"
- ".inst 0x45c598df // ummla z31.s, z6.b, z5.b\n"
+ ".inst 0x45c59856 // ummla z22.s, z2.b, z5.b\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
+ "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x45c598dc // ummla z28.s, z6.b, z5.b\n"
+ ".inst 0x45c498df // ummla z31.s, z6.b, z4.b\n"
+ "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #48]\n"
"ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n"
- "ld1b { z4.b }, p0/Z, [x22, #6, MUL VL]\n"
- "ld1b { z5.b }, p0/Z, [x22, #7, MUL VL]\n"
- "addvl x22, x22, #16\n"
+ "ld1b { z2.b }, p0/Z, [x22, #6, MUL VL]\n"
".inst 0x45c39808 // ummla z8.s, z0.b, z3.b\n"
+ "ld1b { z4.b }, p0/Z, [x22, #7, MUL VL]\n"
+ "addvl x22, x22, #16\n"
".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
".inst 0x45c3982e // ummla z14.s, z1.b, z3.b\n"
".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c39854 // ummla z20.s, z2.b, z3.b\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
+ ".inst 0x45c398b4 // ummla z20.s, z5.b, z3.b\n"
+ ".inst 0x45c798b7 // ummla z23.s, z5.b, z7.b\n"
".inst 0x45c398da // ummla z26.s, z6.b, z3.b\n"
- "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
".inst 0x45c798dd // ummla z29.s, z6.b, z7.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
"ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n"
- ".inst 0x45c49809 // ummla z9.s, z0.b, z4.b\n"
- ".inst 0x45c5980c // ummla z12.s, z0.b, z5.b\n"
- ".inst 0x45c4982f // ummla z15.s, z1.b, z4.b\n"
- ".inst 0x45c59832 // ummla z18.s, z1.b, z5.b\n"
- ".inst 0x45c49855 // ummla z21.s, z2.b, z4.b\n"
- ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
- ".inst 0x45c498db // ummla z27.s, z6.b, z4.b\n"
+ ".inst 0x45c29809 // ummla z9.s, z0.b, z2.b\n"
+ ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
+ ".inst 0x45c2982f // ummla z15.s, z1.b, z2.b\n"
+ ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
+ ".inst 0x45c298b5 // ummla z21.s, z5.b, z2.b\n"
+ ".inst 0x45c498b8 // ummla z24.s, z5.b, z4.b\n"
+ ".inst 0x45c298db // ummla z27.s, z6.b, z2.b\n"
+ ".inst 0x45c498de // ummla z30.s, z6.b, z4.b\n"
"ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n"
- ".inst 0x45c598de // ummla z30.s, z6.b, z5.b\n"
".inst 0x45c3980a // ummla z10.s, z0.b, z3.b\n"
- "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
- ".inst 0x45c39830 // ummla z16.s, z1.b, z3.b\n"
"ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x45c39830 // ummla z16.s, z1.b, z3.b\n"
".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
- ".inst 0x45c39856 // ummla z22.s, z2.b, z3.b\n"
"ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
+ ".inst 0x45c398b6 // ummla z22.s, z5.b, z3.b\n"
+ ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
".inst 0x45c398dc // ummla z28.s, z6.b, z3.b\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
".inst 0x45c798df // ummla z31.s, z6.b, z7.b\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
"addvl x22, x22, #-4\n"
"bge 3b\n"
"4:" // main loop skip
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
@@ -172,54 +172,54 @@ void sve_interleaved_u8u32_mmla_8x3VL(
"ld1b { z6.b }, p0/Z, [x22]\n"
".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x45c498fa // ummla z26.s, z7.b, z4.b\n"
+ ".inst 0x45c598fd // ummla z29.s, z7.b, z5.b\n"
"ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
"ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
".inst 0x45c69809 // ummla z9.s, z0.b, z6.b\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- ".inst 0x45c7980c // ummla z12.s, z0.b, z7.b\n"
+ ".inst 0x45c3980c // ummla z12.s, z0.b, z3.b\n"
".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- "addvl x22, x22, #4\n"
- ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c39832 // ummla z18.s, z1.b, z3.b\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
- ".inst 0x45c79858 // ummla z24.s, z2.b, z7.b\n"
- ".inst 0x45c6987b // ummla z27.s, z3.b, z6.b\n"
- ".inst 0x45c7987e // ummla z30.s, z3.b, z7.b\n"
+ ".inst 0x45c39858 // ummla z24.s, z2.b, z3.b\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0x45c698fb // ummla z27.s, z7.b, z6.b\n"
+ ".inst 0x45c398fe // ummla z30.s, z7.b, z3.b\n"
".inst 0x45c5980a // ummla z10.s, z0.b, z5.b\n"
".inst 0x45c4980d // ummla z13.s, z0.b, z4.b\n"
".inst 0x45c59830 // ummla z16.s, z1.b, z5.b\n"
".inst 0x45c49833 // ummla z19.s, z1.b, z4.b\n"
".inst 0x45c59856 // ummla z22.s, z2.b, z5.b\n"
".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
- ".inst 0x45c5987c // ummla z28.s, z3.b, z5.b\n"
- ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n"
+ ".inst 0x45c598fc // ummla z28.s, z7.b, z5.b\n"
+ ".inst 0x45c498ff // ummla z31.s, z7.b, z4.b\n"
"cbz x20, 5f\n"
"ld1b { z1.b }, p0/Z, [x22]\n"
"ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45c198e8 // ummla z8.s, z7.b, z1.b\n"
"ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n"
"ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x45c098eb // ummla z11.s, z7.b, z0.b\n"
"ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n"
"ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x45c198e8 // ummla z8.s, z7.b, z1.b\n"
- "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x45c098eb // ummla z11.s, z7.b, z0.b\n"
".inst 0x45c198ce // ummla z14.s, z6.b, z1.b\n"
".inst 0x45c098d1 // ummla z17.s, z6.b, z0.b\n"
".inst 0x45c198b4 // ummla z20.s, z5.b, z1.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
".inst 0x45c098b7 // ummla z23.s, z5.b, z0.b\n"
".inst 0x45c1989a // ummla z26.s, z4.b, z1.b\n"
- "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
"ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n"
".inst 0x45c398e9 // ummla z9.s, z7.b, z3.b\n"
".inst 0x45c298ec // ummla z12.s, z7.b, z2.b\n"
- ".inst 0x45c398cf // ummla z15.s, z6.b, z3.b\n"
"addvl x22, x22, #6\n"
+ ".inst 0x45c398cf // ummla z15.s, z6.b, z3.b\n"
".inst 0x45c298d2 // ummla z18.s, z6.b, z2.b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
".inst 0x45c398b5 // ummla z21.s, z5.b, z3.b\n"
".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n"
".inst 0x45c3989b // ummla z27.s, z4.b, z3.b\n"
@@ -233,53 +233,53 @@ void sve_interleaved_u8u32_mmla_8x3VL(
".inst 0x45c1989c // ummla z28.s, z4.b, z1.b\n"
".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n"
"5:" // multiply loop done
- "uzp1 z2.d, z8.d, z11.d\n"
+ "uzp1 z0.d, z8.d, z11.d\n"
"uzp2 z8.d, z8.d, z11.d\n"
- "subs x23, x23, #0x1\n"
- "uzp1 z1.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z9.d, z12.d\n"
"uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
"uzp1 z0.d, z10.d, z13.d\n"
"uzp2 z10.d, z10.d, z13.d\n"
- "st1w { z2.s }, p0, [%x[Cpanel]]\n"
- "uzp1 z3.d, z14.d, z17.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "uzp1 z0.d, z14.d, z17.d\n"
"uzp2 z14.d, z14.d, z17.d\n"
- "st1w { z1.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "uzp1 z17.d, z15.d, z18.d\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"uzp2 z15.d, z15.d, z18.d\n"
- "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "uzp1 z2.d, z16.d, z19.d\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"uzp2 z16.d, z16.d, z19.d\n"
- "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "uzp1 z1.d, z20.d, z23.d\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
"uzp2 z20.d, z20.d, z23.d\n"
- "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "uzp1 z0.d, z21.d, z24.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
"uzp2 z21.d, z21.d, z24.d\n"
- "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
- "uzp1 z23.d, z22.d, z25.d\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
"uzp2 z22.d, z22.d, z25.d\n"
- "st1w { z3.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
- "uzp1 z19.d, z26.d, z29.d\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
"uzp2 z26.d, z26.d, z29.d\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
- "addvl %x[Cpanel], %x[Cpanel], #16\n"
- "uzp1 z18.d, z27.d, z30.d\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
"uzp2 z27.d, z27.d, z30.d\n"
- "uzp1 z17.d, z28.d, z31.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
"uzp2 z28.d, z28.d, z31.d\n"
- "st1w { z2.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
- "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
- "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
- "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
- "st1w { z1.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
- "st1w { z0.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
- "st1w { z23.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
"st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
"st1w { z21.s }, p0, [%x[Cpanel]]\n"
"st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z19.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z18.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp b/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp
index 57f779c498..ea00cccae5 100644
--- a/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp
+++ b/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp
@@ -26,9 +26,9 @@
namespace arm_gemm {
struct PerformanceParameters {
- float kernel_macs_cycle;
- float prepare_bytes_cycle = 0.0f;
- float merge_bytes_cycle = 0.0f;
+ float kernel_macs_cycle;
+ float prepare_bytes_cycle = 0.0f;
+ float merge_bytes_cycle = 0.0f;
PerformanceParameters(float k) : kernel_macs_cycle(k) { }
PerformanceParameters(float k, float p, float m) : kernel_macs_cycle(k), prepare_bytes_cycle(p), merge_bytes_cycle(m) { }
diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
index 1a90cf7d89..d35825c428 100644
--- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
@@ -35,9 +35,9 @@ namespace arm_gemm {
/* Quantized wrapper - do an integer GEMM and wrap around the quantization. */
template<typename To, typename Tr, typename Tgemm>
-class QuantizeWrapper : public GemmCommon<To, To, Tr> {
+class QuantizeWrapper : public GemmCommon<To, Tr> {
private:
- UniqueGemmCommon<To, To, Tgemm> _subgemm = nullptr;
+ UniqueGemmCommon<To, Tgemm> _subgemm = nullptr;
int32_t *_row_sums = nullptr;
int32_t *_col_sums = nullptr;
Requantize32 _params;
@@ -111,7 +111,7 @@ public:
QuantizeWrapper(const GemmArgs &args, const Requantize32 &qp) : _params(qp), _args(args), _barrier(args._maxthreads) {
GemmArgs newargs = GemmArgs(args._ci, args._Msize, args._Nsize, args._Ksize, args._Ksections, args._nbatches, args._nmulti, args._indirect_input, Activation(), args._maxthreads);
- _subgemm = gemm<To, To, Tgemm>(newargs);
+ _subgemm = gemm<To, Tgemm>(newargs);
if (_subgemm == nullptr) {
return;
@@ -122,7 +122,7 @@ public:
const To *B, const int ldb, const int B_multi_stride,
Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
const Tr *bias, const int bias_multi_stride) override {
- GemmCommon<To, To, Tr>::set_arrays(A, lda, A_batch_stride, A_multi_stride, B, ldb, B_multi_stride, C, ldc, C_batch_stride, C_multi_stride, bias, bias_multi_stride);
+ GemmCommon<To, Tr>::set_arrays(A, lda, A_batch_stride, A_multi_stride, B, ldb, B_multi_stride, C, ldc, C_batch_stride, C_multi_stride, bias, bias_multi_stride);
arrays_set = true;
set_child_arrays();
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
index f24d5956e5..6da9f4be0e 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -745,10 +745,6 @@ template void requantize_block_32(const Requantize32 &qp, unsigned int width, un
const uint32_t *input, unsigned int in_stride, uint8_t *output, unsigned int out_stride,
const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);
-template void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
- const int32_t *input, unsigned int in_stride, uint8_t *output, unsigned int out_stride,
- const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);
-
/*
* Routine (and helpers) to compute row sums needed for offset correction.
*
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
index e43eb8a09d..a9cbf4ec8d 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
@@ -41,24 +41,24 @@ namespace arm_gemm {
* The optional 'block' parameter is for kernels using dot-product type
* instructions like UDOT and SDOT.
*/
-template<typename TInput, typename TWeight, typename TResult, unsigned int height, unsigned int width, unsigned int block=1, bool integrate_sums=false>
+template<typename TOperand, typename TResult, unsigned int height, unsigned int width, unsigned int block=1, bool integrate_sums=false>
class StdTransformsFixed
{
public:
template<typename TIn>
- void PrepareA(TInput *out, const TIn *in, const int stride, const int y0,
+ void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) const {
Interleave<height, block, VLType::None>(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
template<typename TIn>
- void PrepareA_indirect(TInput *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0,
+ void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0,
const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
IndirectInterleave<height, block, VLType::None>(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
template<typename TIn>
- void PrepareA_convolution(TInput *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen,
+ void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen,
const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
ConvolutionInterleave<height, block, VLType::None>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
@@ -68,7 +68,7 @@ public:
}
template<typename TIn>
- void PrepareB(TWeight *out, const TIn *in, const int stride, const int x0,
+ void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
const int xmax, const int k0, const int kmax, bool transposed) const {
assert(!transposed);
Transform<width, block, true>(out, in, stride, x0, xmax, k0, kmax);
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed_trB.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed_trB.hpp
index ec3cad0385..1db716455f 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed_trB.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed_trB.hpp
@@ -42,24 +42,24 @@ namespace arm_gemm {
* The optional 'block' parameter is for kernels using dot-product type
* instructions like UDOT and SDOT.
*/
-template<typename TInput, typename TWeight, typename TResult, unsigned int height, unsigned int width, unsigned int block=1, bool integrate_sums=false>
+template<typename TOperand, typename TResult, unsigned int height, unsigned int width, unsigned int block=1, bool integrate_sums=false>
class StdTransformsFixedTRB
{
public:
template<typename TIn>
- void PrepareA(TInput *out, const TIn *in, const int stride, const int y0,
+ void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) const {
Interleave<height, block, VLType::None>(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
template<typename TIn>
- void PrepareA_indirect(TInput *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0,
+ void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0,
const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
IndirectInterleave<height, block, VLType::None>(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
template<typename TIn>
- void PrepareA_convolution(TInput *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen,
+ void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen,
const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
ConvolutionInterleave<height, block, VLType::None>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
@@ -69,7 +69,7 @@ public:
}
template<typename TIn>
- void PrepareB(TWeight *out, const TIn *in, const int stride, const int x0,
+ void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
const int xmax, const int k0, const int kmax, bool transposed) const {
if (transposed) {
Transform<width, block, false>(out, in, stride, x0, xmax, k0, kmax);
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
index 32d597f4af..c516bfc456 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
@@ -39,24 +39,24 @@ namespace arm_gemm {
* The optional 'block' parameter is for kernels using dot-product type
* instructions like UDOT and SDOT.
*/
-template<typename TInput, typename TWeight, typename TResult, unsigned int height, unsigned int width_vectors, unsigned int block=1, unsigned int mmla=1, bool integrate_sums=false>
+template<typename TOperand, typename TResult, unsigned int height, unsigned int width_vectors, unsigned int block=1, unsigned int mmla=1, bool integrate_sums=false>
class StdTransformsSVE
{
public:
template<typename TIn>
- void PrepareA(TInput *out, const TIn *in, const int stride, const int y0,
+ void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
Interleave<height, block, VLType::None>(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
template<typename TIn>
- void PrepareA_indirect(TInput *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0,
+ void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0,
const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
IndirectInterleave<height, block, VLType::None>(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
template<typename TIn>
- void PrepareA_convolution(TInput *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen,
+ void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen,
const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
ConvolutionInterleave<height, block, VLType::None>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
@@ -66,7 +66,7 @@ public:
}
template<typename TIn>
- void PrepareB(TWeight *out, const TIn *in, const int stride, const int x0,
+ void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
const int xmax, const int k0, const int kmax, bool transposed) {
assert (!transposed);
Transform<width_vectors, block, true, VLType::SVE>(out, in, stride, x0, xmax, k0, kmax);
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
index 3690727f11..16e0822782 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -61,8 +61,8 @@ void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
template <>
void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
__asm volatile (
- "VLD1.32 {d0-d3}, [%[in0]]!\n"
- "VST1.32 {d0-d3}, [%[out]]\n"
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n"
ASM_PREFETCH("[%[in0], #192]")
: [in0] "+r" (in0),
[out] "+r" (out)
@@ -74,13 +74,13 @@ void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint
template <>
void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
__asm volatile (
- "VLD1.32 {d0-d3}, [%[in0]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n"
ASM_PREFETCH("[%[in0], #192]")
- "VLD1.32 {d0-d3}, [%[in1]]!\n"
- "VST1.32 {d0-d3}, [%[out]]\n"
+ "VLD1.32 {d0-d3}, [%[in1]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n"
ASM_PREFETCH("[%[in1], #192]")
- "SUB %[out], %[out], #32\n"
+ "SUB %[out], %[out], #32\n"
: [in0] "+r" (in0),
[in1] "+r" (in1),
[out] "+r" (out)
@@ -92,19 +92,19 @@ void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint
template <>
void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
__asm __volatile (
- "VLD1.32 {d0-d3}, [%[in0]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n"
ASM_PREFETCH("[%[in0], #192]")
- "VLD1.32 {d0-d3}, [%[in1]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
+ "VLD1.32 {d0-d3}, [%[in1]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n"
ASM_PREFETCH("[%[in1], #192]")
- "VLD1.32 {d0-d3}, [%[in2]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
+ "VLD1.32 {d0-d3}, [%[in2]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n"
ASM_PREFETCH("[%[in2], #192]")
- "VLD1.32 {d0-d3}, [%[in3]]!\n"
- "VST1.32 {d0-d3}, [%[out]]\n"
+ "VLD1.32 {d0-d3}, [%[in3]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n"
ASM_PREFETCH("[%[in3], #192]")
- "SUB %[out], %[out], #96\n"
+ "SUB %[out], %[out], #96\n"
: [in0] "+r" (in0),
[in1] "+r" (in1),
[in2] "+r" (in2),
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
index 618992c481..af5ecf5a8b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
@@ -34,281 +34,235 @@ void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t widt
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
"mov x24, %x[width]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x25, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
"cmp x24, #0x20\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Column loop
"ldr q15, [x25], #0x10\n"
- "ldr q14, [x22], #0x10\n"
+ "ldr q14, [x23], #0x10\n"
"sub x24, x24, #0x20\n"
- "ldr q13, [x21], #0x10\n"
- "ldr q12, [x20], #0x10\n"
"cmp x24, #0x20\n"
+ "ldr q13, [x22], #0x10\n"
+ "ldr q12, [x20], #0x10\n"
"ldr q11, [x25], #0x10\n"
- "ldr q10, [x22], #0x10\n"
- "ldr q9, [x21], #0x10\n"
+ "ldr q10, [x23], #0x10\n"
+ "ldr q9, [x22], #0x10\n"
"ldr q8, [x20], #0x10\n"
"ldr q7, [x25], #0x10\n"
- "ldr q6, [x22], #0x10\n"
- "ldr q5, [x21], #0x10\n"
+ "ldr q6, [x23], #0x10\n"
+ "ldr q5, [x22], #0x10\n"
"ldr q4, [x20], #0x10\n"
"ldr q3, [x25], #0x10\n"
- "ldr q2, [x22], #0x10\n"
- "ldr q1, [x21], #0x10\n"
+ "ldr q2, [x23], #0x10\n"
+ "ldr q1, [x22], #0x10\n"
"ldr q0, [x20], #0x10\n"
"ldr q31, [x25], #0x10\n"
- "ldr q30, [x22], #0x10\n"
- "ldr q29, [x21], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
+ "ldr q29, [x22], #0x10\n"
"ldr q28, [x20], #0x10\n"
"ldr q27, [x25], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "ldr q25, [x21], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
"ldr q24, [x20], #0x10\n"
"ldr q23, [x25], #0x10\n"
- "ldr q22, [x22], #0x10\n"
- "ldr q21, [x21], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
"ldr q20, [x20], #0x10\n"
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "str q15, [x23, #0x0]\n"
- "str q11, [x23, #0x10]\n"
- "str q7, [x23, #0x20]\n"
- "str q3, [x23, #0x30]\n"
- "str q31, [x23, #0x40]\n"
- "str q27, [x23, #0x50]\n"
- "str q23, [x23, #0x60]\n"
- "str q19, [x23, #0x70]\n"
- "str q14, [x23, #0x80]\n"
- "str q10, [x23, #0x90]\n"
- "str q6, [x23, #0xa0]\n"
- "str q2, [x23, #0xb0]\n"
- "str q30, [x23, #0xc0]\n"
- "str q26, [x23, #0xd0]\n"
- "str q22, [x23, #0xe0]\n"
- "str q18, [x23, #0xf0]\n"
- "str q13, [x23, #0x100]\n"
- "str q9, [x23, #0x110]\n"
- "str q5, [x23, #0x120]\n"
- "str q1, [x23, #0x130]\n"
- "str q29, [x23, #0x140]\n"
- "str q25, [x23, #0x150]\n"
- "str q21, [x23, #0x160]\n"
- "str q17, [x23, #0x170]\n"
- "str q12, [x23, #0x180]\n"
- "str q8, [x23, #0x190]\n"
- "str q4, [x23, #0x1a0]\n"
- "str q0, [x23, #0x1b0]\n"
- "str q28, [x23, #0x1c0]\n"
- "str q24, [x23, #0x1d0]\n"
- "str q20, [x23, #0x1e0]\n"
- "str q16, [x23, #0x1f0]\n"
- "add x23, x23, %x[out_stride]\n"
+ "str q15, [x21, #0x0]\n"
+ "str q11, [x21, #0x10]\n"
+ "str q7, [x21, #0x20]\n"
+ "str q3, [x21, #0x30]\n"
+ "str q31, [x21, #0x40]\n"
+ "str q27, [x21, #0x50]\n"
+ "str q23, [x21, #0x60]\n"
+ "str q19, [x21, #0x70]\n"
+ "str q14, [x21, #0x80]\n"
+ "str q10, [x21, #0x90]\n"
+ "str q6, [x21, #0xa0]\n"
+ "str q2, [x21, #0xb0]\n"
+ "str q30, [x21, #0xc0]\n"
+ "str q26, [x21, #0xd0]\n"
+ "str q22, [x21, #0xe0]\n"
+ "str q18, [x21, #0xf0]\n"
+ "str q13, [x21, #0x100]\n"
+ "str q9, [x21, #0x110]\n"
+ "str q5, [x21, #0x120]\n"
+ "str q1, [x21, #0x130]\n"
+ "str q29, [x21, #0x140]\n"
+ "str q25, [x21, #0x150]\n"
+ "str q21, [x21, #0x160]\n"
+ "str q17, [x21, #0x170]\n"
+ "str q12, [x21, #0x180]\n"
+ "str q8, [x21, #0x190]\n"
+ "str q4, [x21, #0x1a0]\n"
+ "str q0, [x21, #0x1b0]\n"
+ "str q28, [x21, #0x1c0]\n"
+ "str q24, [x21, #0x1d0]\n"
+ "str q20, [x21, #0x1e0]\n"
+ "str q16, [x21, #0x1f0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Column loop skip
- "cbz x24, 10f\n"
"cmp x24, #0x10\n"
- "movi v16.4s, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "str q16, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "str q16, [x23, #0x80]\n"
- "str q16, [x23, #0x90]\n"
- "str q16, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
- "str q16, [x23, #0xc0]\n"
- "str q16, [x23, #0xd0]\n"
- "str q16, [x23, #0xe0]\n"
- "str q16, [x23, #0xf0]\n"
- "str q16, [x23, #0x100]\n"
- "str q16, [x23, #0x110]\n"
- "str q16, [x23, #0x120]\n"
- "str q16, [x23, #0x130]\n"
- "str q16, [x23, #0x140]\n"
- "str q16, [x23, #0x150]\n"
- "str q16, [x23, #0x160]\n"
- "str q16, [x23, #0x170]\n"
- "str q16, [x23, #0x180]\n"
- "str q16, [x23, #0x190]\n"
- "str q16, [x23, #0x1a0]\n"
- "str q16, [x23, #0x1b0]\n"
- "str q16, [x23, #0x1c0]\n"
- "str q16, [x23, #0x1d0]\n"
- "str q16, [x23, #0x1e0]\n"
- "str q16, [x23, #0x1f0]\n"
"blt 5f\n"
"4:" // Main row loop: width 16 loop: loop
"ldr q31, [x25], #0x10\n"
- "ldr q30, [x22], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
"sub x24, x24, #0x10\n"
- "ldr q29, [x21], #0x10\n"
- "ldr q28, [x20], #0x10\n"
"cmp x24, #0x10\n"
+ "ldr q29, [x22], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
"ldr q27, [x25], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "ldr q25, [x21], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
"ldr q24, [x20], #0x10\n"
"ldr q23, [x25], #0x10\n"
- "ldr q22, [x22], #0x10\n"
- "ldr q21, [x21], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
"ldr q20, [x20], #0x10\n"
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "str q31, [x23, #0x0]\n"
- "str q27, [x23, #0x10]\n"
- "str q23, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q30, [x23, #0x80]\n"
- "str q26, [x23, #0x90]\n"
- "str q22, [x23, #0xa0]\n"
- "str q18, [x23, #0xb0]\n"
- "str q29, [x23, #0x100]\n"
- "str q25, [x23, #0x110]\n"
- "str q21, [x23, #0x120]\n"
- "str q17, [x23, #0x130]\n"
- "str q28, [x23, #0x180]\n"
- "str q24, [x23, #0x190]\n"
- "str q20, [x23, #0x1a0]\n"
- "str q16, [x23, #0x1b0]\n"
- "add x23, x23, #0x40\n"
+ "str q31, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q23, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q30, [x21, #0x80]\n"
+ "str q26, [x21, #0x90]\n"
+ "str q22, [x21, #0xa0]\n"
+ "str q18, [x21, #0xb0]\n"
+ "str q29, [x21, #0x100]\n"
+ "str q25, [x21, #0x110]\n"
+ "str q21, [x21, #0x120]\n"
+ "str q17, [x21, #0x130]\n"
+ "str q28, [x21, #0x180]\n"
+ "str q24, [x21, #0x190]\n"
+ "str q20, [x21, #0x1a0]\n"
+ "str q16, [x21, #0x1b0]\n"
+ "add x21, x21, #0x40\n"
"bge 4b\n"
"5:" // Main row loop: width 16 loop: skip
"cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
"sub x24, x24, #0x4\n"
- "ldr q17, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
"cmp x24, #0x4\n"
- "str q19, [x23, #0x0]\n"
- "str q18, [x23, #0x80]\n"
- "str q17, [x23, #0x100]\n"
- "str q16, [x23, #0x180]\n"
- "add x23, x23, #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x80]\n"
+ "str q17, [x21, #0x100]\n"
+ "str q16, [x21, #0x180]\n"
+ "add x21, x21, #0x10\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
"cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
"ldr s19, [x25], #0x4\n"
- "ldr s18, [x22], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
"sub x24, x24, #0x1\n"
- "ldr s17, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
"cmp x24, #0x1\n"
- "str s19, [x23, #0x0]\n"
- "str s18, [x23, #0x80]\n"
- "str s17, [x23, #0x100]\n"
- "str s16, [x23, #0x180]\n"
- "add x23, x23, #0x4\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "str s19, [x21, #0x0]\n"
+ "str s18, [x21, #0x80]\n"
+ "str s17, [x21, #0x100]\n"
+ "str s16, [x21, #0x180]\n"
+ "add x21, x21, #0x4\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x4\n"
"add %x[out], %x[out], #0x200\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x20, #0x20\n"
"add %x[in], x25, %x[in_stride]\n"
- "blt 14f\n"
- "13:" // Tail row loop: Column loop
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
"ldr q23, [x25], #0x10\n"
- "sub x20, x20, #0x20\n"
"ldr q22, [x25], #0x10\n"
- "ldr q21, [x25], #0x10\n"
+ "sub x20, x20, #0x20\n"
"cmp x20, #0x20\n"
+ "ldr q21, [x25], #0x10\n"
"ldr q20, [x25], #0x10\n"
"ldr q19, [x25], #0x10\n"
"ldr q18, [x25], #0x10\n"
"ldr q17, [x25], #0x10\n"
"ldr q16, [x25], #0x10\n"
- "str q23, [x23, #0x0]\n"
- "str q22, [x23, #0x10]\n"
- "str q21, [x23, #0x20]\n"
- "str q20, [x23, #0x30]\n"
- "str q19, [x23, #0x40]\n"
- "str q18, [x23, #0x50]\n"
- "str q17, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q23, [x21, #0x0]\n"
+ "str q22, [x21, #0x10]\n"
+ "str q21, [x21, #0x20]\n"
+ "str q20, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
"cmp x20, #0x10\n"
- "movi v16.4s, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "str q16, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "blt 16f\n"
- "15:" // Tail row loop: width 16 loop: loop
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
"ldr q19, [x25], #0x10\n"
- "sub x20, x20, #0x10\n"
"ldr q18, [x25], #0x10\n"
- "ldr q17, [x25], #0x10\n"
+ "sub x20, x20, #0x10\n"
"cmp x20, #0x10\n"
+ "ldr q17, [x25], #0x10\n"
"ldr q16, [x25], #0x10\n"
- "str q19, [x23, #0x0]\n"
- "str q18, [x23, #0x10]\n"
- "str q17, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "add x23, x23, #0x40\n"
- "bge 15b\n"
- "16:" // Tail row loop: width 16 loop: skip
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
"cmp x20, #0x4\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr q16, [x25], #0x10\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
- "str q16, [x23, #0x0]\n"
- "add x23, x23, #0x10\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr s16, [x25], #0x4\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
- "str s16, [x23, #0x0]\n"
- "add x23, x23, #0x4\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x80\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
index 7d0460b3a0..ddd426e949 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
@@ -40,377 +40,362 @@ void a64_transpose_interleave_12_1x4(uint8_t *out, const uint8_t *in, size_t wid
__asm__ __volatile__(
"cmp %x[height], #0x8\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
- "mov x28, %x[width]\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x9, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "cmp x28, #0x30\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x30\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q19, [x9], #0x10\n"
- "ldr q18, [x26], #0x10\n"
- "sub x28, x28, #0x30\n"
- "ldr q17, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "cmp x28, #0x30\n"
- "ldr q27, [x23], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "ldr q25, [x21], #0x10\n"
- "ldr q24, [x20], #0x10\n"
- "ldr q23, [x9], #0x10\n"
- "ldr q22, [x26], #0x10\n"
- "zip1 v1.16b, v19.16b, v17.16b\n"
- "zip1 v0.16b, v18.16b, v16.16b\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "zip2 v15.16b, v19.16b, v17.16b\n"
- "zip2 v14.16b, v18.16b, v16.16b\n"
- "ldr q19, [x23], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v13.16b, v27.16b, v25.16b\n"
- "zip1 v12.16b, v26.16b, v24.16b\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x30\n"
+ "cmp x24, #0x30\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v31.16b, v21.16b, v17.16b\n"
+ "zip1 v22.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v14.16b, v21.16b, v17.16b\n"
+ "zip2 v13.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "zip2 v11.16b, v27.16b, v25.16b\n"
- "zip2 v10.16b, v26.16b, v24.16b\n"
- "ldr q9, [x9], #0x10\n"
- "ldr q8, [x26], #0x10\n"
- "zip1 v7.16b, v23.16b, v21.16b\n"
- "zip1 v6.16b, v22.16b, v20.16b\n"
- "ldr q31, [x25], #0x10\n"
- "ldr q30, [x24], #0x10\n"
- "zip2 v5.16b, v23.16b, v21.16b\n"
- "zip2 v4.16b, v22.16b, v20.16b\n"
- "ldr q29, [x23], #0x10\n"
- "ldr q28, [x22], #0x10\n"
- "zip1 v27.16b, v19.16b, v17.16b\n"
- "zip1 v26.16b, v18.16b, v16.16b\n"
- "ldr q25, [x21], #0x10\n"
- "ldr q24, [x20], #0x10\n"
- "zip2 v23.16b, v19.16b, v17.16b\n"
- "zip2 v22.16b, v18.16b, v16.16b\n"
- "zip1 v3.16b, v9.16b, v31.16b\n"
- "zip1 v2.16b, v8.16b, v30.16b\n"
- "zip1 v21.16b, v1.16b, v0.16b\n"
- "zip2 v20.16b, v1.16b, v0.16b\n"
- "zip1 v1.16b, v29.16b, v25.16b\n"
- "zip1 v0.16b, v28.16b, v24.16b\n"
- "zip1 v19.16b, v15.16b, v14.16b\n"
- "zip1 v18.16b, v13.16b, v12.16b\n"
- "zip2 v17.16b, v13.16b, v12.16b\n"
- "zip1 v16.16b, v11.16b, v10.16b\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "zip2 v31.16b, v9.16b, v31.16b\n"
- "zip2 v30.16b, v8.16b, v30.16b\n"
- "str q19, [x27, #0x20]\n"
- "zip2 v29.16b, v29.16b, v25.16b\n"
- "zip2 v28.16b, v28.16b, v24.16b\n"
- "str q18, [x27, #0x30]\n"
- "zip2 v21.16b, v15.16b, v14.16b\n"
- "zip1 v20.16b, v7.16b, v6.16b\n"
- "str q17, [x27, #0x40]\n"
- "zip2 v19.16b, v7.16b, v6.16b\n"
- "zip2 v18.16b, v11.16b, v10.16b\n"
- "str q16, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 v17.16b, v27.16b, v26.16b\n"
- "zip2 v16.16b, v27.16b, v26.16b\n"
- "str q21, [x27, #0x0]\n"
- "zip1 v27.16b, v5.16b, v4.16b\n"
- "zip2 v26.16b, v5.16b, v4.16b\n"
- "str q20, [x27, #0x10]\n"
- "zip1 v25.16b, v3.16b, v2.16b\n"
- "zip1 v24.16b, v23.16b, v22.16b\n"
- "str q19, [x27, #0x20]\n"
- "zip2 v23.16b, v23.16b, v22.16b\n"
- "zip1 v22.16b, v1.16b, v0.16b\n"
- "str q18, [x27, #0x30]\n"
- "zip2 v21.16b, v3.16b, v2.16b\n"
- "zip1 v20.16b, v31.16b, v30.16b\n"
- "str q17, [x27, #0x40]\n"
- "zip2 v19.16b, v31.16b, v30.16b\n"
- "zip2 v18.16b, v1.16b, v0.16b\n"
- "str q16, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 v17.16b, v29.16b, v28.16b\n"
- "zip2 v16.16b, v29.16b, v28.16b\n"
- "str q27, [x27, #0x0]\n"
- "str q26, [x27, #0x10]\n"
- "str q25, [x27, #0x20]\n"
- "str q24, [x27, #0x30]\n"
- "str q23, [x27, #0x40]\n"
- "str q22, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "str q19, [x27, #0x20]\n"
- "str q18, [x27, #0x30]\n"
- "str q17, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip1 v30.16b, v19.16b, v17.16b\n"
+ "zip1 v29.16b, v18.16b, v16.16b\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v12.16b, v19.16b, v17.16b\n"
+ "zip2 v11.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v10.16b, v21.16b, v17.16b\n"
+ "zip1 v9.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v8.16b, v21.16b, v17.16b\n"
+ "zip2 v7.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v6.16b, v19.16b, v17.16b\n"
+ "zip1 v5.16b, v18.16b, v16.16b\n"
+ "ldr q28, [x9], #0x10\n"
+ "ldr q27, [x28], #0x10\n"
+ "zip2 v4.16b, v19.16b, v17.16b\n"
+ "zip2 v3.16b, v18.16b, v16.16b\n"
+ "ldr q26, [x27], #0x10\n"
+ "ldr q25, [x26], #0x10\n"
+ "zip1 v2.16b, v28.16b, v26.16b\n"
+ "zip1 v1.16b, v27.16b, v25.16b\n"
+ "ldr q24, [x25], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip1 v16.16b, v31.16b, v22.16b\n"
+ "zip2 v22.16b, v31.16b, v22.16b\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "zip1 v0.16b, v24.16b, v21.16b\n"
+ "zip1 v31.16b, v23.16b, v20.16b\n"
+ "zip1 v19.16b, v14.16b, v13.16b\n"
+ "zip1 v18.16b, v30.16b, v29.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip2 v16.16b, v30.16b, v29.16b\n"
+ "zip1 v17.16b, v12.16b, v11.16b\n"
+ "str q22, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "zip2 v30.16b, v28.16b, v26.16b\n"
+ "zip2 v29.16b, v27.16b, v25.16b\n"
+ "str q18, [x21, #0x30]\n"
+ "zip2 v28.16b, v24.16b, v21.16b\n"
+ "zip2 v27.16b, v23.16b, v20.16b\n"
+ "str q16, [x21, #0x40]\n"
+ "zip2 v21.16b, v14.16b, v13.16b\n"
+ "zip1 v16.16b, v10.16b, v9.16b\n"
+ "str q17, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v20.16b, v10.16b, v9.16b\n"
+ "zip2 v19.16b, v12.16b, v11.16b\n"
+ "zip1 v18.16b, v6.16b, v5.16b\n"
+ "zip2 v17.16b, v6.16b, v5.16b\n"
+ "str q21, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "zip1 v16.16b, v8.16b, v7.16b\n"
+ "zip2 v26.16b, v8.16b, v7.16b\n"
+ "str q20, [x21, #0x20]\n"
+ "zip1 v25.16b, v2.16b, v1.16b\n"
+ "zip1 v24.16b, v4.16b, v3.16b\n"
+ "str q19, [x21, #0x30]\n"
+ "zip2 v23.16b, v4.16b, v3.16b\n"
+ "zip1 v22.16b, v0.16b, v31.16b\n"
+ "str q18, [x21, #0x40]\n"
+ "zip2 v21.16b, v2.16b, v1.16b\n"
+ "zip1 v20.16b, v30.16b, v29.16b\n"
+ "str q17, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v19.16b, v30.16b, v29.16b\n"
+ "zip2 v18.16b, v0.16b, v31.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v28.16b, v27.16b\n"
+ "zip2 v16.16b, v28.16b, v27.16b\n"
+ "str q26, [x21, #0x10]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q21, [x21, #0x0]\n"
+ "str q20, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cmp x28, #0xc\n"
+ "cmp x24, #0xc\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr d19, [x9], #0x8\n"
- "ldr d23, [x26], #0x8\n"
- "sub x28, x28, #0xc\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d17, [x24], #0x8\n"
- "cmp x28, #0xc\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d25, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
- "ld1 { v19.s }[2], [x9], #0x4\n"
- "ld1 { v23.s }[2], [x26], #0x4\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v17.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v25.s }[2], [x22], #0x4\n"
- "ld1 { v21.s }[2], [x21], #0x4\n"
- "ld1 { v16.s }[2], [x20], #0x4\n"
- "zip1 v24.16b, v19.16b, v18.16b\n"
- "zip1 v20.16b, v23.16b, v17.16b\n"
- "zip2 v19.16b, v19.16b, v18.16b\n"
- "zip2 v18.16b, v23.16b, v17.16b\n"
- "zip1 v23.16b, v22.16b, v21.16b\n"
- "zip1 v17.16b, v25.16b, v16.16b\n"
- "zip2 v22.16b, v22.16b, v21.16b\n"
- "zip2 v16.16b, v25.16b, v16.16b\n"
- "zip1 v21.16b, v24.16b, v20.16b\n"
- "zip2 v20.16b, v24.16b, v20.16b\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d22, [x28], #0x8\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d17, [x20], #0x8\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v22.s }[2], [x28], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v18.s }[2], [x26], #0x4\n"
+ "zip1 v24.16b, v23.16b, v19.16b\n"
+ "zip1 v16.16b, v22.16b, v18.16b\n"
+ "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "zip2 v19.16b, v23.16b, v19.16b\n"
+ "zip2 v18.16b, v22.16b, v18.16b\n"
+ "ld1 { v20.s }[2], [x22], #0x4\n"
+ "ld1 { v17.s }[2], [x20], #0x4\n"
+ "zip1 v23.16b, v21.16b, v20.16b\n"
+ "zip1 v22.16b, v25.16b, v17.16b\n"
+ "zip2 v21.16b, v21.16b, v20.16b\n"
+ "zip2 v20.16b, v25.16b, v17.16b\n"
+ "zip1 v17.16b, v24.16b, v16.16b\n"
+ "zip2 v16.16b, v24.16b, v16.16b\n"
+ "str q17, [x21, #0x0]\n"
"zip1 v19.16b, v19.16b, v18.16b\n"
- "zip1 v18.16b, v23.16b, v17.16b\n"
- "zip2 v17.16b, v23.16b, v17.16b\n"
- "zip1 v16.16b, v22.16b, v16.16b\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "str q19, [x27, #0x20]\n"
- "str q18, [x27, #0x30]\n"
- "str q17, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip1 v18.16b, v23.16b, v22.16b\n"
+ "str q16, [x21, #0x10]\n"
+ "zip2 v17.16b, v23.16b, v22.16b\n"
+ "zip1 v16.16b, v21.16b, v20.16b\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x28, 10f\n"
- "cmp x28, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
+ "cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
- "ldr s23, [x9], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "sub x28, x28, #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s17, [x24], #0x4\n"
- "cmp x28, #0x4\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s18, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
- "zip1 v19.16b, v23.16b, v19.16b\n"
- "zip1 v17.16b, v22.16b, v17.16b\n"
- "zip1 v18.16b, v21.16b, v18.16b\n"
- "zip1 v16.16b, v20.16b, v16.16b\n"
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
- "str q17, [x27, #0x0]\n"
- "str q16, [x27, #0x30]\n"
- "add x27, x27, #0x10\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str q18, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, #0x10\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
- "cmp x28, #0x1\n"
+ "cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
- "ldr b23, [x9], #0x1\n"
- "ldr b22, [x26], #0x1\n"
- "sub x28, x28, #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b17, [x24], #0x1\n"
- "cmp x28, #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b20, [x22], #0x1\n"
- "ldr b18, [x21], #0x1\n"
- "ldr b16, [x20], #0x1\n"
- "zip1 v19.16b, v23.16b, v19.16b\n"
- "zip1 v17.16b, v22.16b, v17.16b\n"
- "zip1 v18.16b, v21.16b, v18.16b\n"
- "zip1 v16.16b, v20.16b, v16.16b\n"
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
- "str s17, [x27, #0x0]\n"
- "str s16, [x27, #0x30]\n"
- "add x27, x27, #0x4\n"
+ "ldr b20, [x25], #0x1\n"
+ "ldr b19, [x23], #0x1\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x22], #0x1\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s18, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x21, #0x30]\n"
+ "add x21, x21, #0x4\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x8\n"
"add %x[out], %x[out], #0x60\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
"mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x27, %x[out]\n"
- "add x26, x9, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "csel x25, x25, %x[pad_row], GE\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add %x[in], x26, %x[in_stride]\n"
"csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
"cmp x20, #0x30\n"
- "blt 14f\n"
- "13:" // Tail row loop: Unroll column loop
- "ldr q27, [x9], #0x10\n"
- "ldr q22, [x26], #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
"sub x20, x20, #0x30\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0x30\n"
- "ldr q26, [x9], #0x10\n"
- "ldr q25, [x26], #0x10\n"
- "ldr q20, [x25], #0x10\n"
- "ldr q19, [x24], #0x10\n"
- "ldr q24, [x9], #0x10\n"
- "zip1 v18.16b, v27.16b, v21.16b\n"
- "zip1 v16.16b, v22.16b, v17.16b\n"
- "ldr q30, [x26], #0x10\n"
- "ldr q23, [x25], #0x10\n"
- "zip2 v29.16b, v27.16b, v21.16b\n"
- "zip2 v28.16b, v22.16b, v17.16b\n"
- "ldr q17, [x24], #0x10\n"
- "zip1 v22.16b, v26.16b, v20.16b\n"
- "zip1 v21.16b, v25.16b, v19.16b\n"
- "zip2 v27.16b, v26.16b, v20.16b\n"
- "zip2 v20.16b, v25.16b, v19.16b\n"
- "zip1 v26.16b, v24.16b, v23.16b\n"
- "zip1 v25.16b, v30.16b, v17.16b\n"
- "zip1 v19.16b, v18.16b, v16.16b\n"
- "zip2 v16.16b, v18.16b, v16.16b\n"
- "zip1 v18.16b, v29.16b, v28.16b\n"
- "zip2 v24.16b, v24.16b, v23.16b\n"
- "zip2 v23.16b, v30.16b, v17.16b\n"
- "zip2 v17.16b, v29.16b, v28.16b\n"
- "str q19, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "zip1 v16.16b, v22.16b, v21.16b\n"
- "zip2 v22.16b, v22.16b, v21.16b\n"
- "str q18, [x27, #0x20]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 v21.16b, v27.16b, v20.16b\n"
- "zip2 v20.16b, v27.16b, v20.16b\n"
- "str q17, [x27, #0x0]\n"
- "zip1 v19.16b, v26.16b, v25.16b\n"
- "zip2 v18.16b, v26.16b, v25.16b\n"
- "str q16, [x27, #0x10]\n"
- "zip1 v17.16b, v24.16b, v23.16b\n"
- "zip2 v16.16b, v24.16b, v23.16b\n"
- "str q22, [x27, #0x20]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "str q19, [x27, #0x20]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q18, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Unroll column loop skip
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v31.16b, v21.16b, v17.16b\n"
+ "zip1 v30.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v29.16b, v21.16b, v17.16b\n"
+ "zip2 v28.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v27.16b, v19.16b, v17.16b\n"
+ "zip1 v26.16b, v18.16b, v16.16b\n"
+ "ldr q22, [x9], #0x10\n"
+ "ldr q21, [x28], #0x10\n"
+ "zip2 v25.16b, v19.16b, v17.16b\n"
+ "zip2 v20.16b, v18.16b, v16.16b\n"
+ "ldr q19, [x27], #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "zip1 v24.16b, v22.16b, v19.16b\n"
+ "zip1 v23.16b, v21.16b, v18.16b\n"
+ "zip1 v16.16b, v31.16b, v30.16b\n"
+ "zip2 v17.16b, v31.16b, v30.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.16b, v29.16b, v28.16b\n"
+ "str q17, [x21, #0x10]\n"
+ "zip2 v22.16b, v22.16b, v19.16b\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v21.16b, v21.16b, v18.16b\n"
+ "zip2 v18.16b, v29.16b, v28.16b\n"
+ "zip1 v16.16b, v27.16b, v26.16b\n"
+ "zip2 v17.16b, v27.16b, v26.16b\n"
+ "str q18, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "zip1 v16.16b, v25.16b, v20.16b\n"
+ "zip2 v20.16b, v25.16b, v20.16b\n"
+ "str q17, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v19.16b, v24.16b, v23.16b\n"
+ "zip2 v18.16b, v24.16b, v23.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v22.16b, v21.16b\n"
+ "zip2 v16.16b, v22.16b, v21.16b\n"
+ "str q20, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
"cmp x20, #0xc\n"
- "blt 16f\n"
- "15:" // Tail row loop: Column loop
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
"ldr d19, [x9], #0x8\n"
- "ldr d21, [x26], #0x8\n"
+ "ldr d21, [x28], #0x8\n"
"sub x20, x20, #0xc\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d16, [x24], #0x8\n"
"cmp x20, #0xc\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
"ld1 { v19.s }[2], [x9], #0x4\n"
- "ld1 { v21.s }[2], [x26], #0x4\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
- "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x28], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v16.s }[2], [x26], #0x4\n"
"zip1 v20.16b, v19.16b, v18.16b\n"
"zip1 v17.16b, v21.16b, v16.16b\n"
"zip2 v19.16b, v19.16b, v18.16b\n"
- "zip2 v16.16b, v21.16b, v16.16b\n"
- "zip1 v18.16b, v20.16b, v17.16b\n"
+ "zip2 v18.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v17.16b\n"
"zip2 v17.16b, v20.16b, v17.16b\n"
- "zip1 v16.16b, v19.16b, v16.16b\n"
- "str q18, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr s19, [x9], #0x4\n"
- "ldr s18, [x26], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
"sub x20, x20, #0x4\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s16, [x24], #0x4\n"
"cmp x20, #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str q16, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr b19, [x9], #0x1\n"
- "ldr b18, [x26], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
"sub x20, x20, #0x1\n"
- "ldr b17, [x25], #0x1\n"
- "ldr b16, [x24], #0x1\n"
"cmp x20, #0x1\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str s16, [x27, #0x0]\n"
- "add x27, x27, #0x4\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x30\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
index 1eb49e290a..4e76689523 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
@@ -41,268 +41,259 @@ void a64_transpose_interleave_12_1x8(uint8_t *out, const uint8_t *in, size_t wid
__asm__ __volatile__(
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
- "cmp %x[height], #0x7\n"
- "mov x28, %x[width]\n"
- "mov x27, %x[out]\n"
- "add x26, x9, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
"add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
- "csel x21, x21, %x[pad_row], GE\n"
- "add %x[in], x20, %x[in_stride]\n"
- "csel x20, x20, %x[pad_row], GT\n"
- "cmp %x[height], #0x5\n"
+ "cmp %x[height], #0x7\n"
+ "add %x[in], x22, %x[in_stride]\n"
"csel x22, x22, %x[pad_row], GT\n"
"csel x23, x23, %x[pad_row], GE\n"
- "cmp %x[height], #0x3\n"
+ "cmp %x[height], #0x5\n"
+ "mov x21, %x[width]\n"
"csel x24, x24, %x[pad_row], GT\n"
"csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x21, #0x30\n"
+ "mov x20, %x[out]\n"
"sub %x[height], %x[height], #0x8\n"
- "csel x26, x26, %x[pad_row], GT\n"
- "cmp x28, #0x30\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q1, [x9], #0x10\n"
- "ldr q0, [x26], #0x10\n"
- "sub x28, x28, #0x30\n"
- "ldr q31, [x25], #0x10\n"
- "ldr q28, [x24], #0x10\n"
- "cmp x28, #0x30\n"
- "ldr q27, [x23], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "ldr q25, [x21], #0x10\n"
- "ldr q24, [x20], #0x10\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q25, [x28], #0x10\n"
+ "sub x21, x21, #0x30\n"
+ "cmp x21, #0x30\n"
+ "ldr q20, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "zip1 v7.16b, v21.16b, v19.16b\n"
+ "zip1 v6.16b, v25.16b, v18.16b\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v28.16b, v20.16b, v17.16b\n"
+ "zip1 v27.16b, v24.16b, v16.16b\n"
"ldr q23, [x9], #0x10\n"
- "ldr q30, [x26], #0x10\n"
- "ldr q22, [x25], #0x10\n"
- "ldr q21, [x24], #0x10\n"
- "zip1 v15.16b, v1.16b, v27.16b\n"
- "zip1 v9.16b, v0.16b, v26.16b\n"
- "ldr q20, [x23], #0x10\n"
- "ldr q19, [x22], #0x10\n"
- "zip1 v18.16b, v31.16b, v25.16b\n"
- "zip1 v29.16b, v28.16b, v24.16b\n"
- "ldr q17, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip2 v14.16b, v1.16b, v27.16b\n"
- "zip2 v13.16b, v31.16b, v25.16b\n"
- "ldr q8, [x9], #0x10\n"
- "ldr q7, [x26], #0x10\n"
- "zip2 v12.16b, v0.16b, v26.16b\n"
- "zip2 v6.16b, v28.16b, v24.16b\n"
- "ldr q5, [x25], #0x10\n"
- "ldr q4, [x24], #0x10\n"
- "zip1 v3.16b, v23.16b, v20.16b\n"
- "zip1 v11.16b, v30.16b, v19.16b\n"
- "ldr q28, [x23], #0x10\n"
- "ldr q27, [x22], #0x10\n"
- "zip1 v2.16b, v22.16b, v17.16b\n"
- "zip1 v1.16b, v21.16b, v16.16b\n"
- "ldr q26, [x21], #0x10\n"
- "ldr q25, [x20], #0x10\n"
- "zip2 v24.16b, v23.16b, v20.16b\n"
- "zip2 v23.16b, v22.16b, v17.16b\n"
- "zip2 v22.16b, v30.16b, v19.16b\n"
- "zip2 v21.16b, v21.16b, v16.16b\n"
- "zip1 v0.16b, v8.16b, v28.16b\n"
- "zip1 v10.16b, v7.16b, v27.16b\n"
- "zip1 v31.16b, v5.16b, v26.16b\n"
- "zip1 v30.16b, v4.16b, v25.16b\n"
- "zip1 v20.16b, v15.16b, v18.16b\n"
- "zip1 v19.16b, v9.16b, v29.16b\n"
- "zip2 v18.16b, v15.16b, v18.16b\n"
- "zip2 v16.16b, v9.16b, v29.16b\n"
- "zip1 v29.16b, v14.16b, v13.16b\n"
- "zip1 v17.16b, v12.16b, v6.16b\n"
- "zip2 v9.16b, v8.16b, v28.16b\n"
- "zip2 v28.16b, v5.16b, v26.16b\n"
- "zip2 v8.16b, v7.16b, v27.16b\n"
- "zip2 v27.16b, v4.16b, v25.16b\n"
- "zip2 v7.16b, v14.16b, v13.16b\n"
- "zip2 v6.16b, v12.16b, v6.16b\n"
- "zip1 v5.16b, v3.16b, v2.16b\n"
- "zip1 v4.16b, v11.16b, v1.16b\n"
- "zip2 v3.16b, v3.16b, v2.16b\n"
- "zip2 v2.16b, v11.16b, v1.16b\n"
- "zip1 v26.16b, v24.16b, v23.16b\n"
- "zip1 v25.16b, v22.16b, v21.16b\n"
- "zip2 v24.16b, v24.16b, v23.16b\n"
- "zip2 v23.16b, v22.16b, v21.16b\n"
- "zip1 v1.16b, v0.16b, v31.16b\n"
- "zip1 v22.16b, v10.16b, v30.16b\n"
- "zip1 v21.16b, v20.16b, v19.16b\n"
- "zip2 v20.16b, v20.16b, v19.16b\n"
- "zip1 v19.16b, v18.16b, v16.16b\n"
- "zip2 v18.16b, v18.16b, v16.16b\n"
- "zip1 v16.16b, v29.16b, v17.16b\n"
- "zip2 v17.16b, v29.16b, v17.16b\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "zip2 v0.16b, v0.16b, v31.16b\n"
- "zip2 v31.16b, v10.16b, v30.16b\n"
- "str q19, [x27, #0x20]\n"
- "zip1 v30.16b, v9.16b, v28.16b\n"
- "zip1 v29.16b, v8.16b, v27.16b\n"
- "str q18, [x27, #0x30]\n"
- "zip2 v28.16b, v9.16b, v28.16b\n"
- "zip2 v27.16b, v8.16b, v27.16b\n"
- "str q16, [x27, #0x40]\n"
- "zip1 v21.16b, v7.16b, v6.16b\n"
- "zip2 v16.16b, v7.16b, v6.16b\n"
- "str q17, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 v20.16b, v5.16b, v4.16b\n"
- "zip2 v19.16b, v5.16b, v4.16b\n"
+ "ldr q22, [x28], #0x10\n"
+ "zip2 v5.16b, v21.16b, v19.16b\n"
+ "zip2 v4.16b, v20.16b, v17.16b\n"
+ "ldr q21, [x27], #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "zip2 v3.16b, v25.16b, v18.16b\n"
+ "zip2 v2.16b, v24.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "zip1 v1.16b, v23.16b, v19.16b\n"
+ "zip1 v15.16b, v22.16b, v18.16b\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v0.16b, v21.16b, v17.16b\n"
+ "zip1 v31.16b, v20.16b, v16.16b\n"
+ "ldr q26, [x9], #0x10\n"
+ "ldr q30, [x28], #0x10\n"
+ "zip2 v14.16b, v23.16b, v19.16b\n"
+ "zip2 v13.16b, v21.16b, v17.16b\n"
+ "ldr q25, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip2 v12.16b, v22.16b, v18.16b\n"
+ "zip2 v11.16b, v20.16b, v16.16b\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x24], #0x10\n"
+ "zip1 v10.16b, v26.16b, v23.16b\n"
+ "zip1 v9.16b, v30.16b, v22.16b\n"
+ "ldr q21, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip1 v29.16b, v25.16b, v21.16b\n"
+ "zip1 v8.16b, v24.16b, v17.16b\n"
+ "zip1 v19.16b, v7.16b, v28.16b\n"
+ "zip1 v16.16b, v6.16b, v27.16b\n"
+ "zip2 v28.16b, v7.16b, v28.16b\n"
+ "zip2 v18.16b, v6.16b, v27.16b\n"
+ "zip1 v27.16b, v5.16b, v4.16b\n"
+ "zip1 v20.16b, v3.16b, v2.16b\n"
+ "zip2 v7.16b, v26.16b, v23.16b\n"
+ "zip2 v26.16b, v25.16b, v21.16b\n"
+ "zip2 v6.16b, v30.16b, v22.16b\n"
+ "zip2 v25.16b, v24.16b, v17.16b\n"
+ "zip2 v5.16b, v5.16b, v4.16b\n"
+ "zip2 v4.16b, v3.16b, v2.16b\n"
+ "zip1 v3.16b, v1.16b, v0.16b\n"
+ "zip1 v2.16b, v15.16b, v31.16b\n"
+ "zip2 v1.16b, v1.16b, v0.16b\n"
+ "zip2 v0.16b, v15.16b, v31.16b\n"
+ "zip1 v31.16b, v14.16b, v13.16b\n"
+ "zip1 v30.16b, v12.16b, v11.16b\n"
+ "zip2 v24.16b, v14.16b, v13.16b\n"
+ "zip2 v23.16b, v12.16b, v11.16b\n"
+ "zip1 v22.16b, v10.16b, v29.16b\n"
+ "zip1 v21.16b, v9.16b, v8.16b\n"
+ "zip1 v17.16b, v19.16b, v16.16b\n"
+ "zip2 v16.16b, v19.16b, v16.16b\n"
+ "str q17, [x20, #0x0]\n"
+ "zip1 v19.16b, v28.16b, v18.16b\n"
+ "zip2 v18.16b, v28.16b, v18.16b\n"
+ "str q16, [x20, #0x10]\n"
+ "zip1 v17.16b, v27.16b, v20.16b\n"
+ "zip2 v16.16b, v27.16b, v20.16b\n"
+ "str q19, [x20, #0x20]\n"
+ "str q18, [x20, #0x30]\n"
+ "zip2 v29.16b, v10.16b, v29.16b\n"
+ "zip2 v20.16b, v9.16b, v8.16b\n"
+ "str q17, [x20, #0x40]\n"
+ "zip1 v28.16b, v7.16b, v26.16b\n"
+ "zip1 v27.16b, v6.16b, v25.16b\n"
+ "str q16, [x20, #0x50]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip2 v26.16b, v7.16b, v26.16b\n"
+ "zip2 v25.16b, v6.16b, v25.16b\n"
+ "zip1 v17.16b, v5.16b, v4.16b\n"
+ "zip2 v16.16b, v5.16b, v4.16b\n"
+ "str q17, [x20, #0x0]\n"
"zip1 v18.16b, v3.16b, v2.16b\n"
"zip2 v17.16b, v3.16b, v2.16b\n"
- "str q21, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "zip1 v16.16b, v26.16b, v25.16b\n"
- "zip2 v26.16b, v26.16b, v25.16b\n"
- "str q20, [x27, #0x20]\n"
- "zip1 v25.16b, v24.16b, v23.16b\n"
+ "str q16, [x20, #0x10]\n"
+ "zip1 v16.16b, v1.16b, v0.16b\n"
+ "zip2 v19.16b, v1.16b, v0.16b\n"
+ "str q18, [x20, #0x20]\n"
+ "str q17, [x20, #0x30]\n"
+ "zip1 v18.16b, v31.16b, v30.16b\n"
+ "zip2 v17.16b, v31.16b, v30.16b\n"
+ "str q16, [x20, #0x40]\n"
+ "zip1 v16.16b, v24.16b, v23.16b\n"
"zip2 v24.16b, v24.16b, v23.16b\n"
- "str q19, [x27, #0x30]\n"
- "zip1 v23.16b, v1.16b, v22.16b\n"
- "zip2 v22.16b, v1.16b, v22.16b\n"
- "str q18, [x27, #0x40]\n"
- "zip1 v21.16b, v0.16b, v31.16b\n"
- "zip2 v20.16b, v0.16b, v31.16b\n"
- "str q17, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 v19.16b, v30.16b, v29.16b\n"
- "zip2 v18.16b, v30.16b, v29.16b\n"
- "str q16, [x27, #0x0]\n"
- "zip1 v17.16b, v28.16b, v27.16b\n"
- "zip2 v16.16b, v28.16b, v27.16b\n"
- "str q26, [x27, #0x10]\n"
- "str q25, [x27, #0x20]\n"
- "str q24, [x27, #0x30]\n"
- "str q23, [x27, #0x40]\n"
- "str q22, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "str q19, [x27, #0x20]\n"
- "str q18, [x27, #0x30]\n"
- "str q17, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
+ "str q19, [x20, #0x50]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 v23.16b, v22.16b, v21.16b\n"
+ "zip2 v22.16b, v22.16b, v21.16b\n"
+ "str q18, [x20, #0x0]\n"
+ "zip1 v21.16b, v29.16b, v20.16b\n"
+ "zip2 v20.16b, v29.16b, v20.16b\n"
+ "str q17, [x20, #0x10]\n"
+ "zip1 v19.16b, v28.16b, v27.16b\n"
+ "zip2 v18.16b, v28.16b, v27.16b\n"
+ "str q16, [x20, #0x20]\n"
+ "zip1 v17.16b, v26.16b, v25.16b\n"
+ "zip2 v16.16b, v26.16b, v25.16b\n"
+ "str q24, [x20, #0x30]\n"
+ "str q23, [x20, #0x40]\n"
+ "str q22, [x20, #0x50]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "str q21, [x20, #0x0]\n"
+ "str q20, [x20, #0x10]\n"
+ "str q19, [x20, #0x20]\n"
+ "str q18, [x20, #0x30]\n"
+ "str q17, [x20, #0x40]\n"
+ "str q16, [x20, #0x50]\n"
+ "add x20, x20, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cmp x28, #0xc\n"
+ "cmp x21, #0xc\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr d22, [x9], #0x8\n"
- "ldr d23, [x26], #0x8\n"
- "sub x28, x28, #0xc\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "cmp x28, #0xc\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
- "ld1 { v22.s }[2], [x9], #0x4\n"
- "ld1 { v23.s }[2], [x26], #0x4\n"
- "ld1 { v21.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v18.s }[2], [x23], #0x4\n"
- "ld1 { v17.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v16.s }[2], [x20], #0x4\n"
- "zip1 v25.16b, v22.16b, v18.16b\n"
- "zip1 v24.16b, v23.16b, v17.16b\n"
- "zip2 v22.16b, v22.16b, v18.16b\n"
- "zip2 v23.16b, v23.16b, v17.16b\n"
- "zip1 v18.16b, v21.16b, v19.16b\n"
- "zip1 v17.16b, v20.16b, v16.16b\n"
- "zip2 v21.16b, v21.16b, v19.16b\n"
- "zip2 v16.16b, v20.16b, v16.16b\n"
- "zip1 v20.16b, v25.16b, v18.16b\n"
- "zip1 v19.16b, v24.16b, v17.16b\n"
- "zip2 v18.16b, v25.16b, v18.16b\n"
- "zip2 v17.16b, v24.16b, v17.16b\n"
- "zip1 v22.16b, v22.16b, v21.16b\n"
- "zip1 v16.16b, v23.16b, v16.16b\n"
- "zip1 v21.16b, v20.16b, v19.16b\n"
- "zip2 v20.16b, v20.16b, v19.16b\n"
- "zip1 v19.16b, v18.16b, v17.16b\n"
- "zip2 v18.16b, v18.16b, v17.16b\n"
- "zip1 v17.16b, v22.16b, v16.16b\n"
- "zip2 v16.16b, v22.16b, v16.16b\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "str q19, [x27, #0x20]\n"
- "str q18, [x27, #0x30]\n"
- "str q17, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d27, [x28], #0x8\n"
+ "sub x21, x21, #0xc\n"
+ "cmp x21, #0xc\n"
+ "ldr d21, [x27], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v27.s }[2], [x28], #0x4\n"
+ "ld1 { v21.s }[2], [x27], #0x4\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
+ "ld1 { v20.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "zip1 v25.16b, v23.16b, v20.16b\n"
+ "zip1 v24.16b, v27.16b, v19.16b\n"
+ "ld1 { v17.s }[2], [x23], #0x4\n"
+ "ld1 { v16.s }[2], [x22], #0x4\n"
+ "zip1 v22.16b, v21.16b, v17.16b\n"
+ "zip1 v18.16b, v26.16b, v16.16b\n"
+ "zip2 v23.16b, v23.16b, v20.16b\n"
+ "zip2 v21.16b, v21.16b, v17.16b\n"
+ "zip2 v20.16b, v27.16b, v19.16b\n"
+ "zip2 v17.16b, v26.16b, v16.16b\n"
+ "zip1 v19.16b, v25.16b, v22.16b\n"
+ "zip1 v16.16b, v24.16b, v18.16b\n"
+ "zip2 v22.16b, v25.16b, v22.16b\n"
+ "zip2 v18.16b, v24.16b, v18.16b\n"
+ "zip1 v21.16b, v23.16b, v21.16b\n"
+ "zip1 v20.16b, v20.16b, v17.16b\n"
+ "zip1 v17.16b, v19.16b, v16.16b\n"
+ "zip2 v16.16b, v19.16b, v16.16b\n"
+ "str q17, [x20, #0x0]\n"
+ "zip1 v19.16b, v22.16b, v18.16b\n"
+ "zip2 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [x20, #0x10]\n"
+ "zip1 v17.16b, v21.16b, v20.16b\n"
+ "zip2 v16.16b, v21.16b, v20.16b\n"
+ "str q19, [x20, #0x20]\n"
+ "str q18, [x20, #0x30]\n"
+ "str q17, [x20, #0x40]\n"
+ "str q16, [x20, #0x50]\n"
+ "add x20, x20, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x28, 10f\n"
- "cmp x28, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
+ "cmp x21, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
- "ldr s23, [x9], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "sub x28, x28, #0x4\n"
- "ldr s21, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "cmp x28, #0x4\n"
- "ldr s18, [x23], #0x4\n"
- "ldr s19, [x22], #0x4\n"
- "ldr s17, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
- "zip1 v18.16b, v23.16b, v18.16b\n"
- "zip1 v19.16b, v22.16b, v19.16b\n"
+ "ldr s18, [x9], #0x4\n"
+ "ldr s19, [x28], #0x4\n"
+ "sub x21, x21, #0x4\n"
+ "cmp x21, #0x4\n"
+ "ldr s21, [x27], #0x4\n"
+ "ldr s20, [x26], #0x4\n"
+ "ldr s17, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "zip1 v19.16b, v19.16b, v16.16b\n"
+ "ldr s17, [x23], #0x4\n"
+ "ldr s16, [x22], #0x4\n"
"zip1 v17.16b, v21.16b, v17.16b\n"
"zip1 v16.16b, v20.16b, v16.16b\n"
"zip1 v18.16b, v18.16b, v17.16b\n"
"zip1 v16.16b, v19.16b, v16.16b\n"
"zip1 v17.16b, v18.16b, v16.16b\n"
"zip2 v16.16b, v18.16b, v16.16b\n"
- "str q17, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
+ "str q17, [x20, #0x0]\n"
+ "str q16, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
- "cmp x28, #0x1\n"
+ "cmp x21, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
- "ldr b23, [x9], #0x1\n"
- "ldr b22, [x26], #0x1\n"
- "sub x28, x28, #0x1\n"
- "ldr b21, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "cmp x28, #0x1\n"
- "ldr b19, [x23], #0x1\n"
- "ldr b18, [x22], #0x1\n"
- "ldr b17, [x21], #0x1\n"
- "ldr b16, [x20], #0x1\n"
- "zip1 v19.16b, v23.16b, v19.16b\n"
- "zip1 v18.16b, v22.16b, v18.16b\n"
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "cmp x21, #0x1\n"
+ "ldr b21, [x27], #0x1\n"
+ "ldr b20, [x26], #0x1\n"
+ "ldr b17, [x25], #0x1\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v19.16b, v19.16b, v17.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b17, [x23], #0x1\n"
+ "ldr b16, [x22], #0x1\n"
"zip1 v17.16b, v21.16b, v17.16b\n"
"zip1 v16.16b, v20.16b, v16.16b\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str d16, [x27, #0x0]\n"
- "add x27, x27, #0x8\n"
+ "str d16, [x20, #0x0]\n"
+ "add x20, x20, #0x8\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x60\n"
"bge 1b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
index 94726570d4..eafa06ece1 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
@@ -40,308 +40,287 @@ void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t w
__asm__ __volatile__(
"cmp %x[height], #0x8\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
- "mov x28, %x[width]\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x9, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "cmp x28, #0x18\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q22, [x9], #0x10\n"
- "ldr q21, [x26], #0x10\n"
- "sub x28, x28, #0x18\n"
- "ldr q20, [x25], #0x10\n"
- "ldr q19, [x24], #0x10\n"
- "cmp x28, #0x18\n"
- "ldr q18, [x23], #0x10\n"
- "ldr q17, [x22], #0x10\n"
- "ldr q23, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v8.8h, v22.8h, v21.8h\n"
- "zip2 v7.8h, v22.8h, v21.8h\n"
- "ldr q22, [x9], #0x10\n"
- "ldr q21, [x26], #0x10\n"
- "zip1 v6.8h, v20.8h, v19.8h\n"
- "zip2 v5.8h, v20.8h, v19.8h\n"
- "ldr q20, [x25], #0x10\n"
- "ldr q19, [x24], #0x10\n"
- "zip1 v4.8h, v18.8h, v17.8h\n"
- "zip2 v3.8h, v18.8h, v17.8h\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "zip1 v10.8h, v19.8h, v18.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip2 v9.8h, v19.8h, v18.8h\n"
+ "zip1 v8.8h, v17.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
"ldr q18, [x23], #0x10\n"
+ "zip2 v7.8h, v17.8h, v16.8h\n"
+ "zip1 v6.8h, v19.8h, v18.8h\n"
"ldr q17, [x22], #0x10\n"
- "zip1 v2.8h, v23.8h, v16.8h\n"
- "zip2 v1.8h, v23.8h, v16.8h\n"
- "ldr q23, [x21], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "zip1 v0.8h, v22.8h, v21.8h\n"
- "zip2 v31.8h, v22.8h, v21.8h\n"
- "ldr q22, [x9], #0x10\n"
- "ldr q21, [x26], #0x10\n"
- "zip1 v30.8h, v20.8h, v19.8h\n"
- "zip2 v29.8h, v20.8h, v19.8h\n"
+ "zip2 v5.8h, v19.8h, v18.8h\n"
+ "zip1 v4.8h, v17.8h, v16.8h\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip1 v3.8h, v21.8h, v18.8h\n"
+ "zip2 v2.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v1.8h, v17.8h, v16.8h\n"
+ "cmp x24, #0x18\n"
"ldr q20, [x25], #0x10\n"
- "ldr q19, [x24], #0x10\n"
- "zip1 v28.8h, v18.8h, v17.8h\n"
- "zip2 v27.8h, v18.8h, v17.8h\n"
- "ldr q18, [x23], #0x10\n"
- "ldr q17, [x22], #0x10\n"
- "zip1 v26.8h, v23.8h, v16.8h\n"
- "zip2 v25.8h, v23.8h, v16.8h\n"
- "ldr q24, [x21], #0x10\n"
+ "ldr q19, [x23], #0x10\n"
+ "zip1 v0.8h, v20.8h, v19.8h\n"
+ "zip2 v31.8h, v21.8h, v18.8h\n"
+ "ldr q30, [x22], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip1 v28.8h, v30.8h, v29.8h\n"
+ "zip2 v27.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "zip1 v26.8h, v17.8h, v16.8h\n"
+ "zip2 v25.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v24.8h, v17.8h, v16.8h\n"
+ "zip2 v23.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip2 v22.8h, v20.8h, v19.8h\n"
+ "zip1 v21.8h, v18.8h, v17.8h\n"
+ "ldr q20, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "str q8, [x27, #0x0]\n"
- "zip1 v23.8h, v22.8h, v21.8h\n"
- "str q7, [x27, #0x10]\n"
- "zip2 v22.8h, v22.8h, v21.8h\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "str q0, [x27, #0x20]\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip1 v19.8h, v18.8h, v17.8h\n"
- "str q6, [x27, #0x30]\n"
- "zip2 v18.8h, v18.8h, v17.8h\n"
- "zip1 v17.8h, v24.8h, v16.8h\n"
- "str q5, [x27, #0x40]\n"
- "zip2 v16.8h, v24.8h, v16.8h\n"
- "str q30, [x27, #0x50]\n"
- "str q4, [x27, #0x60]\n"
- "str q3, [x27, #0x70]\n"
- "str q28, [x27, #0x80]\n"
- "str q2, [x27, #0x90]\n"
- "str q1, [x27, #0xa0]\n"
- "str q26, [x27, #0xb0]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q31, [x27, #0x0]\n"
- "str q23, [x27, #0x10]\n"
- "str q22, [x27, #0x20]\n"
- "str q29, [x27, #0x30]\n"
- "str q21, [x27, #0x40]\n"
- "str q20, [x27, #0x50]\n"
- "str q27, [x27, #0x60]\n"
- "str q19, [x27, #0x70]\n"
- "str q18, [x27, #0x80]\n"
- "str q25, [x27, #0x90]\n"
- "str q17, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
- "add x27, x27, %x[out_stride]\n"
+ "str q10, [x21, #0x0]\n"
+ "zip2 v19.8h, v18.8h, v17.8h\n"
+ "str q9, [x21, #0x10]\n"
+ "zip2 v18.8h, v30.8h, v29.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q3, [x21, #0x20]\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q8, [x21, #0x30]\n"
+ "str q7, [x21, #0x40]\n"
+ "str q1, [x21, #0x50]\n"
+ "str q6, [x21, #0x60]\n"
+ "str q5, [x21, #0x70]\n"
+ "str q0, [x21, #0x80]\n"
+ "str q4, [x21, #0x90]\n"
+ "str q2, [x21, #0xa0]\n"
+ "str q28, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q31, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q24, [x21, #0x40]\n"
+ "str q23, [x21, #0x50]\n"
+ "str q22, [x21, #0x60]\n"
+ "str q21, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cmp x28, #0xc\n"
+ "cmp x24, #0xc\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr q23, [x9], #0x10\n"
- "ldr q19, [x26], #0x10\n"
- "sub x28, x28, #0xc\n"
- "ldr q22, [x25], #0x10\n"
- "ldr q18, [x24], #0x10\n"
- "cmp x28, #0xc\n"
- "ldr q21, [x23], #0x10\n"
- "ldr q17, [x22], #0x10\n"
- "ldr q20, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v31.8h, v23.8h, v19.8h\n"
- "zip2 v30.8h, v23.8h, v19.8h\n"
- "ldr d29, [x9], #0x8\n"
- "ldr d19, [x26], #0x8\n"
- "zip1 v28.8h, v22.8h, v18.8h\n"
- "zip2 v27.8h, v22.8h, v18.8h\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d18, [x24], #0x8\n"
- "zip1 v25.8h, v21.8h, v17.8h\n"
- "zip2 v24.8h, v21.8h, v17.8h\n"
- "ldr d23, [x23], #0x8\n"
+ "ldr q17, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr q19, [x27], #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "zip1 v28.8h, v17.8h, v16.8h\n"
+ "zip2 v27.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v26.8h, v19.8h, v18.8h\n"
+ "zip2 v25.8h, v19.8h, v18.8h\n"
+ "ldr q19, [x22], #0x10\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v24.8h, v17.8h, v16.8h\n"
+ "zip2 v23.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "zip1 v21.8h, v19.8h, v18.8h\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "zip2 v19.8h, v19.8h, v18.8h\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
"ldr d17, [x22], #0x8\n"
- "zip1 v22.8h, v20.8h, v16.8h\n"
- "zip2 v21.8h, v20.8h, v16.8h\n"
- "ldr d20, [x21], #0x8\n"
"ldr d16, [x20], #0x8\n"
- "zip1 v19.8h, v29.8h, v19.8h\n"
- "str q31, [x27, #0x0]\n"
- "zip1 v18.8h, v26.8h, v18.8h\n"
- "str q30, [x27, #0x10]\n"
- "zip1 v17.8h, v23.8h, v17.8h\n"
- "zip1 v16.8h, v20.8h, v16.8h\n"
- "str q19, [x27, #0x20]\n"
- "str q28, [x27, #0x30]\n"
- "str q27, [x27, #0x40]\n"
- "str q18, [x27, #0x50]\n"
- "str q25, [x27, #0x60]\n"
- "str q24, [x27, #0x70]\n"
- "str q17, [x27, #0x80]\n"
- "str q22, [x27, #0x90]\n"
- "str q21, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q28, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q26, [x21, #0x30]\n"
+ "str q25, [x21, #0x40]\n"
+ "str q20, [x21, #0x50]\n"
+ "str q24, [x21, #0x60]\n"
+ "str q23, [x21, #0x70]\n"
+ "str q18, [x21, #0x80]\n"
+ "str q21, [x21, #0x90]\n"
+ "str q19, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x28, 10f\n"
- "cmp x28, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "str q16, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "str q16, [x27, #0x80]\n"
- "str q16, [x27, #0x90]\n"
- "str q16, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
+ "cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
- "ldr d23, [x9], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "sub x28, x28, #0x4\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d16, [x24], #0x8\n"
- "cmp x28, #0x4\n"
- "ldr d21, [x23], #0x8\n"
+ "ldr d19, [x9], #0x8\n"
+ "ldr d18, [x28], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v20.8h, v19.8h, v18.8h\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
"ldr d17, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d19, [x20], #0x8\n"
- "zip1 v18.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v16.8h\n"
- "zip1 v17.8h, v21.8h, v17.8h\n"
- "str q18, [x27, #0x0]\n"
- "str q16, [x27, #0x30]\n"
- "zip1 v16.8h, v20.8h, v19.8h\n"
- "str q17, [x27, #0x60]\n"
- "str q16, [x27, #0x90]\n"
- "add x27, x27, #0x10\n"
+ "ldr d16, [x20], #0x8\n"
+ "str q20, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q19, [x21, #0x30]\n"
+ "str q18, [x21, #0x60]\n"
+ "str q16, [x21, #0x90]\n"
+ "add x21, x21, #0x10\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
- "cmp x28, #0x1\n"
+ "cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
- "ldr h23, [x9], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "sub x28, x28, #0x1\n"
- "ldr h22, [x25], #0x2\n"
- "ldr h16, [x24], #0x2\n"
- "cmp x28, #0x1\n"
- "ldr h21, [x23], #0x2\n"
+ "ldr h19, [x9], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v20.8h, v19.8h, v18.8h\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x25], #0x2\n"
+ "ldr h16, [x23], #0x2\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
"ldr h17, [x22], #0x2\n"
- "ldr h20, [x21], #0x2\n"
- "ldr h19, [x20], #0x2\n"
- "zip1 v18.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v16.8h\n"
- "zip1 v17.8h, v21.8h, v17.8h\n"
- "str s18, [x27, #0x0]\n"
- "str s16, [x27, #0x30]\n"
- "zip1 v16.8h, v20.8h, v19.8h\n"
- "str s17, [x27, #0x60]\n"
- "str s16, [x27, #0x90]\n"
- "add x27, x27, #0x4\n"
+ "ldr h16, [x20], #0x2\n"
+ "str s20, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s19, [x21, #0x30]\n"
+ "str s18, [x21, #0x60]\n"
+ "str s16, [x21, #0x90]\n"
+ "add x21, x21, #0x4\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x8\n"
"add %x[out], %x[out], #0xc0\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x9, %x[in]\n"
"mov x20, %x[width]\n"
+ "add x28, x9, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x2\n"
- "add x26, x9, %x[in_stride]\n"
- "add %x[in], x26, %x[in_stride]\n"
- "csel x26, x26, %x[pad_row], GT\n"
+ "add %x[in], x28, %x[in_stride]\n"
+ "csel x28, x28, %x[pad_row], GT\n"
"cmp x20, #0x18\n"
- "blt 14f\n"
- "13:" // Tail row loop: Unroll column loop
- "ldr q19, [x9], #0x10\n"
- "ldr q16, [x26], #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q17, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
"sub x20, x20, #0x18\n"
- "ldr q22, [x9], #0x10\n"
- "cmp x20, #0x18\n"
- "ldr q18, [x26], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
"ldr q21, [x9], #0x10\n"
- "ldr q20, [x26], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v17.8h, v17.8h, v16.8h\n"
+ "zip1 v20.8h, v21.8h, v18.8h\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "str q22, [x21, #0x0]\n"
+ "cmp x20, #0x18\n"
+ "str q17, [x21, #0x10]\n"
+ "zip2 v18.8h, v21.8h, v18.8h\n"
"zip1 v17.8h, v19.8h, v16.8h\n"
+ "str q20, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
"zip2 v16.8h, v19.8h, v16.8h\n"
- "zip1 v19.8h, v22.8h, v18.8h\n"
- "zip2 v18.8h, v22.8h, v18.8h\n"
- "str q17, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "zip1 v17.8h, v21.8h, v20.8h\n"
- "zip2 v16.8h, v21.8h, v20.8h\n"
- "str q19, [x27, #0x20]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q18, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Unroll column loop skip
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
"cmp x20, #0xc\n"
- "blt 16f\n"
- "15:" // Tail row loop: Column loop
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
"ldr q20, [x9], #0x10\n"
- "ldr q17, [x26], #0x10\n"
+ "ldr q17, [x28], #0x10\n"
"sub x20, x20, #0xc\n"
- "ldr d19, [x9], #0x8\n"
"cmp x20, #0xc\n"
- "ldr d16, [x26], #0x8\n"
+ "ldr d19, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
"zip1 v18.8h, v20.8h, v17.8h\n"
"zip2 v17.8h, v20.8h, v17.8h\n"
"zip1 v16.8h, v19.8h, v16.8h\n"
- "str q18, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr d17, [x9], #0x8\n"
- "ldr d16, [x26], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
"zip1 v16.8h, v17.8h, v16.8h\n"
- "str q16, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr h17, [x9], #0x2\n"
- "ldr h16, [x26], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
"zip1 v16.8h, v17.8h, v16.8h\n"
- "str s16, [x27, #0x0]\n"
- "add x27, x27, #0x4\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x30\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
index b9dca66e7b..67493393a0 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
@@ -40,412 +40,388 @@ void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t w
__asm__ __volatile__(
"cmp %x[height], #0x8\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
- "mov x28, %x[width]\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x9, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "cmp x28, #0x18\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q27, [x9], #0x10\n"
- "ldr q26, [x26], #0x10\n"
- "sub x28, x28, #0x18\n"
- "ldr q25, [x25], #0x10\n"
- "ldr q24, [x24], #0x10\n"
- "cmp x28, #0x18\n"
- "ldr q23, [x23], #0x10\n"
- "ldr q22, [x22], #0x10\n"
- "ldr q21, [x21], #0x10\n"
- "ldr q20, [x20], #0x10\n"
- "ldr q19, [x9], #0x10\n"
- "ldr q18, [x26], #0x10\n"
- "zip1 v15.8h, v27.8h, v25.8h\n"
- "zip1 v14.8h, v26.8h, v24.8h\n"
- "ldr q17, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "zip2 v13.8h, v27.8h, v25.8h\n"
- "zip2 v12.8h, v26.8h, v24.8h\n"
- "ldr q1, [x23], #0x10\n"
- "ldr q0, [x22], #0x10\n"
- "zip1 v11.8h, v23.8h, v21.8h\n"
- "zip1 v10.8h, v22.8h, v20.8h\n"
- "ldr q31, [x21], #0x10\n"
- "ldr q24, [x20], #0x10\n"
- "zip2 v30.8h, v23.8h, v21.8h\n"
- "zip2 v29.8h, v22.8h, v20.8h\n"
- "ldr q23, [x9], #0x10\n"
- "ldr q22, [x26], #0x10\n"
- "zip1 v28.8h, v19.8h, v17.8h\n"
- "zip1 v27.8h, v18.8h, v16.8h\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "zip2 v9.8h, v19.8h, v17.8h\n"
- "zip2 v8.8h, v18.8h, v16.8h\n"
- "ldr q19, [x23], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v26.8h, v1.8h, v31.8h\n"
- "zip1 v25.8h, v0.8h, v24.8h\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v13.8h, v21.8h, v17.8h\n"
+ "zip1 v12.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v11.8h, v21.8h, v17.8h\n"
+ "zip2 v10.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v9.8h, v19.8h, v17.8h\n"
+ "zip1 v8.8h, v18.8h, v16.8h\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v7.8h, v19.8h, v17.8h\n"
+ "zip2 v6.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v27.8h, v21.8h, v17.8h\n"
+ "zip1 v22.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v5.8h, v21.8h, v17.8h\n"
+ "zip2 v4.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v26.8h, v19.8h, v17.8h\n"
+ "zip1 v25.8h, v18.8h, v16.8h\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v3.8h, v19.8h, v17.8h\n"
+ "zip2 v2.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v24.8h, v21.8h, v17.8h\n"
+ "zip1 v23.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v1.8h, v21.8h, v17.8h\n"
+ "zip2 v0.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "zip2 v7.8h, v1.8h, v31.8h\n"
- "zip2 v6.8h, v0.8h, v24.8h\n"
- "zip1 v5.8h, v23.8h, v21.8h\n"
- "zip1 v24.8h, v22.8h, v20.8h\n"
- "zip2 v4.8h, v23.8h, v21.8h\n"
- "zip2 v3.8h, v22.8h, v20.8h\n"
- "zip1 v2.8h, v19.8h, v17.8h\n"
- "zip1 v1.8h, v18.8h, v16.8h\n"
- "zip2 v0.8h, v19.8h, v17.8h\n"
- "zip2 v31.8h, v18.8h, v16.8h\n"
- "zip1 v16.8h, v15.8h, v14.8h\n"
- "zip2 v23.8h, v15.8h, v14.8h\n"
- "zip1 v22.8h, v13.8h, v12.8h\n"
- "zip2 v21.8h, v13.8h, v12.8h\n"
- "zip1 v20.8h, v28.8h, v27.8h\n"
- "zip2 v19.8h, v28.8h, v27.8h\n"
+ "zip1 v31.8h, v19.8h, v17.8h\n"
+ "zip1 v30.8h, v18.8h, v16.8h\n"
+ "zip2 v29.8h, v19.8h, v17.8h\n"
+ "zip2 v28.8h, v18.8h, v16.8h\n"
+ "zip1 v17.8h, v13.8h, v12.8h\n"
+ "zip2 v16.8h, v13.8h, v12.8h\n"
+ "str q17, [x21, #0x0]\n"
"zip1 v18.8h, v11.8h, v10.8h\n"
"zip2 v17.8h, v11.8h, v10.8h\n"
- "str q16, [x27, #0x0]\n"
- "zip1 v16.8h, v30.8h, v29.8h\n"
- "zip2 v30.8h, v30.8h, v29.8h\n"
- "str q23, [x27, #0x10]\n"
- "zip1 v29.8h, v26.8h, v25.8h\n"
- "zip2 v28.8h, v26.8h, v25.8h\n"
- "str q22, [x27, #0x20]\n"
- "str q21, [x27, #0x30]\n"
- "zip1 v27.8h, v9.8h, v8.8h\n"
- "zip2 v26.8h, v9.8h, v8.8h\n"
- "str q20, [x27, #0x40]\n"
- "zip1 v25.8h, v5.8h, v24.8h\n"
- "zip2 v24.8h, v5.8h, v24.8h\n"
- "str q19, [x27, #0x50]\n"
- "zip1 v23.8h, v4.8h, v3.8h\n"
- "zip2 v22.8h, v4.8h, v3.8h\n"
- "str q18, [x27, #0x60]\n"
- "zip1 v21.8h, v7.8h, v6.8h\n"
- "zip2 v20.8h, v7.8h, v6.8h\n"
- "str q17, [x27, #0x70]\n"
- "zip1 v19.8h, v2.8h, v1.8h\n"
- "zip2 v18.8h, v2.8h, v1.8h\n"
- "str q16, [x27, #0x80]\n"
- "zip1 v17.8h, v0.8h, v31.8h\n"
- "zip2 v16.8h, v0.8h, v31.8h\n"
- "str q30, [x27, #0x90]\n"
- "str q29, [x27, #0xa0]\n"
- "str q28, [x27, #0xb0]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q27, [x27, #0x0]\n"
- "str q26, [x27, #0x10]\n"
- "str q25, [x27, #0x20]\n"
- "str q24, [x27, #0x30]\n"
- "str q23, [x27, #0x40]\n"
- "str q22, [x27, #0x50]\n"
- "str q21, [x27, #0x60]\n"
- "str q20, [x27, #0x70]\n"
- "str q19, [x27, #0x80]\n"
- "str q18, [x27, #0x90]\n"
- "str q17, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
- "add x27, x27, %x[out_stride]\n"
+ "str q16, [x21, #0x10]\n"
+ "zip1 v16.8h, v27.8h, v22.8h\n"
+ "zip2 v22.8h, v27.8h, v22.8h\n"
+ "str q18, [x21, #0x20]\n"
+ "zip1 v21.8h, v9.8h, v8.8h\n"
+ "zip2 v20.8h, v9.8h, v8.8h\n"
+ "str q17, [x21, #0x30]\n"
+ "zip1 v19.8h, v7.8h, v6.8h\n"
+ "zip2 v18.8h, v7.8h, v6.8h\n"
+ "str q16, [x21, #0x40]\n"
+ "zip1 v17.8h, v26.8h, v25.8h\n"
+ "zip2 v16.8h, v26.8h, v25.8h\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "zip1 v27.8h, v5.8h, v4.8h\n"
+ "zip2 v26.8h, v5.8h, v4.8h\n"
+ "str q20, [x21, #0x70]\n"
+ "zip1 v25.8h, v24.8h, v23.8h\n"
+ "zip2 v24.8h, v24.8h, v23.8h\n"
+ "str q19, [x21, #0x80]\n"
+ "zip1 v23.8h, v1.8h, v0.8h\n"
+ "zip2 v22.8h, v1.8h, v0.8h\n"
+ "str q18, [x21, #0x90]\n"
+ "zip1 v21.8h, v3.8h, v2.8h\n"
+ "zip2 v20.8h, v3.8h, v2.8h\n"
+ "str q17, [x21, #0xa0]\n"
+ "zip1 v19.8h, v31.8h, v30.8h\n"
+ "zip2 v18.8h, v31.8h, v30.8h\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v17.8h, v29.8h, v28.8h\n"
+ "zip2 v16.8h, v29.8h, v28.8h\n"
+ "str q27, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cmp x28, #0xc\n"
+ "cmp x24, #0xc\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr q23, [x9], #0x10\n"
- "ldr q22, [x26], #0x10\n"
- "sub x28, x28, #0xc\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v25.8h, v19.8h, v17.8h\n"
+ "zip1 v24.8h, v18.8h, v16.8h\n"
"ldr q21, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "cmp x28, #0xc\n"
- "ldr q1, [x23], #0x10\n"
- "ldr q0, [x22], #0x10\n"
- "ldr q28, [x21], #0x10\n"
- "ldr q27, [x20], #0x10\n"
- "ldr d20, [x9], #0x8\n"
- "ldr d19, [x26], #0x8\n"
- "zip1 v26.8h, v23.8h, v21.8h\n"
- "zip1 v25.8h, v22.8h, v16.8h\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d17, [x24], #0x8\n"
- "zip2 v31.8h, v23.8h, v21.8h\n"
- "zip2 v24.8h, v22.8h, v16.8h\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "zip1 v30.8h, v1.8h, v28.8h\n"
- "zip1 v29.8h, v0.8h, v27.8h\n"
- "ldr d21, [x21], #0x8\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip2 v31.8h, v19.8h, v17.8h\n"
+ "zip2 v23.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v30.8h, v21.8h, v17.8h\n"
+ "zip1 v29.8h, v20.8h, v16.8h\n"
+ "ldr d19, [x9], #0x8\n"
+ "ldr d18, [x28], #0x8\n"
+ "zip2 v28.8h, v21.8h, v17.8h\n"
+ "zip2 v27.8h, v20.8h, v16.8h\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v26.8h, v19.8h, v17.8h\n"
+ "zip1 v22.8h, v18.8h, v16.8h\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "zip1 v19.8h, v25.8h, v24.8h\n"
+ "zip2 v18.8h, v25.8h, v24.8h\n"
+ "ldr d17, [x22], #0x8\n"
"ldr d16, [x20], #0x8\n"
- "zip2 v28.8h, v1.8h, v28.8h\n"
- "zip2 v27.8h, v0.8h, v27.8h\n"
- "zip1 v20.8h, v20.8h, v18.8h\n"
- "zip1 v19.8h, v19.8h, v17.8h\n"
- "zip1 v18.8h, v26.8h, v25.8h\n"
- "zip2 v17.8h, v26.8h, v25.8h\n"
- "zip1 v26.8h, v23.8h, v21.8h\n"
- "zip1 v25.8h, v22.8h, v16.8h\n"
- "zip1 v16.8h, v31.8h, v24.8h\n"
- "zip2 v24.8h, v31.8h, v24.8h\n"
- "zip1 v23.8h, v20.8h, v19.8h\n"
- "zip2 v22.8h, v20.8h, v19.8h\n"
- "str q18, [x27, #0x0]\n"
+ "zip1 v25.8h, v21.8h, v17.8h\n"
+ "zip1 v24.8h, v20.8h, v16.8h\n"
+ "zip1 v17.8h, v31.8h, v23.8h\n"
+ "zip2 v16.8h, v31.8h, v23.8h\n"
+ "str q19, [x21, #0x0]\n"
+ "zip1 v23.8h, v26.8h, v22.8h\n"
+ "zip2 v22.8h, v26.8h, v22.8h\n"
+ "str q18, [x21, #0x10]\n"
"zip1 v21.8h, v30.8h, v29.8h\n"
"zip2 v20.8h, v30.8h, v29.8h\n"
- "str q17, [x27, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
"zip1 v19.8h, v28.8h, v27.8h\n"
"zip2 v18.8h, v28.8h, v27.8h\n"
- "str q16, [x27, #0x20]\n"
- "zip1 v17.8h, v26.8h, v25.8h\n"
- "zip2 v16.8h, v26.8h, v25.8h\n"
- "str q24, [x27, #0x30]\n"
- "str q23, [x27, #0x40]\n"
- "str q22, [x27, #0x50]\n"
- "str q21, [x27, #0x60]\n"
- "str q20, [x27, #0x70]\n"
- "str q19, [x27, #0x80]\n"
- "str q18, [x27, #0x90]\n"
- "str q17, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
- "add x27, x27, %x[out_stride]\n"
+ "str q16, [x21, #0x30]\n"
+ "zip1 v17.8h, v25.8h, v24.8h\n"
+ "zip2 v16.8h, v25.8h, v24.8h\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x28, 10f\n"
- "cmp x28, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "str q16, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "str q16, [x27, #0x80]\n"
- "str q16, [x27, #0x90]\n"
- "str q16, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
+ "cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
- "ldr d23, [x9], #0x8\n"
- "ldr d22, [x26], #0x8\n"
- "sub x28, x28, #0x4\n"
+ "ldr d19, [x9], #0x8\n"
+ "ldr d18, [x28], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v17.8h, v19.8h, v17.8h\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
"ldr d18, [x25], #0x8\n"
- "ldr d17, [x24], #0x8\n"
- "cmp x28, #0x4\n"
- "ldr d20, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "zip2 v19.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x22], #0x8\n"
"ldr d16, [x20], #0x8\n"
- "zip1 v18.8h, v23.8h, v18.8h\n"
- "zip1 v17.8h, v22.8h, v17.8h\n"
- "zip1 v20.8h, v20.8h, v19.8h\n"
+ "zip1 v18.8h, v18.8h, v17.8h\n"
"zip1 v16.8h, v21.8h, v16.8h\n"
- "zip1 v19.8h, v18.8h, v17.8h\n"
- "zip2 v18.8h, v18.8h, v17.8h\n"
- "zip1 v17.8h, v20.8h, v16.8h\n"
- "zip2 v16.8h, v20.8h, v16.8h\n"
- "str q19, [x27, #0x0]\n"
- "str q18, [x27, #0x10]\n"
- "str q17, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "add x27, x27, #0x20\n"
+ "str q20, [x21, #0x0]\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q19, [x21, #0x10]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, #0x20\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
- "cmp x28, #0x1\n"
+ "cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
- "ldr h23, [x9], #0x2\n"
- "ldr h22, [x26], #0x2\n"
- "sub x28, x28, #0x1\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h17, [x24], #0x2\n"
- "cmp x28, #0x1\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h20, [x22], #0x2\n"
- "ldr h18, [x21], #0x2\n"
- "ldr h16, [x20], #0x2\n"
- "zip1 v19.8h, v23.8h, v19.8h\n"
- "zip1 v17.8h, v22.8h, v17.8h\n"
- "zip1 v18.8h, v21.8h, v18.8h\n"
- "zip1 v16.8h, v20.8h, v16.8h\n"
+ "ldr h19, [x9], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
"zip1 v17.8h, v19.8h, v17.8h\n"
"zip1 v16.8h, v18.8h, v16.8h\n"
- "str d17, [x27, #0x0]\n"
- "str d16, [x27, #0x60]\n"
- "add x27, x27, #0x8\n"
+ "ldr h20, [x25], #0x2\n"
+ "ldr h19, [x23], #0x2\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "zip1 v17.8h, v20.8h, v17.8h\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "str d18, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str d16, [x21, #0x60]\n"
+ "add x21, x21, #0x8\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x8\n"
"add %x[out], %x[out], #0xc0\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
"mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x27, %x[out]\n"
- "add x26, x9, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "csel x25, x25, %x[pad_row], GE\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add %x[in], x26, %x[in_stride]\n"
"csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
"cmp x20, #0x18\n"
- "blt 14f\n"
- "13:" // Tail row loop: Unroll column loop
- "ldr q23, [x9], #0x10\n"
- "ldr q22, [x26], #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
"sub x20, x20, #0x18\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
"cmp x20, #0x18\n"
- "ldr q26, [x9], #0x10\n"
- "ldr q25, [x26], #0x10\n"
- "ldr q24, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v31.8h, v19.8h, v17.8h\n"
+ "zip1 v30.8h, v18.8h, v16.8h\n"
+ "ldr q22, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v29.8h, v19.8h, v17.8h\n"
+ "zip2 v28.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v27.8h, v22.8h, v17.8h\n"
+ "zip1 v21.8h, v20.8h, v16.8h\n"
"ldr q19, [x9], #0x10\n"
- "zip1 v31.8h, v23.8h, v21.8h\n"
- "zip1 v30.8h, v22.8h, v16.8h\n"
- "ldr q18, [x26], #0x10\n"
- "ldr q17, [x25], #0x10\n"
- "zip2 v29.8h, v23.8h, v21.8h\n"
- "zip2 v23.8h, v22.8h, v16.8h\n"
- "ldr q16, [x24], #0x10\n"
- "zip1 v22.8h, v26.8h, v24.8h\n"
- "zip1 v21.8h, v25.8h, v20.8h\n"
- "zip2 v28.8h, v26.8h, v24.8h\n"
- "zip2 v20.8h, v25.8h, v20.8h\n"
- "zip1 v27.8h, v19.8h, v17.8h\n"
- "zip1 v26.8h, v18.8h, v16.8h\n"
- "zip2 v25.8h, v19.8h, v17.8h\n"
- "zip2 v24.8h, v18.8h, v16.8h\n"
- "zip1 v19.8h, v31.8h, v30.8h\n"
- "zip2 v18.8h, v31.8h, v30.8h\n"
- "zip1 v17.8h, v29.8h, v23.8h\n"
- "zip2 v16.8h, v29.8h, v23.8h\n"
- "zip1 v23.8h, v22.8h, v21.8h\n"
- "zip2 v22.8h, v22.8h, v21.8h\n"
- "str q19, [x27, #0x0]\n"
- "str q18, [x27, #0x10]\n"
- "zip1 v21.8h, v28.8h, v20.8h\n"
- "zip2 v20.8h, v28.8h, v20.8h\n"
- "str q17, [x27, #0x20]\n"
- "zip1 v19.8h, v27.8h, v26.8h\n"
- "zip2 v18.8h, v27.8h, v26.8h\n"
- "str q16, [x27, #0x30]\n"
- "zip1 v17.8h, v25.8h, v24.8h\n"
- "zip2 v16.8h, v25.8h, v24.8h\n"
- "str q23, [x27, #0x40]\n"
- "str q22, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "str q19, [x27, #0x20]\n"
- "str q18, [x27, #0x30]\n"
- "str q17, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Unroll column loop skip
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v26.8h, v22.8h, v17.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v25.8h, v19.8h, v17.8h\n"
+ "zip1 v24.8h, v18.8h, v16.8h\n"
+ "zip2 v23.8h, v19.8h, v17.8h\n"
+ "zip2 v22.8h, v18.8h, v16.8h\n"
+ "zip1 v17.8h, v31.8h, v30.8h\n"
+ "zip2 v16.8h, v31.8h, v30.8h\n"
+ "str q17, [x21, #0x0]\n"
+ "zip1 v19.8h, v29.8h, v28.8h\n"
+ "zip2 v18.8h, v29.8h, v28.8h\n"
+ "str q16, [x21, #0x10]\n"
+ "zip1 v17.8h, v27.8h, v21.8h\n"
+ "zip2 v16.8h, v27.8h, v21.8h\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "zip1 v21.8h, v26.8h, v20.8h\n"
+ "zip2 v20.8h, v26.8h, v20.8h\n"
+ "str q17, [x21, #0x40]\n"
+ "zip1 v19.8h, v25.8h, v24.8h\n"
+ "zip2 v18.8h, v25.8h, v24.8h\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v17.8h, v23.8h, v22.8h\n"
+ "zip2 v16.8h, v23.8h, v22.8h\n"
+ "str q21, [x21, #0x0]\n"
+ "str q20, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
"cmp x20, #0xc\n"
- "blt 16f\n"
- "15:" // Tail row loop: Column loop
- "ldr q25, [x9], #0x10\n"
- "ldr q24, [x26], #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
"sub x20, x20, #0xc\n"
- "ldr q18, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0xc\n"
- "ldr d22, [x9], #0x8\n"
- "ldr d23, [x26], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d16, [x24], #0x8\n"
- "zip1 v20.8h, v25.8h, v18.8h\n"
- "zip1 v19.8h, v24.8h, v17.8h\n"
- "zip2 v18.8h, v25.8h, v18.8h\n"
- "zip2 v17.8h, v24.8h, v17.8h\n"
- "zip1 v22.8h, v22.8h, v21.8h\n"
- "zip1 v16.8h, v23.8h, v16.8h\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip1 v19.8h, v18.8h, v17.8h\n"
- "zip2 v18.8h, v18.8h, v17.8h\n"
- "zip1 v17.8h, v22.8h, v16.8h\n"
- "zip2 v16.8h, v22.8h, v16.8h\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "str q19, [x27, #0x20]\n"
- "str q18, [x27, #0x30]\n"
- "str q17, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v24.8h, v21.8h, v17.8h\n"
+ "zip1 v23.8h, v18.8h, v16.8h\n"
+ "ldr d20, [x9], #0x8\n"
+ "ldr d19, [x28], #0x8\n"
+ "zip2 v22.8h, v21.8h, v17.8h\n"
+ "zip2 v18.8h, v18.8h, v16.8h\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v21.8h, v20.8h, v17.8h\n"
+ "zip1 v20.8h, v19.8h, v16.8h\n"
+ "zip1 v17.8h, v24.8h, v23.8h\n"
+ "zip2 v16.8h, v24.8h, v23.8h\n"
+ "str q17, [x21, #0x0]\n"
+ "zip1 v19.8h, v22.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v18.8h\n"
+ "str q16, [x21, #0x10]\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v16.8h, v21.8h, v20.8h\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr d18, [x9], #0x8\n"
- "ldr d19, [x26], #0x8\n"
+ "ldr d19, [x28], #0x8\n"
"sub x20, x20, #0x4\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d16, [x24], #0x8\n"
"cmp x20, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
"zip1 v18.8h, v18.8h, v17.8h\n"
"zip1 v16.8h, v19.8h, v16.8h\n"
"zip1 v17.8h, v18.8h, v16.8h\n"
"zip2 v16.8h, v18.8h, v16.8h\n"
- "str q17, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str q17, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr h19, [x9], #0x2\n"
- "ldr h18, [x26], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
"sub x20, x20, #0x1\n"
- "ldr h17, [x25], #0x2\n"
- "ldr h16, [x24], #0x2\n"
"cmp x20, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
"zip1 v17.8h, v19.8h, v17.8h\n"
"zip1 v16.8h, v18.8h, v16.8h\n"
"zip1 v16.8h, v17.8h, v16.8h\n"
- "str d16, [x27, #0x0]\n"
- "add x27, x27, #0x8\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x60\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
index 07326e7c98..fe554a65f9 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
@@ -40,607 +40,592 @@ void a64_transpose_interleave_12_2x4_fp32bf16(bfloat16 *out, const float *in, si
__asm__ __volatile__(
"cmp %x[height], #0x8\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
- "mov x28, %x[width]\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x9, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "cmp x28, #0x18\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q18, [x9], #0x10\n"
- "ldr q24, [x26], #0x10\n"
- "sub x28, x28, #0x18\n"
- "ldr q23, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "cmp x28, #0x18\n"
- "ldr q29, [x23], #0x10\n"
- "ldr q7, [x22], #0x10\n"
- "ldr q5, [x21], #0x10\n"
- "ldr q12, [x20], #0x10\n"
- "ldr q1, [x9], #0x10\n"
- "ldr q13, [x26], #0x10\n"
- "zip1 v6.4s, v18.4s, v23.4s\n"
- "zip1 v26.4s, v24.4s, v20.4s\n"
- "ldr q9, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
- "zip2 v23.4s, v18.4s, v23.4s\n"
- "zip2 v19.4s, v24.4s, v20.4s\n"
- "ldr q22, [x23], #0x10\n"
- "ldr q27, [x22], #0x10\n"
- "zip1 v15.4s, v29.4s, v5.4s\n"
- "zip1 v2.4s, v7.4s, v12.4s\n"
- "ldr q25, [x21], #0x10\n"
- "ldr q28, [x20], #0x10\n"
- "zip2 v21.4s, v29.4s, v5.4s\n"
- "zip2 v11.4s, v7.4s, v12.4s\n"
- "ldr q3, [x9], #0x10\n"
- "ldr q31, [x26], #0x10\n"
- "zip1 v12.4s, v1.4s, v9.4s\n"
- "zip1 v14.4s, v13.4s, v17.4s\n"
- "ldr q30, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "zip2 v1.4s, v1.4s, v9.4s\n"
- "zip2 v0.4s, v13.4s, v17.4s\n"
- "ldr q13, [x23], #0x10\n"
- "ldr q8, [x22], #0x10\n"
- "zip1 v9.4s, v22.4s, v25.4s\n"
- "zip1 v29.4s, v27.4s, v28.4s\n"
- "ldr q5, [x21], #0x10\n"
- "ldr q17, [x20], #0x10\n"
- "zip2 v20.4s, v22.4s, v25.4s\n"
- "zip2 v28.4s, v27.4s, v28.4s\n"
- "ldr q7, [x9], #0x10\n"
- "ldr q24, [x26], #0x10\n"
- "zip1 v25.4s, v3.4s, v30.4s\n"
- "zip1 v10.4s, v31.4s, v16.4s\n"
- "ldr q4, [x25], #0x10\n"
- "ldr q22, [x24], #0x10\n"
- "zip2 v27.4s, v3.4s, v30.4s\n"
- "zip2 v3.4s, v31.4s, v16.4s\n"
- "ldr q31, [x23], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v16.4s, v13.4s, v5.4s\n"
- "zip1 v30.4s, v8.4s, v17.4s\n"
- "zip2 v13.4s, v13.4s, v5.4s\n"
- "ldr q5, [x21], #0x10\n"
- "zip2 v17.4s, v8.4s, v17.4s\n"
- "zip1 v8.4s, v7.4s, v4.4s\n"
- "zip2 v7.4s, v7.4s, v4.4s\n"
- "zip1 v4.4s, v24.4s, v22.4s\n"
- "zip2 v22.4s, v24.4s, v22.4s\n"
- "zip1 v24.4s, v31.4s, v5.4s\n"
- "zip2 v31.4s, v31.4s, v5.4s\n"
- "zip1 v5.4s, v6.4s, v26.4s\n"
- "zip2 v6.4s, v6.4s, v26.4s\n"
+ "ldr q15, [x9], #0x10\n"
+ "ldr q17, [x28], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "ldr q16, [x27], #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "zip1 v6.4s, v15.4s, v16.4s\n"
+ "zip1 v11.4s, v17.4s, v20.4s\n"
+ "ldr q2, [x25], #0x10\n"
+ "ldr q4, [x23], #0x10\n"
+ "zip2 v22.4s, v15.4s, v16.4s\n"
+ "zip2 v18.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q26, [x20], #0x10\n"
- ".inst 0x0ea168a5 // bfcvtn v5.4h, v5.4s\n"
- ".inst 0x4ea168c5 // bfcvtn2 v5.8h, v6.4s\n"
- "zip1 v6.4s, v23.4s, v19.4s\n"
- "zip2 v19.4s, v23.4s, v19.4s\n"
- "ldr q23, [x9], #0x10\n"
- ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
- ".inst 0x4ea16a66 // bfcvtn2 v6.8h, v19.4s\n"
- "zip1 v19.4s, v18.4s, v26.4s\n"
- "zip2 v26.4s, v18.4s, v26.4s\n"
- "zip1 v18.4s, v12.4s, v14.4s\n"
- "zip2 v12.4s, v12.4s, v14.4s\n"
- "ldr q14, [x26], #0x10\n"
- ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
- ".inst 0x4ea16992 // bfcvtn2 v18.8h, v12.4s\n"
- "zip1 v12.4s, v1.4s, v0.4s\n"
- "zip2 v1.4s, v1.4s, v0.4s\n"
- "ldr q0, [x25], #0x10\n"
- ".inst 0x0ea1698c // bfcvtn v12.4h, v12.4s\n"
- ".inst 0x4ea1682c // bfcvtn2 v12.8h, v1.4s\n"
- "zip1 v1.4s, v25.4s, v10.4s\n"
- "zip2 v10.4s, v25.4s, v10.4s\n"
- "ldr q25, [x24], #0x10\n"
+ "zip1 v9.4s, v2.4s, v17.4s\n"
+ "zip1 v10.4s, v4.4s, v26.4s\n"
+ "ldr q16, [x9], #0x10\n"
+ "ldr q27, [x28], #0x10\n"
+ "zip2 v3.4s, v2.4s, v17.4s\n"
+ "zip2 v30.4s, v4.4s, v26.4s\n"
+ "ldr q13, [x27], #0x10\n"
+ "ldr q1, [x26], #0x10\n"
+ "zip1 v23.4s, v16.4s, v13.4s\n"
+ "zip1 v5.4s, v27.4s, v1.4s\n"
+ "ldr q26, [x25], #0x10\n"
+ "ldr q14, [x23], #0x10\n"
+ "zip2 v0.4s, v16.4s, v13.4s\n"
+ "zip2 v2.4s, v27.4s, v1.4s\n"
+ "ldr q15, [x22], #0x10\n"
+ "ldr q8, [x20], #0x10\n"
+ "zip1 v31.4s, v26.4s, v15.4s\n"
+ "zip1 v4.4s, v14.4s, v8.4s\n"
+ "ldr q28, [x9], #0x10\n"
+ "ldr q19, [x28], #0x10\n"
+ "zip2 v21.4s, v26.4s, v15.4s\n"
+ "zip2 v16.4s, v14.4s, v8.4s\n"
+ "ldr q15, [x27], #0x10\n"
+ "ldr q1, [x26], #0x10\n"
+ "zip1 v17.4s, v28.4s, v15.4s\n"
+ "zip1 v8.4s, v19.4s, v1.4s\n"
+ "ldr q27, [x25], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip2 v7.4s, v28.4s, v15.4s\n"
+ "zip2 v15.4s, v19.4s, v1.4s\n"
+ "ldr q12, [x22], #0x10\n"
+ "ldr q25, [x20], #0x10\n"
+ "zip1 v14.4s, v27.4s, v12.4s\n"
+ "zip1 v26.4s, v20.4s, v25.4s\n"
+ "ldr q13, [x9], #0x10\n"
+ "ldr q29, [x28], #0x10\n"
+ "zip2 v28.4s, v27.4s, v12.4s\n"
+ "zip2 v12.4s, v20.4s, v25.4s\n"
+ "ldr q27, [x27], #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "zip1 v19.4s, v13.4s, v27.4s\n"
+ "zip1 v25.4s, v29.4s, v20.4s\n"
+ "ldr q24, [x25], #0x10\n"
+ "ldr q1, [x23], #0x10\n"
+ "zip2 v27.4s, v13.4s, v27.4s\n"
+ "zip2 v13.4s, v29.4s, v20.4s\n"
+ "ldr q20, [x22], #0x10\n"
+ "zip1 v29.4s, v24.4s, v20.4s\n"
+ "zip2 v20.4s, v24.4s, v20.4s\n"
+ "zip1 v24.4s, v6.4s, v11.4s\n"
+ ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
+ "zip2 v11.4s, v6.4s, v11.4s\n"
+ "ldr q6, [x20], #0x10\n"
+ ".inst 0x4ea16978 // bfcvtn2 v24.8h, v11.4s\n"
+ "zip1 v11.4s, v1.4s, v6.4s\n"
+ "zip2 v6.4s, v1.4s, v6.4s\n"
+ "zip1 v1.4s, v22.4s, v18.4s\n"
".inst 0x0ea16821 // bfcvtn v1.4h, v1.4s\n"
- ".inst 0x4ea16941 // bfcvtn2 v1.8h, v10.4s\n"
- "zip1 v10.4s, v23.4s, v0.4s\n"
- "zip2 v23.4s, v23.4s, v0.4s\n"
- "zip1 v0.4s, v14.4s, v25.4s\n"
- "zip2 v25.4s, v14.4s, v25.4s\n"
- "zip1 v14.4s, v27.4s, v3.4s\n"
- "zip2 v3.4s, v27.4s, v3.4s\n"
- "ldr q27, [x23], #0x10\n"
- ".inst 0x0ea169ce // bfcvtn v14.4h, v14.4s\n"
- ".inst 0x4ea1686e // bfcvtn2 v14.8h, v3.4s\n"
- "zip1 v3.4s, v15.4s, v2.4s\n"
- "zip2 v15.4s, v15.4s, v2.4s\n"
- "ldr q2, [x22], #0x10\n"
+ "zip2 v18.4s, v22.4s, v18.4s\n"
+ "ldr q22, [x9], #0x10\n"
+ ".inst 0x4ea16a41 // bfcvtn2 v1.8h, v18.4s\n"
+ "zip1 v18.4s, v23.4s, v5.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "zip2 v5.4s, v23.4s, v5.4s\n"
+ "ldr q23, [x28], #0x10\n"
+ ".inst 0x4ea168b2 // bfcvtn2 v18.8h, v5.4s\n"
+ "zip1 v5.4s, v0.4s, v2.4s\n"
+ ".inst 0x0ea168a5 // bfcvtn v5.4h, v5.4s\n"
+ "zip2 v0.4s, v0.4s, v2.4s\n"
+ "ldr q2, [x27], #0x10\n"
+ ".inst 0x4ea16805 // bfcvtn2 v5.8h, v0.4s\n"
+ "zip1 v0.4s, v22.4s, v2.4s\n"
+ "zip2 v2.4s, v22.4s, v2.4s\n"
+ "zip1 v22.4s, v17.4s, v8.4s\n"
+ ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
+ "zip2 v8.4s, v17.4s, v8.4s\n"
+ "ldr q17, [x26], #0x10\n"
+ ".inst 0x4ea16916 // bfcvtn2 v22.8h, v8.4s\n"
+ "zip1 v8.4s, v23.4s, v17.4s\n"
+ "zip2 v23.4s, v23.4s, v17.4s\n"
+ "zip1 v17.4s, v7.4s, v15.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "zip2 v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x25], #0x10\n"
+ ".inst 0x4ea168f1 // bfcvtn2 v17.8h, v7.4s\n"
+ "zip1 v7.4s, v9.4s, v10.4s\n"
+ ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n"
+ "zip2 v10.4s, v9.4s, v10.4s\n"
+ "ldr q9, [x23], #0x10\n"
+ ".inst 0x4ea16947 // bfcvtn2 v7.8h, v10.4s\n"
+ "zip1 v10.4s, v3.4s, v30.4s\n"
+ ".inst 0x0ea1694a // bfcvtn v10.4h, v10.4s\n"
+ "zip2 v30.4s, v3.4s, v30.4s\n"
+ "ldr q3, [x22], #0x10\n"
+ ".inst 0x4ea16bca // bfcvtn2 v10.8h, v30.4s\n"
+ "zip1 v30.4s, v15.4s, v3.4s\n"
+ "zip2 v15.4s, v15.4s, v3.4s\n"
+ "zip1 v3.4s, v31.4s, v4.4s\n"
".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
- ".inst 0x4ea169e3 // bfcvtn2 v3.8h, v15.4s\n"
- "zip1 v15.4s, v21.4s, v11.4s\n"
- "zip2 v11.4s, v21.4s, v11.4s\n"
- "ldr q21, [x21], #0x10\n"
- ".inst 0x0ea169ef // bfcvtn v15.4h, v15.4s\n"
- ".inst 0x4ea1696f // bfcvtn2 v15.8h, v11.4s\n"
- "zip1 v11.4s, v9.4s, v29.4s\n"
- "zip2 v29.4s, v9.4s, v29.4s\n"
- "ldr q9, [x20], #0x10\n"
- ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
- ".inst 0x4ea16bab // bfcvtn2 v11.8h, v29.4s\n"
- "zip1 v29.4s, v27.4s, v21.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
- "zip1 v27.4s, v2.4s, v9.4s\n"
- "zip2 v2.4s, v2.4s, v9.4s\n"
- "zip1 v9.4s, v20.4s, v28.4s\n"
- "zip2 v28.4s, v20.4s, v28.4s\n"
- "ldr q20, [x9], #0x10\n"
+ "zip2 v31.4s, v31.4s, v4.4s\n"
+ "ldr q4, [x20], #0x10\n"
+ ".inst 0x4ea16be3 // bfcvtn2 v3.8h, v31.4s\n"
+ "zip1 v31.4s, v9.4s, v4.4s\n"
+ "zip2 v4.4s, v9.4s, v4.4s\n"
+ "zip1 v9.4s, v21.4s, v16.4s\n"
".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
- ".inst 0x4ea16b89 // bfcvtn2 v9.8h, v28.4s\n"
- "zip1 v28.4s, v16.4s, v30.4s\n"
- "zip2 v30.4s, v16.4s, v30.4s\n"
- "ldr q16, [x26], #0x10\n"
- ".inst 0x0ea16b9c // bfcvtn v28.4h, v28.4s\n"
- ".inst 0x4ea16bdc // bfcvtn2 v28.8h, v30.4s\n"
- "zip1 v30.4s, v13.4s, v17.4s\n"
- "zip2 v13.4s, v13.4s, v17.4s\n"
- "ldr q17, [x25], #0x10\n"
- ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
- ".inst 0x4ea169be // bfcvtn2 v30.8h, v13.4s\n"
- "zip1 v13.4s, v8.4s, v4.4s\n"
- "zip2 v4.4s, v8.4s, v4.4s\n"
- "ldr q8, [x24], #0x10\n"
- ".inst 0x0ea169ad // bfcvtn v13.4h, v13.4s\n"
- ".inst 0x4ea1688d // bfcvtn2 v13.8h, v4.4s\n"
- "zip1 v4.4s, v20.4s, v17.4s\n"
- "zip2 v20.4s, v20.4s, v17.4s\n"
- "zip1 v17.4s, v16.4s, v8.4s\n"
- "zip2 v8.4s, v16.4s, v8.4s\n"
- "zip1 v16.4s, v7.4s, v22.4s\n"
- "zip2 v22.4s, v7.4s, v22.4s\n"
- "ldr q7, [x23], #0x10\n"
+ "zip2 v16.4s, v21.4s, v16.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ ".inst 0x4ea16a09 // bfcvtn2 v9.8h, v16.4s\n"
+ "zip1 v16.4s, v14.4s, v26.4s\n"
".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- ".inst 0x4ea16ad0 // bfcvtn2 v16.8h, v22.4s\n"
- "zip1 v22.4s, v10.4s, v0.4s\n"
- "zip2 v10.4s, v10.4s, v0.4s\n"
- "ldr q0, [x22], #0x10\n"
- ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
- ".inst 0x4ea16956 // bfcvtn2 v22.8h, v10.4s\n"
- "zip1 v10.4s, v23.4s, v25.4s\n"
- "zip2 v23.4s, v23.4s, v25.4s\n"
- "ldr q25, [x21], #0x10\n"
- ".inst 0x0ea1694a // bfcvtn v10.4h, v10.4s\n"
- ".inst 0x4ea16aea // bfcvtn2 v10.8h, v23.4s\n"
- "ldr q23, [x20], #0x10\n"
- "str q5, [x27, #0x0]\n"
- "zip1 v5.4s, v4.4s, v17.4s\n"
- "zip2 v4.4s, v4.4s, v17.4s\n"
- "str q6, [x27, #0x10]\n"
- "zip1 v17.4s, v7.4s, v25.4s\n"
- "zip2 v7.4s, v7.4s, v25.4s\n"
- "str q18, [x27, #0x20]\n"
- "zip1 v18.4s, v0.4s, v23.4s\n"
- "zip2 v0.4s, v0.4s, v23.4s\n"
- "str q12, [x27, #0x30]\n"
- "zip1 v6.4s, v20.4s, v8.4s\n"
- "zip1 v23.4s, v24.4s, v19.4s\n"
- "str q1, [x27, #0x40]\n"
- "zip1 v25.4s, v31.4s, v26.4s\n"
- "zip1 v12.4s, v29.4s, v27.4s\n"
- "str q14, [x27, #0x50]\n"
- "zip1 v14.4s, v21.4s, v2.4s\n"
- "zip1 v1.4s, v17.4s, v18.4s\n"
- "str q3, [x27, #0x60]\n"
- "zip1 v3.4s, v7.4s, v0.4s\n"
- ".inst 0x0ea168a5 // bfcvtn v5.4h, v5.4s\n"
- "str q15, [x27, #0x70]\n"
- ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
- "zip2 v20.4s, v20.4s, v8.4s\n"
- "str q11, [x27, #0x80]\n"
- ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
- "zip2 v24.4s, v24.4s, v19.4s\n"
- "str q9, [x27, #0x90]\n"
- ".inst 0x0ea16b28 // bfcvtn v8.4h, v25.4s\n"
- "zip2 v11.4s, v31.4s, v26.4s\n"
- "str q28, [x27, #0xa0]\n"
- ".inst 0x0ea16993 // bfcvtn v19.4h, v12.4s\n"
- "zip2 v27.4s, v29.4s, v27.4s\n"
- "str q30, [x27, #0xb0]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip2 v14.4s, v14.4s, v26.4s\n"
+ "ldr q26, [x28], #0x10\n"
+ ".inst 0x4ea169d0 // bfcvtn2 v16.8h, v14.4s\n"
+ "zip1 v14.4s, v28.4s, v12.4s\n"
".inst 0x0ea169ce // bfcvtn v14.4h, v14.4s\n"
- "zip2 v31.4s, v21.4s, v2.4s\n"
- "str q13, [x27, #0x0]\n"
- ".inst 0x0ea1683e // bfcvtn v30.4h, v1.4s\n"
- "zip2 v15.4s, v17.4s, v18.4s\n"
- "str q16, [x27, #0x10]\n"
- ".inst 0x0ea1686c // bfcvtn v12.4h, v3.4s\n"
- "zip2 v9.4s, v7.4s, v0.4s\n"
- "str q22, [x27, #0x20]\n"
+ "zip2 v12.4s, v28.4s, v12.4s\n"
+ "ldr q28, [x27], #0x10\n"
+ ".inst 0x4ea1698e // bfcvtn2 v14.8h, v12.4s\n"
+ "zip1 v12.4s, v21.4s, v28.4s\n"
+ "zip2 v28.4s, v21.4s, v28.4s\n"
+ "zip1 v21.4s, v19.4s, v25.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ "zip2 v19.4s, v19.4s, v25.4s\n"
+ "ldr q25, [x26], #0x10\n"
+ ".inst 0x4ea16a75 // bfcvtn2 v21.8h, v19.4s\n"
+ "zip1 v19.4s, v26.4s, v25.4s\n"
+ "zip2 v25.4s, v26.4s, v25.4s\n"
+ "zip1 v26.4s, v27.4s, v13.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "zip2 v13.4s, v27.4s, v13.4s\n"
+ "ldr q27, [x25], #0x10\n"
+ ".inst 0x4ea169ba // bfcvtn2 v26.8h, v13.4s\n"
+ "zip1 v13.4s, v0.4s, v8.4s\n"
+ ".inst 0x0ea169ad // bfcvtn v13.4h, v13.4s\n"
+ "zip2 v8.4s, v0.4s, v8.4s\n"
+ "ldr q0, [x23], #0x10\n"
+ ".inst 0x4ea1690d // bfcvtn2 v13.8h, v8.4s\n"
+ "zip1 v8.4s, v2.4s, v23.4s\n"
+ ".inst 0x0ea16908 // bfcvtn v8.4h, v8.4s\n"
+ "zip2 v23.4s, v2.4s, v23.4s\n"
+ "ldr q2, [x22], #0x10\n"
+ ".inst 0x4ea16ae8 // bfcvtn2 v8.8h, v23.4s\n"
+ "ldr q23, [x20], #0x10\n"
+ "str q24, [x21, #0x0]\n"
+ "zip1 v24.4s, v27.4s, v2.4s\n"
+ "zip2 v27.4s, v27.4s, v2.4s\n"
+ "zip1 v2.4s, v0.4s, v23.4s\n"
+ "zip2 v23.4s, v0.4s, v23.4s\n"
+ "str q1, [x21, #0x10]\n"
+ "zip1 v0.4s, v12.4s, v19.4s\n"
+ "zip1 v1.4s, v28.4s, v25.4s\n"
+ "str q18, [x21, #0x20]\n"
+ "zip1 v18.4s, v29.4s, v11.4s\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "str q5, [x21, #0x30]\n"
+ "zip1 v5.4s, v20.4s, v6.4s\n"
+ "zip2 v19.4s, v12.4s, v19.4s\n"
+ "str q22, [x21, #0x40]\n"
+ "zip1 v12.4s, v30.4s, v31.4s\n"
+ "zip1 v22.4s, v15.4s, v4.4s\n"
+ "str q17, [x21, #0x50]\n"
+ "zip1 v17.4s, v24.4s, v2.4s\n"
+ ".inst 0x0ea16821 // bfcvtn v1.4h, v1.4s\n"
+ "str q7, [x21, #0x60]\n"
+ "zip1 v7.4s, v27.4s, v23.4s\n"
+ "zip2 v25.4s, v28.4s, v25.4s\n"
+ "str q10, [x21, #0x70]\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "zip2 v29.4s, v29.4s, v11.4s\n"
+ "str q3, [x21, #0x80]\n"
+ ".inst 0x0ea168ab // bfcvtn v11.4h, v5.4s\n"
+ "zip2 v10.4s, v20.4s, v6.4s\n"
+ "str q9, [x21, #0x90]\n"
+ ".inst 0x0ea16986 // bfcvtn v6.4h, v12.4s\n"
+ "zip2 v12.4s, v30.4s, v31.4s\n"
+ "str q16, [x21, #0xa0]\n"
+ ".inst 0x0ea16ac5 // bfcvtn v5.4h, v22.4s\n"
+ "zip2 v4.4s, v15.4s, v4.4s\n"
+ "str q14, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ ".inst 0x0ea16a2f // bfcvtn v15.4h, v17.4s\n"
+ "zip2 v20.4s, v24.4s, v2.4s\n"
+ "str q21, [x21, #0x0]\n"
+ ".inst 0x0ea168fc // bfcvtn v28.4h, v7.4s\n"
+ "zip2 v30.4s, v27.4s, v23.4s\n"
+ "str q26, [x21, #0x10]\n"
+ ".inst 0x4ea16a60 // bfcvtn2 v0.8h, v19.4s\n"
+ ".inst 0x4ea16b21 // bfcvtn2 v1.8h, v25.4s\n"
+ "str q13, [x21, #0x20]\n"
+ ".inst 0x4ea16bb2 // bfcvtn2 v18.8h, v29.4s\n"
+ ".inst 0x4ea1694b // bfcvtn2 v11.8h, v10.4s\n"
+ "str q8, [x21, #0x30]\n"
+ ".inst 0x4ea16986 // bfcvtn2 v6.8h, v12.4s\n"
".inst 0x4ea16885 // bfcvtn2 v5.8h, v4.4s\n"
- ".inst 0x4ea16a86 // bfcvtn2 v6.8h, v20.4s\n"
- "str q10, [x27, #0x30]\n"
- ".inst 0x4ea16b17 // bfcvtn2 v23.8h, v24.4s\n"
- ".inst 0x4ea16968 // bfcvtn2 v8.8h, v11.4s\n"
- ".inst 0x4ea16b73 // bfcvtn2 v19.8h, v27.4s\n"
- ".inst 0x4ea16bee // bfcvtn2 v14.8h, v31.4s\n"
- ".inst 0x4ea169fe // bfcvtn2 v30.8h, v15.4s\n"
- ".inst 0x4ea1692c // bfcvtn2 v12.8h, v9.4s\n"
- "str q5, [x27, #0x40]\n"
- "str q6, [x27, #0x50]\n"
- "str q23, [x27, #0x60]\n"
- "str q8, [x27, #0x70]\n"
- "str q19, [x27, #0x80]\n"
- "str q14, [x27, #0x90]\n"
- "str q30, [x27, #0xa0]\n"
- "str q12, [x27, #0xb0]\n"
- "add x27, x27, %x[out_stride]\n"
+ "str q0, [x21, #0x40]\n"
+ ".inst 0x4ea16a8f // bfcvtn2 v15.8h, v20.4s\n"
+ ".inst 0x4ea16bdc // bfcvtn2 v28.8h, v30.4s\n"
+ "str q1, [x21, #0x50]\n"
+ "str q18, [x21, #0x60]\n"
+ "str q11, [x21, #0x70]\n"
+ "str q6, [x21, #0x80]\n"
+ "str q5, [x21, #0x90]\n"
+ "str q15, [x21, #0xa0]\n"
+ "str q28, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cmp x28, #0xc\n"
+ "cmp x24, #0xc\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr q5, [x9], #0x10\n"
- "ldr q9, [x26], #0x10\n"
- "sub x28, x28, #0xc\n"
- "ldr q19, [x25], #0x10\n"
- "ldr q12, [x24], #0x10\n"
- "cmp x28, #0xc\n"
- "ldr q0, [x23], #0x10\n"
- "ldr q29, [x22], #0x10\n"
- "ldr q23, [x21], #0x10\n"
- "ldr q27, [x20], #0x10\n"
- "ldr q17, [x9], #0x10\n"
- "ldr q31, [x26], #0x10\n"
- "zip1 v16.4s, v5.4s, v19.4s\n"
- "zip1 v7.4s, v9.4s, v12.4s\n"
- "ldr q10, [x25], #0x10\n"
- "ldr q11, [x24], #0x10\n"
- "zip2 v20.4s, v5.4s, v19.4s\n"
- "zip2 v28.4s, v9.4s, v12.4s\n"
- "ldr q21, [x23], #0x10\n"
- "ldr q22, [x22], #0x10\n"
- "zip1 v24.4s, v0.4s, v23.4s\n"
- "zip1 v9.4s, v29.4s, v27.4s\n"
- "ldr q18, [x21], #0x10\n"
- "ldr q25, [x20], #0x10\n"
- "zip2 v8.4s, v0.4s, v23.4s\n"
- "zip2 v4.4s, v29.4s, v27.4s\n"
- "ldr q27, [x9], #0x10\n"
+ "ldr q20, [x9], #0x10\n"
+ "ldr q9, [x28], #0x10\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr q8, [x27], #0x10\n"
+ "ldr q1, [x26], #0x10\n"
+ "zip1 v7.4s, v20.4s, v8.4s\n"
+ "zip1 v19.4s, v9.4s, v1.4s\n"
+ "ldr q6, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip2 v5.4s, v20.4s, v8.4s\n"
+ "zip2 v18.4s, v9.4s, v1.4s\n"
+ "ldr q27, [x22], #0x10\n"
+ "ldr q14, [x20], #0x10\n"
+ "zip1 v26.4s, v6.4s, v27.4s\n"
+ "zip1 v15.4s, v16.4s, v14.4s\n"
+ "ldr q1, [x9], #0x10\n"
+ "ldr q30, [x28], #0x10\n"
+ "zip2 v24.4s, v6.4s, v27.4s\n"
+ "zip2 v25.4s, v16.4s, v14.4s\n"
+ "ldr q13, [x27], #0x10\n"
+ "ldr q17, [x26], #0x10\n"
+ "zip1 v10.4s, v1.4s, v13.4s\n"
+ "zip1 v16.4s, v30.4s, v17.4s\n"
+ "ldr q4, [x25], #0x10\n"
+ "ldr q11, [x23], #0x10\n"
+ "zip2 v0.4s, v1.4s, v13.4s\n"
+ "zip2 v27.4s, v30.4s, v17.4s\n"
+ "ldr q28, [x22], #0x10\n"
+ "ldr q12, [x20], #0x10\n"
+ "zip1 v22.4s, v4.4s, v28.4s\n"
+ "zip1 v13.4s, v11.4s, v12.4s\n"
+ "ldr q31, [x9], #0x10\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v14.4s, v4.4s, v28.4s\n"
+ "zip2 v12.4s, v11.4s, v12.4s\n"
+ "ldr q2, [x27], #0x10\n"
"ldr q3, [x26], #0x10\n"
- "zip1 v2.4s, v17.4s, v10.4s\n"
- "zip1 v6.4s, v31.4s, v11.4s\n"
- "ldr q1, [x25], #0x10\n"
- "ldr q29, [x24], #0x10\n"
- "zip2 v15.4s, v17.4s, v10.4s\n"
- "zip2 v5.4s, v31.4s, v11.4s\n"
- "ldr q31, [x23], #0x10\n"
- "ldr q12, [x22], #0x10\n"
- "zip1 v13.4s, v21.4s, v18.4s\n"
- "zip1 v10.4s, v22.4s, v25.4s\n"
- "ldr q19, [x21], #0x10\n"
- "ldr q23, [x20], #0x10\n"
- "zip2 v14.4s, v21.4s, v18.4s\n"
- "zip2 v30.4s, v22.4s, v25.4s\n"
- "zip1 v0.4s, v27.4s, v1.4s\n"
- "zip1 v26.4s, v3.4s, v29.4s\n"
- "zip2 v1.4s, v27.4s, v1.4s\n"
- "zip2 v11.4s, v3.4s, v29.4s\n"
- "zip1 v18.4s, v31.4s, v19.4s\n"
- "zip1 v17.4s, v12.4s, v23.4s\n"
- "zip2 v21.4s, v31.4s, v19.4s\n"
- "zip2 v27.4s, v12.4s, v23.4s\n"
- "zip1 v3.4s, v16.4s, v7.4s\n"
- "zip1 v23.4s, v20.4s, v28.4s\n"
- "zip1 v25.4s, v2.4s, v6.4s\n"
- "zip1 v12.4s, v15.4s, v5.4s\n"
- "zip1 v31.4s, v0.4s, v26.4s\n"
- "zip1 v19.4s, v1.4s, v11.4s\n"
- "zip1 v29.4s, v24.4s, v9.4s\n"
- "zip1 v22.4s, v8.4s, v4.4s\n"
- ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
- "zip2 v16.4s, v16.4s, v7.4s\n"
- "zip1 v7.4s, v13.4s, v10.4s\n"
- ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
- "zip2 v20.4s, v20.4s, v28.4s\n"
- "zip1 v28.4s, v14.4s, v30.4s\n"
- ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
- "zip2 v6.4s, v2.4s, v6.4s\n"
- "zip1 v2.4s, v18.4s, v17.4s\n"
- ".inst 0x0ea1698c // bfcvtn v12.4h, v12.4s\n"
- "zip2 v5.4s, v15.4s, v5.4s\n"
- "zip1 v15.4s, v21.4s, v27.4s\n"
+ "zip1 v8.4s, v31.4s, v2.4s\n"
+ "zip1 v4.4s, v17.4s, v3.4s\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q1, [x23], #0x10\n"
+ "zip2 v28.4s, v31.4s, v2.4s\n"
+ "zip2 v29.4s, v17.4s, v3.4s\n"
+ "ldr q11, [x22], #0x10\n"
+ "ldr q17, [x20], #0x10\n"
+ "zip1 v9.4s, v23.4s, v11.4s\n"
+ "zip1 v21.4s, v1.4s, v17.4s\n"
+ "zip2 v11.4s, v23.4s, v11.4s\n"
+ "zip2 v17.4s, v1.4s, v17.4s\n"
+ "zip1 v2.4s, v7.4s, v19.4s\n"
+ "zip1 v31.4s, v5.4s, v18.4s\n"
+ "zip1 v3.4s, v10.4s, v16.4s\n"
+ "zip1 v6.4s, v0.4s, v27.4s\n"
+ "zip1 v1.4s, v8.4s, v4.4s\n"
+ "zip1 v30.4s, v28.4s, v29.4s\n"
+ "zip1 v20.4s, v26.4s, v15.4s\n"
+ "zip1 v23.4s, v24.4s, v25.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "zip2 v7.4s, v7.4s, v19.4s\n"
+ "zip1 v19.4s, v22.4s, v13.4s\n"
".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n"
- "zip2 v26.4s, v0.4s, v26.4s\n"
- ".inst 0x0ea16a60 // bfcvtn v0.4h, v19.4s\n"
- "zip2 v1.4s, v1.4s, v11.4s\n"
- ".inst 0x0ea16bab // bfcvtn v11.4h, v29.4s\n"
- "zip2 v9.4s, v24.4s, v9.4s\n"
- ".inst 0x0ea16ad3 // bfcvtn v19.4h, v22.4s\n"
- "zip2 v24.4s, v8.4s, v4.4s\n"
- ".inst 0x0ea168e8 // bfcvtn v8.4h, v7.4s\n"
- "zip2 v4.4s, v13.4s, v10.4s\n"
- ".inst 0x0ea16b96 // bfcvtn v22.4h, v28.4s\n"
- "zip2 v29.4s, v14.4s, v30.4s\n"
- ".inst 0x0ea1685c // bfcvtn v28.4h, v2.4s\n"
- "zip2 v14.4s, v18.4s, v17.4s\n"
- ".inst 0x0ea169f2 // bfcvtn v18.4h, v15.4s\n"
- "zip2 v17.4s, v21.4s, v27.4s\n"
+ "zip2 v18.4s, v5.4s, v18.4s\n"
+ "zip1 v5.4s, v14.4s, v12.4s\n"
+ ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
+ "zip2 v16.4s, v10.4s, v16.4s\n"
+ "zip1 v10.4s, v9.4s, v21.4s\n"
+ ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
+ "zip2 v0.4s, v0.4s, v27.4s\n"
+ "zip1 v27.4s, v11.4s, v17.4s\n"
+ ".inst 0x0ea16821 // bfcvtn v1.4h, v1.4s\n"
+ "zip2 v4.4s, v8.4s, v4.4s\n"
+ ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
+ "zip2 v29.4s, v28.4s, v29.4s\n"
+ ".inst 0x0ea16a9c // bfcvtn v28.4h, v20.4s\n"
+ "zip2 v15.4s, v26.4s, v15.4s\n"
+ ".inst 0x0ea16ae8 // bfcvtn v8.4h, v23.4s\n"
+ "zip2 v26.4s, v24.4s, v25.4s\n"
+ ".inst 0x0ea16a79 // bfcvtn v25.4h, v19.4s\n"
+ "zip2 v24.4s, v22.4s, v13.4s\n"
+ ".inst 0x0ea168b7 // bfcvtn v23.4h, v5.4s\n"
+ "zip2 v22.4s, v14.4s, v12.4s\n"
+ ".inst 0x0ea16945 // bfcvtn v5.4h, v10.4s\n"
+ "zip2 v20.4s, v9.4s, v21.4s\n"
+ ".inst 0x0ea16b73 // bfcvtn v19.4h, v27.4s\n"
+ "zip2 v17.4s, v11.4s, v17.4s\n"
+ ".inst 0x4ea168e2 // bfcvtn2 v2.8h, v7.4s\n"
+ ".inst 0x4ea16a5f // bfcvtn2 v31.8h, v18.4s\n"
+ "str q2, [x21, #0x0]\n"
".inst 0x4ea16a03 // bfcvtn2 v3.8h, v16.4s\n"
- ".inst 0x4ea16a97 // bfcvtn2 v23.8h, v20.4s\n"
- ".inst 0x4ea168d9 // bfcvtn2 v25.8h, v6.4s\n"
- ".inst 0x4ea168ac // bfcvtn2 v12.8h, v5.4s\n"
- ".inst 0x4ea16b5f // bfcvtn2 v31.8h, v26.4s\n"
- ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- ".inst 0x4ea1692b // bfcvtn2 v11.8h, v9.4s\n"
- ".inst 0x4ea16b13 // bfcvtn2 v19.8h, v24.4s\n"
- "str q3, [x27, #0x0]\n"
- ".inst 0x4ea16888 // bfcvtn2 v8.8h, v4.4s\n"
- ".inst 0x4ea16bb6 // bfcvtn2 v22.8h, v29.4s\n"
- "str q23, [x27, #0x10]\n"
- ".inst 0x4ea169dc // bfcvtn2 v28.8h, v14.4s\n"
- ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n"
- "str q25, [x27, #0x20]\n"
- "str q12, [x27, #0x30]\n"
- "str q31, [x27, #0x40]\n"
- "str q0, [x27, #0x50]\n"
- "str q11, [x27, #0x60]\n"
- "str q19, [x27, #0x70]\n"
- "str q8, [x27, #0x80]\n"
- "str q22, [x27, #0x90]\n"
- "str q28, [x27, #0xa0]\n"
- "str q18, [x27, #0xb0]\n"
- "add x27, x27, %x[out_stride]\n"
+ ".inst 0x4ea16806 // bfcvtn2 v6.8h, v0.4s\n"
+ "str q31, [x21, #0x10]\n"
+ ".inst 0x4ea16881 // bfcvtn2 v1.8h, v4.4s\n"
+ ".inst 0x4ea16bbe // bfcvtn2 v30.8h, v29.4s\n"
+ "str q3, [x21, #0x20]\n"
+ ".inst 0x4ea169fc // bfcvtn2 v28.8h, v15.4s\n"
+ ".inst 0x4ea16b48 // bfcvtn2 v8.8h, v26.4s\n"
+ "str q6, [x21, #0x30]\n"
+ ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ "str q1, [x21, #0x40]\n"
+ ".inst 0x4ea16a85 // bfcvtn2 v5.8h, v20.4s\n"
+ ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n"
+ "str q30, [x21, #0x50]\n"
+ "str q28, [x21, #0x60]\n"
+ "str q8, [x21, #0x70]\n"
+ "str q25, [x21, #0x80]\n"
+ "str q23, [x21, #0x90]\n"
+ "str q5, [x21, #0xa0]\n"
+ "str q19, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x28, 10f\n"
- "cmp x28, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "str q16, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "str q16, [x27, #0x80]\n"
- "str q16, [x27, #0x90]\n"
- "str q16, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
+ "cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
- "ldr q25, [x9], #0x10\n"
- "ldr q24, [x26], #0x10\n"
- "sub x28, x28, #0x4\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "cmp x28, #0x4\n"
- "ldr q23, [x23], #0x10\n"
- "ldr q19, [x22], #0x10\n"
- "ldr q18, [x21], #0x10\n"
- "ldr q17, [x20], #0x10\n"
- "zip1 v22.4s, v25.4s, v21.4s\n"
- "zip1 v16.4s, v24.4s, v20.4s\n"
- "zip2 v21.4s, v25.4s, v21.4s\n"
- "zip2 v20.4s, v24.4s, v20.4s\n"
- "zip1 v27.4s, v23.4s, v18.4s\n"
- "zip1 v26.4s, v19.4s, v17.4s\n"
- "zip2 v25.4s, v23.4s, v18.4s\n"
- "zip2 v24.4s, v19.4s, v17.4s\n"
- "zip1 v19.4s, v22.4s, v16.4s\n"
- "zip1 v18.4s, v21.4s, v20.4s\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v23.4s, v17.4s\n"
+ "zip1 v21.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v28.4s, v23.4s, v17.4s\n"
+ "zip2 v20.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v27.4s, v19.4s, v17.4s\n"
+ "zip1 v26.4s, v18.4s, v16.4s\n"
+ "zip2 v25.4s, v19.4s, v17.4s\n"
+ "zip2 v24.4s, v18.4s, v16.4s\n"
+ "zip1 v19.4s, v22.4s, v21.4s\n"
+ "zip1 v18.4s, v28.4s, v20.4s\n"
"zip1 v17.4s, v27.4s, v26.4s\n"
- "zip2 v23.4s, v22.4s, v16.4s\n"
"zip1 v16.4s, v25.4s, v24.4s\n"
- "zip2 v22.4s, v21.4s, v20.4s\n"
- ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
- ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v22.4s, v21.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v28.4s, v20.4s\n"
".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
"zip2 v18.4s, v27.4s, v26.4s\n"
".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
"zip2 v16.4s, v25.4s, v24.4s\n"
- ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n"
- ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q23, [x21, #0x0]\n"
".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "str q19, [x27, #0x60]\n"
- "str q17, [x27, #0x70]\n"
- "add x27, x27, #0x20\n"
+ "str q21, [x21, #0x10]\n"
+ "str q19, [x21, #0x60]\n"
+ "str q17, [x21, #0x70]\n"
+ "add x21, x21, #0x20\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
- "cmp x28, #0x1\n"
+ "cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
- "ldr s23, [x9], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s17, [x24], #0x4\n"
- "cmp x28, #0x1\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s18, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
- "zip1 v19.4s, v23.4s, v19.4s\n"
- "zip1 v17.4s, v22.4s, v17.4s\n"
- "zip1 v18.4s, v21.4s, v18.4s\n"
- "zip1 v16.4s, v20.4s, v16.4s\n"
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
"zip1 v17.4s, v19.4s, v17.4s\n"
"zip1 v16.4s, v18.4s, v16.4s\n"
- ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.4s, v20.4s, v17.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- "str d17, [x27, #0x0]\n"
- "str d16, [x27, #0x60]\n"
- "add x27, x27, #0x8\n"
+ "str d18, [x21, #0x0]\n"
+ "str d16, [x21, #0x60]\n"
+ "add x21, x21, #0x8\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x8\n"
"add %x[out], %x[out], #0xc0\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
"mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x27, %x[out]\n"
- "add x26, x9, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "csel x25, x25, %x[pad_row], GE\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add %x[in], x26, %x[in_stride]\n"
"csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
"cmp x20, #0x18\n"
- "blt 14f\n"
- "13:" // Tail row loop: Unroll column loop
- "ldr q24, [x9], #0x10\n"
- "ldr q20, [x26], #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q22, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
"sub x20, x20, #0x18\n"
- "ldr q19, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0x18\n"
- "ldr q25, [x9], #0x10\n"
- "ldr q23, [x26], #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "ldr q27, [x9], #0x10\n"
- "zip1 v21.4s, v24.4s, v19.4s\n"
- "zip1 v6.4s, v20.4s, v17.4s\n"
- "ldr q26, [x26], #0x10\n"
- "ldr q22, [x25], #0x10\n"
- "zip2 v19.4s, v24.4s, v19.4s\n"
- "zip2 v0.4s, v20.4s, v17.4s\n"
- "ldr q20, [x24], #0x10\n"
- "ldr q3, [x9], #0x10\n"
- "zip1 v17.4s, v25.4s, v18.4s\n"
- "zip1 v4.4s, v23.4s, v16.4s\n"
- "ldr q1, [x26], #0x10\n"
- "ldr q24, [x25], #0x10\n"
- "zip2 v5.4s, v25.4s, v18.4s\n"
- "zip2 v16.4s, v23.4s, v16.4s\n"
- "ldr q23, [x24], #0x10\n"
- "ldr q30, [x9], #0x10\n"
- "zip1 v31.4s, v27.4s, v22.4s\n"
- "zip1 v2.4s, v26.4s, v20.4s\n"
- "ldr q25, [x26], #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "zip2 v29.4s, v27.4s, v22.4s\n"
- "zip2 v28.4s, v26.4s, v20.4s\n"
- "ldr q20, [x24], #0x10\n"
- "ldr q27, [x9], #0x10\n"
- "zip1 v22.4s, v3.4s, v24.4s\n"
- "zip1 v26.4s, v1.4s, v23.4s\n"
- "ldr q7, [x26], #0x10\n"
- "ldr q11, [x25], #0x10\n"
- "zip2 v24.4s, v3.4s, v24.4s\n"
- "zip2 v14.4s, v1.4s, v23.4s\n"
- "ldr q3, [x24], #0x10\n"
- "zip1 v23.4s, v30.4s, v18.4s\n"
- "zip1 v13.4s, v25.4s, v20.4s\n"
- "zip2 v18.4s, v30.4s, v18.4s\n"
- "zip2 v12.4s, v25.4s, v20.4s\n"
- "zip1 v15.4s, v27.4s, v11.4s\n"
- "zip1 v9.4s, v7.4s, v3.4s\n"
- "zip2 v8.4s, v27.4s, v11.4s\n"
- "zip2 v10.4s, v7.4s, v3.4s\n"
- "zip1 v11.4s, v21.4s, v6.4s\n"
- "zip1 v7.4s, v19.4s, v0.4s\n"
- "zip1 v1.4s, v17.4s, v4.4s\n"
- "zip1 v3.4s, v5.4s, v16.4s\n"
- "zip1 v30.4s, v31.4s, v2.4s\n"
- "zip1 v25.4s, v29.4s, v28.4s\n"
- "zip1 v27.4s, v22.4s, v26.4s\n"
- "zip1 v20.4s, v24.4s, v14.4s\n"
- ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
- "zip2 v6.4s, v21.4s, v6.4s\n"
- "zip1 v21.4s, v23.4s, v13.4s\n"
- ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n"
- "zip2 v0.4s, v19.4s, v0.4s\n"
- "zip1 v19.4s, v18.4s, v12.4s\n"
- ".inst 0x0ea16821 // bfcvtn v1.4h, v1.4s\n"
- "zip2 v4.4s, v17.4s, v4.4s\n"
- "zip1 v17.4s, v15.4s, v9.4s\n"
- ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
- "zip2 v5.4s, v5.4s, v16.4s\n"
- "zip1 v16.4s, v8.4s, v10.4s\n"
- ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
- "zip2 v2.4s, v31.4s, v2.4s\n"
- ".inst 0x0ea16b3f // bfcvtn v31.4h, v25.4s\n"
- "zip2 v29.4s, v29.4s, v28.4s\n"
- ".inst 0x0ea16b7c // bfcvtn v28.4h, v27.4s\n"
- "zip2 v27.4s, v22.4s, v26.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v19.4s, v22.4s, v17.4s\n"
+ "zip1 v21.4s, v18.4s, v16.4s\n"
+ "ldr q24, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v10.4s, v22.4s, v17.4s\n"
+ "zip2 v2.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v24.4s, v17.4s\n"
+ "zip1 v4.4s, v20.4s, v16.4s\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v29.4s, v24.4s, v17.4s\n"
+ "zip2 v1.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v30.4s, v23.4s, v17.4s\n"
+ "zip1 v31.4s, v18.4s, v16.4s\n"
+ "ldr q24, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v23.4s, v23.4s, v17.4s\n"
+ "zip2 v28.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v25.4s, v24.4s, v17.4s\n"
+ "zip1 v26.4s, v20.4s, v16.4s\n"
+ "ldr q14, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v24.4s, v24.4s, v17.4s\n"
+ "zip2 v15.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v12.4s, v14.4s, v17.4s\n"
+ "zip1 v13.4s, v18.4s, v16.4s\n"
+ "ldr q7, [x9], #0x10\n"
+ "ldr q3, [x28], #0x10\n"
+ "zip2 v0.4s, v14.4s, v17.4s\n"
+ "zip2 v9.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v14.4s, v7.4s, v17.4s\n"
+ "zip1 v8.4s, v3.4s, v16.4s\n"
+ "zip2 v7.4s, v7.4s, v17.4s\n"
+ "zip2 v11.4s, v3.4s, v16.4s\n"
+ "zip1 v18.4s, v19.4s, v21.4s\n"
+ "zip1 v6.4s, v10.4s, v2.4s\n"
+ "zip1 v5.4s, v22.4s, v4.4s\n"
+ "zip1 v16.4s, v29.4s, v1.4s\n"
+ "zip1 v27.4s, v30.4s, v31.4s\n"
+ "zip1 v3.4s, v23.4s, v28.4s\n"
+ "zip1 v17.4s, v25.4s, v26.4s\n"
+ "zip1 v20.4s, v24.4s, v15.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "zip2 v19.4s, v19.4s, v21.4s\n"
+ "zip1 v21.4s, v12.4s, v13.4s\n"
+ ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
+ "zip2 v10.4s, v10.4s, v2.4s\n"
+ "zip1 v2.4s, v0.4s, v9.4s\n"
+ ".inst 0x0ea168a5 // bfcvtn v5.4h, v5.4s\n"
+ "zip2 v4.4s, v22.4s, v4.4s\n"
+ "zip1 v22.4s, v14.4s, v8.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v1.4s, v29.4s, v1.4s\n"
+ "zip1 v29.4s, v7.4s, v11.4s\n"
+ ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n"
+ "zip2 v30.4s, v30.4s, v31.4s\n"
+ ".inst 0x0ea1687f // bfcvtn v31.4h, v3.4s\n"
+ "zip2 v23.4s, v23.4s, v28.4s\n"
+ ".inst 0x0ea16a23 // bfcvtn v3.4h, v17.4s\n"
+ "zip2 v28.4s, v25.4s, v26.4s\n"
".inst 0x0ea16a9a // bfcvtn v26.4h, v20.4s\n"
- "zip2 v25.4s, v24.4s, v14.4s\n"
+ "zip2 v25.4s, v24.4s, v15.4s\n"
".inst 0x0ea16ab8 // bfcvtn v24.4h, v21.4s\n"
- "zip2 v22.4s, v23.4s, v13.4s\n"
- ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
- "zip2 v20.4s, v18.4s, v12.4s\n"
- ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
- "zip2 v18.4s, v15.4s, v9.4s\n"
- ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
- "zip2 v16.4s, v8.4s, v10.4s\n"
- ".inst 0x4ea168cb // bfcvtn2 v11.8h, v6.4s\n"
- ".inst 0x4ea16807 // bfcvtn2 v7.8h, v0.4s\n"
- ".inst 0x4ea16881 // bfcvtn2 v1.8h, v4.4s\n"
- ".inst 0x4ea168a3 // bfcvtn2 v3.8h, v5.4s\n"
- ".inst 0x4ea1685e // bfcvtn2 v30.8h, v2.4s\n"
- ".inst 0x4ea16bbf // bfcvtn2 v31.8h, v29.4s\n"
- "str q11, [x27, #0x0]\n"
- "str q7, [x27, #0x10]\n"
- ".inst 0x4ea16b7c // bfcvtn2 v28.8h, v27.4s\n"
+ "zip2 v12.4s, v12.4s, v13.4s\n"
+ ".inst 0x0ea16855 // bfcvtn v21.4h, v2.4s\n"
+ "zip2 v13.4s, v0.4s, v9.4s\n"
+ ".inst 0x0ea16ac2 // bfcvtn v2.4h, v22.4s\n"
+ "zip2 v0.4s, v14.4s, v8.4s\n"
+ ".inst 0x0ea16ba9 // bfcvtn v9.4h, v29.4s\n"
+ "zip2 v17.4s, v7.4s, v11.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ ".inst 0x4ea16946 // bfcvtn2 v6.8h, v10.4s\n"
+ "str q18, [x21, #0x0]\n"
+ ".inst 0x4ea16885 // bfcvtn2 v5.8h, v4.4s\n"
+ ".inst 0x4ea16830 // bfcvtn2 v16.8h, v1.4s\n"
+ "str q6, [x21, #0x10]\n"
+ ".inst 0x4ea16bdb // bfcvtn2 v27.8h, v30.4s\n"
+ ".inst 0x4ea16aff // bfcvtn2 v31.8h, v23.4s\n"
+ "str q5, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ ".inst 0x4ea16b83 // bfcvtn2 v3.8h, v28.4s\n"
".inst 0x4ea16b3a // bfcvtn2 v26.8h, v25.4s\n"
- "str q1, [x27, #0x20]\n"
- ".inst 0x4ea16ad8 // bfcvtn2 v24.8h, v22.4s\n"
- ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
- "str q3, [x27, #0x30]\n"
- ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
- ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
- "str q30, [x27, #0x40]\n"
- "str q31, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q28, [x27, #0x0]\n"
- "str q26, [x27, #0x10]\n"
- "str q24, [x27, #0x20]\n"
- "str q21, [x27, #0x30]\n"
- "str q19, [x27, #0x40]\n"
- "str q17, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Unroll column loop skip
+ "str q27, [x21, #0x40]\n"
+ ".inst 0x4ea16998 // bfcvtn2 v24.8h, v12.4s\n"
+ ".inst 0x4ea169b5 // bfcvtn2 v21.8h, v13.4s\n"
+ "str q31, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ ".inst 0x4ea16802 // bfcvtn2 v2.8h, v0.4s\n"
+ ".inst 0x4ea16a29 // bfcvtn2 v9.8h, v17.4s\n"
+ "str q3, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q24, [x21, #0x20]\n"
+ "str q21, [x21, #0x30]\n"
+ "str q2, [x21, #0x40]\n"
+ "str q9, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
"cmp x20, #0xc\n"
- "blt 16f\n"
- "15:" // Tail row loop: Column loop
- "ldr q24, [x9], #0x10\n"
- "ldr q23, [x26], #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
"sub x20, x20, #0xc\n"
- "ldr q22, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
"cmp x20, #0xc\n"
- "ldr q28, [x9], #0x10\n"
- "ldr q27, [x26], #0x10\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v26.4s, v19.4s, v17.4s\n"
+ "zip1 v25.4s, v18.4s, v16.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v24.4s, v19.4s, v17.4s\n"
+ "zip2 v23.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v2.4s, v21.4s, v17.4s\n"
+ "zip1 v22.4s, v20.4s, v16.4s\n"
"ldr q19, [x9], #0x10\n"
- "zip1 v26.4s, v24.4s, v22.4s\n"
- "zip1 v25.4s, v23.4s, v16.4s\n"
- "ldr q18, [x26], #0x10\n"
- "ldr q17, [x25], #0x10\n"
- "zip2 v24.4s, v24.4s, v22.4s\n"
- "zip2 v23.4s, v23.4s, v16.4s\n"
- "ldr q16, [x24], #0x10\n"
- "zip1 v2.4s, v28.4s, v21.4s\n"
- "zip1 v22.4s, v27.4s, v20.4s\n"
- "zip2 v1.4s, v28.4s, v21.4s\n"
- "zip2 v0.4s, v27.4s, v20.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v1.4s, v21.4s, v17.4s\n"
+ "zip2 v0.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
"zip1 v31.4s, v19.4s, v17.4s\n"
"zip1 v30.4s, v18.4s, v16.4s\n"
"zip2 v29.4s, v19.4s, v17.4s\n"
@@ -665,75 +650,66 @@ void a64_transpose_interleave_12_2x4_fp32bf16(bfloat16 *out, const float *in, si
"zip2 v16.4s, v29.4s, v28.4s\n"
".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
+ "str q27, [x21, #0x0]\n"
".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q25, [x21, #0x10]\n"
".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
- "str q27, [x27, #0x0]\n"
- "str q25, [x27, #0x10]\n"
- "str q23, [x27, #0x20]\n"
- "str q21, [x27, #0x30]\n"
- "str q19, [x27, #0x40]\n"
- "str q17, [x27, #0x50]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q23, [x21, #0x20]\n"
+ "str q21, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q17, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
- "ldr q21, [x9], #0x10\n"
- "ldr q20, [x26], #0x10\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr q20, [x9], #0x10\n"
+ "ldr q19, [x28], #0x10\n"
"sub x20, x20, #0x4\n"
- "ldr q19, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0x4\n"
- "zip1 v18.4s, v21.4s, v19.4s\n"
- "zip1 v16.4s, v20.4s, v17.4s\n"
- "zip2 v21.4s, v21.4s, v19.4s\n"
- "zip2 v20.4s, v20.4s, v17.4s\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "zip2 v19.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v20.4s, v17.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "zip2 v21.4s, v20.4s, v17.4s\n"
+ "zip2 v20.4s, v19.4s, v16.4s\n"
+ "zip1 v17.4s, v22.4s, v18.4s\n"
"zip1 v16.4s, v21.4s, v20.4s\n"
- ".inst 0x0ea16a32 // bfcvtn v18.4h, v17.4s\n"
- "zip2 v17.4s, v21.4s, v20.4s\n"
- ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
- ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
- "str q18, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v22.4s, v18.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q19, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr s19, [x9], #0x4\n"
- "ldr s18, [x26], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
"sub x20, x20, #0x1\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s16, [x24], #0x4\n"
"cmp x20, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
"zip1 v17.4s, v19.4s, v17.4s\n"
"zip1 v16.4s, v18.4s, v16.4s\n"
"zip1 v16.4s, v17.4s, v16.4s\n"
".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- "str d16, [x27, #0x0]\n"
- "add x27, x27, #0x8\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x60\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
index ee37b4378f..0d6f8b1cd4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
@@ -34,236 +34,222 @@ void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t wi
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
"mov x24, %x[width]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x25, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
"cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
"ldr q3, [x25], #0x10\n"
- "ldr q21, [x22], #0x10\n"
+ "ldr q21, [x23], #0x10\n"
+ "sshll2 v20.8h, v3.16b, #0x0\n"
+ "sshll v2.8h, v21.8b, #0x0\n"
+ "ldr q1, [x22], #0x10\n"
+ "ldr q19, [x20], #0x10\n"
+ "sshll2 v18.8h, v1.16b, #0x0\n"
+ "sshll v0.8h, v19.8b, #0x0\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "sshll v31.8h, v17.8b, #0x0\n"
+ "sshll v30.8h, v16.8b, #0x0\n"
+ "ldr d29, [x23], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "sshll2 v27.8h, v21.16b, #0x0\n"
+ "sshll2 v26.8h, v19.16b, #0x0\n"
+ "dup v25.2d, v20.d[0]\n"
+ "dup v24.2d, v2.d[1]\n"
"sub x24, x24, #0x18\n"
- "ldr q2, [x21], #0x10\n"
- "ldr q20, [x20], #0x10\n"
"cmp x24, #0x18\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d17, [x21], #0x8\n"
- "ldr d1, [x22], #0x8\n"
- "sshll2 v18.8h, v3.16b, #0x0\n"
- "sshll v0.8h, v21.8b, #0x0\n"
- "ldr d31, [x20], #0x8\n"
- "sshll2 v16.8h, v2.16b, #0x0\n"
- "sshll v30.8h, v20.8b, #0x0\n"
- "sshll v29.8h, v19.8b, #0x0\n"
- "sshll v28.8h, v17.8b, #0x0\n"
- "sshll2 v27.8h, v21.16b, #0x0\n"
- "sshll2 v26.8h, v20.16b, #0x0\n"
- "dup v17.2d, v18.d[0]\n"
- "dup v25.2d, v0.d[1]\n"
- "dup v24.2d, v16.d[0]\n"
- "dup v23.2d, v30.d[1]\n"
- "dup v22.2d, v18.d[1]\n"
- "dup v21.2d, v29.d[1]\n"
- "dup v20.2d, v16.d[1]\n"
- "dup v19.2d, v28.d[1]\n"
- "sshll v16.8h, v3.8b, #0x0\n"
- "sshll v18.8h, v2.8b, #0x0\n"
- "mov v17.d[1], v0.d[0]\n"
- "mov v25.d[1], v27.d[0]\n"
- "mov v24.d[1], v30.d[0]\n"
- "mov v23.d[1], v26.d[0]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "sshll v17.8h, v1.8b, #0x0\n"
- "sshll v16.8h, v31.8b, #0x0\n"
- "str q25, [x23, #0x20]\n"
- "mov v22.d[1], v29.d[0]\n"
- "mov v21.d[1], v27.d[1]\n"
- "str q18, [x23, #0x30]\n"
- "mov v20.d[1], v28.d[0]\n"
- "mov v19.d[1], v26.d[1]\n"
- "str q24, [x23, #0x40]\n"
- "str q23, [x23, #0x50]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q22, [x23, #0x0]\n"
- "str q21, [x23, #0x10]\n"
- "str q17, [x23, #0x20]\n"
- "str q20, [x23, #0x30]\n"
- "str q19, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "add x23, x23, %x[out_stride]\n"
+ "dup v23.2d, v18.d[0]\n"
+ "dup v22.2d, v0.d[1]\n"
+ "dup v21.2d, v20.d[1]\n"
+ "dup v20.2d, v31.d[1]\n"
+ "dup v19.2d, v18.d[1]\n"
+ "dup v18.2d, v30.d[1]\n"
+ "sshll v17.8h, v3.8b, #0x0\n"
+ "sshll v16.8h, v1.8b, #0x0\n"
+ "str q17, [x21, #0x0]\n"
+ "mov v25.d[1], v2.d[0]\n"
+ "mov v24.d[1], v27.d[0]\n"
+ "str q25, [x21, #0x10]\n"
+ "mov v23.d[1], v0.d[0]\n"
+ "mov v22.d[1], v26.d[0]\n"
+ "str q24, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "sshll v17.8h, v29.8b, #0x0\n"
+ "sshll v16.8h, v28.8b, #0x0\n"
+ "str q23, [x21, #0x40]\n"
+ "mov v21.d[1], v31.d[0]\n"
+ "mov v20.d[1], v27.d[1]\n"
+ "str q22, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "mov v19.d[1], v30.d[0]\n"
+ "mov v18.d[1], v26.d[1]\n"
+ "str q21, [x21, #0x0]\n"
+ "str q20, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q18, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
"cmp x24, #0xc\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr d19, [x22], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"ldr d18, [x20], #0x8\n"
"sub x24, x24, #0xc\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "ldr d17, [x25], #0x8\n"
"cmp x24, #0xc\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
"ld1 { v18.s }[2], [x20], #0x4\n"
- "ldr d16, [x21], #0x8\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v16.s }[2], [x21], #0x4\n"
"sshll v25.8h, v19.8b, #0x0\n"
- "sshll2 v24.8h, v19.16b, #0x0\n"
- "sshll v23.8h, v18.8b, #0x0\n"
+ "sshll v24.8h, v18.8b, #0x0\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "sshll2 v23.8h, v19.16b, #0x0\n"
"sshll2 v22.8h, v18.16b, #0x0\n"
+ "ld1 { v17.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x22], #0x4\n"
"sshll2 v21.8h, v17.16b, #0x0\n"
"sshll2 v20.8h, v16.16b, #0x0\n"
"dup v19.2d, v25.d[1]\n"
- "sshll v18.8h, v17.8b, #0x0\n"
- "dup v17.2d, v23.d[1]\n"
+ "dup v18.2d, v24.d[1]\n"
+ "sshll v17.8h, v17.8b, #0x0\n"
"sshll v16.8h, v16.8b, #0x0\n"
+ "str q17, [x21, #0x0]\n"
"mov v21.d[1], v25.d[0]\n"
- "mov v19.d[1], v24.d[0]\n"
- "mov v20.d[1], v23.d[0]\n"
- "mov v17.d[1], v22.d[0]\n"
- "str q18, [x23, #0x0]\n"
- "str q21, [x23, #0x10]\n"
- "str q19, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q20, [x23, #0x40]\n"
- "str q17, [x23, #0x50]\n"
- "add x23, x23, %x[out_stride]\n"
+ "mov v19.d[1], v23.d[0]\n"
+ "str q21, [x21, #0x10]\n"
+ "mov v20.d[1], v24.d[0]\n"
+ "mov v18.d[1], v22.d[0]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "str q20, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x24, 10f\n"
"cmp x24, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
"ldr s19, [x25], #0x4\n"
- "ldr s18, [x22], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
"sub x24, x24, #0x4\n"
- "ldr s17, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
"cmp x24, #0x4\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
"sshll v19.8h, v19.8b, #0x0\n"
"sshll v18.8h, v18.8b, #0x0\n"
"sshll v17.8h, v17.8b, #0x0\n"
"sshll v16.8h, v16.8b, #0x0\n"
- "str d19, [x23, #0x0]\n"
- "str d18, [x23, #0x18]\n"
- "str d17, [x23, #0x30]\n"
- "str d16, [x23, #0x48]\n"
- "add x23, x23, #0x8\n"
+ "str d19, [x21, #0x0]\n"
+ "str d18, [x21, #0x18]\n"
+ "str d17, [x21, #0x30]\n"
+ "str d16, [x21, #0x48]\n"
+ "add x21, x21, #0x8\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
"cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
"ldr b19, [x25], #0x1\n"
- "ldr b18, [x22], #0x1\n"
+ "ldr b18, [x23], #0x1\n"
"sub x24, x24, #0x1\n"
- "ldr b17, [x21], #0x1\n"
- "ldr b16, [x20], #0x1\n"
"cmp x24, #0x1\n"
+ "ldr b17, [x22], #0x1\n"
+ "ldr b16, [x20], #0x1\n"
"sshll v19.8h, v19.8b, #0x0\n"
"sshll v18.8h, v18.8b, #0x0\n"
"sshll v17.8h, v17.8b, #0x0\n"
"sshll v16.8h, v16.8b, #0x0\n"
- "str h19, [x23, #0x0]\n"
- "str h18, [x23, #0x18]\n"
- "str h17, [x23, #0x30]\n"
- "str h16, [x23, #0x48]\n"
- "add x23, x23, #0x2\n"
+ "str h19, [x21, #0x0]\n"
+ "str h18, [x21, #0x18]\n"
+ "str h17, [x21, #0x30]\n"
+ "str h16, [x21, #0x48]\n"
+ "add x21, x21, #0x2\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x4\n"
"add %x[out], %x[out], #0x60\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x20, #0x18\n"
"add %x[in], x25, %x[in_stride]\n"
- "blt 14f\n"
- "13:" // Tail row loop: Unroll column loop
- "ldr q18, [x25], #0x10\n"
- "sub x20, x20, #0x18\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q20, [x25], #0x10\n"
"ldr d16, [x25], #0x8\n"
+ "sshll2 v19.8h, v20.16b, #0x0\n"
+ "sshll v18.8h, v16.8b, #0x0\n"
+ "dup v17.2d, v19.d[1]\n"
+ "sub x20, x20, #0x18\n"
+ "sshll v16.8h, v20.8b, #0x0\n"
+ "str q16, [x21, #0x0]\n"
+ "dup v16.2d, v19.d[0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
"cmp x20, #0x18\n"
- "sshll2 v17.8h, v18.16b, #0x0\n"
- "sshll v16.8h, v16.8b, #0x0\n"
- "sshll v19.8h, v18.8b, #0x0\n"
- "dup v18.2d, v17.d[1]\n"
- "dup v17.2d, v17.d[0]\n"
- "mov v18.d[1], v16.d[0]\n"
- "dup v16.2d, v16.d[1]\n"
- "str q19, [x23, #0x0]\n"
- "str d17, [x23, #0x10]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q18, [x23, #0x0]\n"
- "str d16, [x23, #0x10]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Unroll column loop skip
+ "mov v17.d[1], v18.d[0]\n"
+ "dup v16.2d, v18.d[1]\n"
+ "str q17, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
"cmp x20, #0xc\n"
- "blt 16f\n"
- "15:" // Tail row loop: Column loop
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
"ldr d16, [x25], #0x8\n"
- "sub x20, x20, #0xc\n"
"ld1 { v16.s }[2], [x25], #0x4\n"
+ "sub x20, x20, #0xc\n"
"cmp x20, #0xc\n"
"sshll v17.8h, v16.8b, #0x0\n"
"sshll2 v16.8h, v16.16b, #0x0\n"
- "str q17, [x23, #0x0]\n"
- "str d16, [x23, #0x10]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q17, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str d16, [x23, #0x10]\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr s16, [x25], #0x4\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
"sshll v16.8h, v16.8b, #0x0\n"
- "str d16, [x23, #0x0]\n"
- "add x23, x23, #0x8\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr b16, [x25], #0x1\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
"sshll v16.8h, v16.8b, #0x0\n"
- "str h16, [x23, #0x0]\n"
- "add x23, x23, #0x2\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x18\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
index 3bfa4a0e42..b263e6c41d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
@@ -34,236 +34,222 @@ void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
"mov x24, %x[width]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x25, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
"cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
"ldr q3, [x25], #0x10\n"
- "ldr q21, [x22], #0x10\n"
+ "ldr q21, [x23], #0x10\n"
+ "ushll2 v20.8h, v3.16b, #0x0\n"
+ "ushll v2.8h, v21.8b, #0x0\n"
+ "ldr q1, [x22], #0x10\n"
+ "ldr q19, [x20], #0x10\n"
+ "ushll2 v18.8h, v1.16b, #0x0\n"
+ "ushll v0.8h, v19.8b, #0x0\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ushll v31.8h, v17.8b, #0x0\n"
+ "ushll v30.8h, v16.8b, #0x0\n"
+ "ldr d29, [x23], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "ushll2 v27.8h, v21.16b, #0x0\n"
+ "ushll2 v26.8h, v19.16b, #0x0\n"
+ "dup v25.2d, v20.d[0]\n"
+ "dup v24.2d, v2.d[1]\n"
"sub x24, x24, #0x18\n"
- "ldr q2, [x21], #0x10\n"
- "ldr q20, [x20], #0x10\n"
"cmp x24, #0x18\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d17, [x21], #0x8\n"
- "ldr d1, [x22], #0x8\n"
- "ushll2 v18.8h, v3.16b, #0x0\n"
- "ushll v0.8h, v21.8b, #0x0\n"
- "ldr d31, [x20], #0x8\n"
- "ushll2 v16.8h, v2.16b, #0x0\n"
- "ushll v30.8h, v20.8b, #0x0\n"
- "ushll v29.8h, v19.8b, #0x0\n"
- "ushll v28.8h, v17.8b, #0x0\n"
- "ushll2 v27.8h, v21.16b, #0x0\n"
- "ushll2 v26.8h, v20.16b, #0x0\n"
- "dup v17.2d, v18.d[0]\n"
- "dup v25.2d, v0.d[1]\n"
- "dup v24.2d, v16.d[0]\n"
- "dup v23.2d, v30.d[1]\n"
- "dup v22.2d, v18.d[1]\n"
- "dup v21.2d, v29.d[1]\n"
- "dup v20.2d, v16.d[1]\n"
- "dup v19.2d, v28.d[1]\n"
- "ushll v16.8h, v3.8b, #0x0\n"
- "ushll v18.8h, v2.8b, #0x0\n"
- "mov v17.d[1], v0.d[0]\n"
- "mov v25.d[1], v27.d[0]\n"
- "mov v24.d[1], v30.d[0]\n"
- "mov v23.d[1], v26.d[0]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "ushll v17.8h, v1.8b, #0x0\n"
- "ushll v16.8h, v31.8b, #0x0\n"
- "str q25, [x23, #0x20]\n"
- "mov v22.d[1], v29.d[0]\n"
- "mov v21.d[1], v27.d[1]\n"
- "str q18, [x23, #0x30]\n"
- "mov v20.d[1], v28.d[0]\n"
- "mov v19.d[1], v26.d[1]\n"
- "str q24, [x23, #0x40]\n"
- "str q23, [x23, #0x50]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q22, [x23, #0x0]\n"
- "str q21, [x23, #0x10]\n"
- "str q17, [x23, #0x20]\n"
- "str q20, [x23, #0x30]\n"
- "str q19, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "add x23, x23, %x[out_stride]\n"
+ "dup v23.2d, v18.d[0]\n"
+ "dup v22.2d, v0.d[1]\n"
+ "dup v21.2d, v20.d[1]\n"
+ "dup v20.2d, v31.d[1]\n"
+ "dup v19.2d, v18.d[1]\n"
+ "dup v18.2d, v30.d[1]\n"
+ "ushll v17.8h, v3.8b, #0x0\n"
+ "ushll v16.8h, v1.8b, #0x0\n"
+ "str q17, [x21, #0x0]\n"
+ "mov v25.d[1], v2.d[0]\n"
+ "mov v24.d[1], v27.d[0]\n"
+ "str q25, [x21, #0x10]\n"
+ "mov v23.d[1], v0.d[0]\n"
+ "mov v22.d[1], v26.d[0]\n"
+ "str q24, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "ushll v17.8h, v29.8b, #0x0\n"
+ "ushll v16.8h, v28.8b, #0x0\n"
+ "str q23, [x21, #0x40]\n"
+ "mov v21.d[1], v31.d[0]\n"
+ "mov v20.d[1], v27.d[1]\n"
+ "str q22, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "mov v19.d[1], v30.d[0]\n"
+ "mov v18.d[1], v26.d[1]\n"
+ "str q21, [x21, #0x0]\n"
+ "str q20, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q18, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
"cmp x24, #0xc\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr d19, [x22], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
"ldr d18, [x20], #0x8\n"
"sub x24, x24, #0xc\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "ldr d17, [x25], #0x8\n"
"cmp x24, #0xc\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
"ld1 { v18.s }[2], [x20], #0x4\n"
- "ldr d16, [x21], #0x8\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
- "ld1 { v16.s }[2], [x21], #0x4\n"
"ushll v25.8h, v19.8b, #0x0\n"
- "ushll2 v24.8h, v19.16b, #0x0\n"
- "ushll v23.8h, v18.8b, #0x0\n"
+ "ushll v24.8h, v18.8b, #0x0\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ushll2 v23.8h, v19.16b, #0x0\n"
"ushll2 v22.8h, v18.16b, #0x0\n"
+ "ld1 { v17.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x22], #0x4\n"
"ushll2 v21.8h, v17.16b, #0x0\n"
"ushll2 v20.8h, v16.16b, #0x0\n"
"dup v19.2d, v25.d[1]\n"
- "ushll v18.8h, v17.8b, #0x0\n"
- "dup v17.2d, v23.d[1]\n"
+ "dup v18.2d, v24.d[1]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
"ushll v16.8h, v16.8b, #0x0\n"
+ "str q17, [x21, #0x0]\n"
"mov v21.d[1], v25.d[0]\n"
- "mov v19.d[1], v24.d[0]\n"
- "mov v20.d[1], v23.d[0]\n"
- "mov v17.d[1], v22.d[0]\n"
- "str q18, [x23, #0x0]\n"
- "str q21, [x23, #0x10]\n"
- "str q19, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q20, [x23, #0x40]\n"
- "str q17, [x23, #0x50]\n"
- "add x23, x23, %x[out_stride]\n"
+ "mov v19.d[1], v23.d[0]\n"
+ "str q21, [x21, #0x10]\n"
+ "mov v20.d[1], v24.d[0]\n"
+ "mov v18.d[1], v22.d[0]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "str q20, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x24, 10f\n"
"cmp x24, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
"ldr s19, [x25], #0x4\n"
- "ldr s18, [x22], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
"sub x24, x24, #0x4\n"
- "ldr s17, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
"cmp x24, #0x4\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
"ushll v19.8h, v19.8b, #0x0\n"
"ushll v18.8h, v18.8b, #0x0\n"
"ushll v17.8h, v17.8b, #0x0\n"
"ushll v16.8h, v16.8b, #0x0\n"
- "str d19, [x23, #0x0]\n"
- "str d18, [x23, #0x18]\n"
- "str d17, [x23, #0x30]\n"
- "str d16, [x23, #0x48]\n"
- "add x23, x23, #0x8\n"
+ "str d19, [x21, #0x0]\n"
+ "str d18, [x21, #0x18]\n"
+ "str d17, [x21, #0x30]\n"
+ "str d16, [x21, #0x48]\n"
+ "add x21, x21, #0x8\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
"cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
"ldr b19, [x25], #0x1\n"
- "ldr b18, [x22], #0x1\n"
+ "ldr b18, [x23], #0x1\n"
"sub x24, x24, #0x1\n"
- "ldr b17, [x21], #0x1\n"
- "ldr b16, [x20], #0x1\n"
"cmp x24, #0x1\n"
+ "ldr b17, [x22], #0x1\n"
+ "ldr b16, [x20], #0x1\n"
"ushll v19.8h, v19.8b, #0x0\n"
"ushll v18.8h, v18.8b, #0x0\n"
"ushll v17.8h, v17.8b, #0x0\n"
"ushll v16.8h, v16.8b, #0x0\n"
- "str h19, [x23, #0x0]\n"
- "str h18, [x23, #0x18]\n"
- "str h17, [x23, #0x30]\n"
- "str h16, [x23, #0x48]\n"
- "add x23, x23, #0x2\n"
+ "str h19, [x21, #0x0]\n"
+ "str h18, [x21, #0x18]\n"
+ "str h17, [x21, #0x30]\n"
+ "str h16, [x21, #0x48]\n"
+ "add x21, x21, #0x2\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x4\n"
"add %x[out], %x[out], #0x60\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x20, #0x18\n"
"add %x[in], x25, %x[in_stride]\n"
- "blt 14f\n"
- "13:" // Tail row loop: Unroll column loop
- "ldr q18, [x25], #0x10\n"
- "sub x20, x20, #0x18\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q20, [x25], #0x10\n"
"ldr d16, [x25], #0x8\n"
+ "ushll2 v19.8h, v20.16b, #0x0\n"
+ "ushll v18.8h, v16.8b, #0x0\n"
+ "dup v17.2d, v19.d[1]\n"
+ "sub x20, x20, #0x18\n"
+ "ushll v16.8h, v20.8b, #0x0\n"
+ "str q16, [x21, #0x0]\n"
+ "dup v16.2d, v19.d[0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
"cmp x20, #0x18\n"
- "ushll2 v17.8h, v18.16b, #0x0\n"
- "ushll v16.8h, v16.8b, #0x0\n"
- "ushll v19.8h, v18.8b, #0x0\n"
- "dup v18.2d, v17.d[1]\n"
- "dup v17.2d, v17.d[0]\n"
- "mov v18.d[1], v16.d[0]\n"
- "dup v16.2d, v16.d[1]\n"
- "str q19, [x23, #0x0]\n"
- "str d17, [x23, #0x10]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q18, [x23, #0x0]\n"
- "str d16, [x23, #0x10]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Unroll column loop skip
+ "mov v17.d[1], v18.d[0]\n"
+ "dup v16.2d, v18.d[1]\n"
+ "str q17, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
"cmp x20, #0xc\n"
- "blt 16f\n"
- "15:" // Tail row loop: Column loop
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
"ldr d16, [x25], #0x8\n"
- "sub x20, x20, #0xc\n"
"ld1 { v16.s }[2], [x25], #0x4\n"
+ "sub x20, x20, #0xc\n"
"cmp x20, #0xc\n"
"ushll v17.8h, v16.8b, #0x0\n"
"ushll2 v16.8h, v16.16b, #0x0\n"
- "str q17, [x23, #0x0]\n"
- "str d16, [x23, #0x10]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q17, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str d16, [x23, #0x10]\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr s16, [x25], #0x4\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
"ushll v16.8h, v16.8b, #0x0\n"
- "str d16, [x23, #0x0]\n"
- "add x23, x23, #0x8\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr b16, [x25], #0x1\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
"ushll v16.8h, v16.8b, #0x0\n"
- "str h16, [x23, #0x0]\n"
- "add x23, x23, #0x2\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x18\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
index 7c79c5f7f0..087dc923e8 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
@@ -40,16 +40,14 @@ void a64_transpose_interleave_16_1x4(uint8_t *out, const uint8_t *in, size_t wid
__asm__ __volatile__(
"cmp %x[height], #0x10\n"
- "blt 9f\n"
+ "blt 8f\n"
"1:" // Main row loop: Head
"mov x17, %x[in]\n"
- "mov x16, %x[width]\n"
- "mov x15, %x[out]\n"
- "sub %x[height], %x[height], #0x10\n"
- "add x14, x17, %x[in_stride]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "add x14, x15, %x[in_stride]\n"
"add x13, x14, %x[in_stride]\n"
"add x12, x13, %x[in_stride]\n"
- "cmp x16, #0x10\n"
"add x11, x12, %x[in_stride]\n"
"add x10, x11, %x[in_stride]\n"
"add x9, x10, %x[in_stride]\n"
@@ -57,268 +55,244 @@ void a64_transpose_interleave_16_1x4(uint8_t *out, const uint8_t *in, size_t wid
"add x27, x28, %x[in_stride]\n"
"add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x10\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x10\n"
"blt 3f\n"
"2:" // Main row loop: Column loop
- "ldr q19, [x17], #0x10\n"
- "ldr q18, [x14], #0x10\n"
- "sub x16, x16, #0x10\n"
- "ldr q17, [x13], #0x10\n"
- "ldr q16, [x12], #0x10\n"
- "cmp x16, #0x10\n"
- "ldr q24, [x11], #0x10\n"
- "ldr q23, [x10], #0x10\n"
- "ldr q22, [x9], #0x10\n"
- "ldr q21, [x28], #0x10\n"
- "ldr q30, [x27], #0x10\n"
- "ldr q29, [x26], #0x10\n"
- "zip1 v3.16b, v19.16b, v17.16b\n"
- "zip1 v2.16b, v18.16b, v16.16b\n"
- "ldr q28, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "zip2 v1.16b, v19.16b, v17.16b\n"
- "zip2 v27.16b, v18.16b, v16.16b\n"
- "ldr q19, [x23], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v26.16b, v24.16b, v22.16b\n"
- "zip1 v25.16b, v23.16b, v21.16b\n"
- "ldr q17, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip2 v24.16b, v24.16b, v22.16b\n"
- "zip2 v23.16b, v23.16b, v21.16b\n"
- "zip1 v22.16b, v30.16b, v28.16b\n"
- "zip1 v21.16b, v29.16b, v20.16b\n"
- "zip2 v0.16b, v30.16b, v28.16b\n"
- "zip2 v20.16b, v29.16b, v20.16b\n"
+ "ldr q21, [x17], #0x10\n"
+ "ldr q20, [x16], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q17, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v3.16b, v21.16b, v17.16b\n"
+ "zip1 v2.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x13], #0x10\n"
+ "ldr q18, [x12], #0x10\n"
+ "zip2 v1.16b, v21.16b, v17.16b\n"
+ "zip2 v0.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x11], #0x10\n"
+ "ldr q16, [x10], #0x10\n"
"zip1 v31.16b, v19.16b, v17.16b\n"
"zip1 v30.16b, v18.16b, v16.16b\n"
- "zip2 v29.16b, v19.16b, v17.16b\n"
- "zip2 v28.16b, v18.16b, v16.16b\n"
- "zip1 v19.16b, v3.16b, v2.16b\n"
- "zip2 v18.16b, v3.16b, v2.16b\n"
- "zip1 v17.16b, v1.16b, v27.16b\n"
- "zip2 v16.16b, v1.16b, v27.16b\n"
- "zip1 v27.16b, v26.16b, v25.16b\n"
- "zip2 v26.16b, v26.16b, v25.16b\n"
- "zip1 v25.16b, v24.16b, v23.16b\n"
+ "ldr q25, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v24.16b, v19.16b, v17.16b\n"
+ "zip2 v23.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.16b, v25.16b, v17.16b\n"
+ "zip1 v21.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v29.16b, v25.16b, v17.16b\n"
+ "zip2 v20.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v28.16b, v19.16b, v17.16b\n"
+ "zip1 v27.16b, v18.16b, v16.16b\n"
+ "zip2 v26.16b, v19.16b, v17.16b\n"
+ "zip2 v25.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v3.16b, v2.16b\n"
+ "zip2 v17.16b, v3.16b, v2.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.16b, v1.16b, v0.16b\n"
+ "zip2 v19.16b, v1.16b, v0.16b\n"
+ "str q17, [x21, #0x10]\n"
+ "zip1 v18.16b, v31.16b, v30.16b\n"
+ "zip2 v17.16b, v31.16b, v30.16b\n"
+ "str q16, [x21, #0x20]\n"
+ "zip1 v16.16b, v24.16b, v23.16b\n"
"zip2 v24.16b, v24.16b, v23.16b\n"
- "str q19, [x15, #0x0]\n"
+ "str q19, [x21, #0x30]\n"
"zip1 v23.16b, v22.16b, v21.16b\n"
"zip2 v22.16b, v22.16b, v21.16b\n"
- "str q18, [x15, #0x10]\n"
- "zip1 v21.16b, v0.16b, v20.16b\n"
- "zip2 v20.16b, v0.16b, v20.16b\n"
- "str q17, [x15, #0x20]\n"
- "zip1 v19.16b, v31.16b, v30.16b\n"
- "zip2 v18.16b, v31.16b, v30.16b\n"
- "str q16, [x15, #0x30]\n"
- "zip1 v17.16b, v29.16b, v28.16b\n"
- "zip2 v16.16b, v29.16b, v28.16b\n"
- "str q27, [x15, #0x40]\n"
- "str q26, [x15, #0x50]\n"
- "str q25, [x15, #0x60]\n"
- "str q24, [x15, #0x70]\n"
- "str q23, [x15, #0x80]\n"
- "str q22, [x15, #0x90]\n"
- "str q21, [x15, #0xa0]\n"
- "str q20, [x15, #0xb0]\n"
- "str q19, [x15, #0xc0]\n"
- "str q18, [x15, #0xd0]\n"
- "str q17, [x15, #0xe0]\n"
- "str q16, [x15, #0xf0]\n"
- "add x15, x15, %x[out_stride]\n"
+ "str q18, [x21, #0x40]\n"
+ "zip1 v21.16b, v29.16b, v20.16b\n"
+ "zip2 v20.16b, v29.16b, v20.16b\n"
+ "str q17, [x21, #0x50]\n"
+ "zip1 v19.16b, v28.16b, v27.16b\n"
+ "zip2 v18.16b, v28.16b, v27.16b\n"
+ "str q16, [x21, #0x60]\n"
+ "zip1 v17.16b, v26.16b, v25.16b\n"
+ "zip2 v16.16b, v26.16b, v25.16b\n"
+ "str q24, [x21, #0x70]\n"
+ "str q23, [x21, #0x80]\n"
+ "str q22, [x21, #0x90]\n"
+ "str q21, [x21, #0xa0]\n"
+ "str q20, [x21, #0xb0]\n"
+ "str q19, [x21, #0xc0]\n"
+ "str q18, [x21, #0xd0]\n"
+ "str q17, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Column loop skip
- "cbz x16, 8f\n"
- "cmp x16, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x15, #0x0]\n"
- "str q16, [x15, #0x10]\n"
- "str q16, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
- "str q16, [x15, #0x40]\n"
- "str q16, [x15, #0x50]\n"
- "str q16, [x15, #0x60]\n"
- "str q16, [x15, #0x70]\n"
- "str q16, [x15, #0x80]\n"
- "str q16, [x15, #0x90]\n"
- "str q16, [x15, #0xa0]\n"
- "str q16, [x15, #0xb0]\n"
- "str q16, [x15, #0xc0]\n"
- "str q16, [x15, #0xd0]\n"
- "str q16, [x15, #0xe0]\n"
- "str q16, [x15, #0xf0]\n"
+ "cmp x24, #0x4\n"
"blt 5f\n"
"4:" // Main row loop: width 4 loop: loop
- "ldr s23, [x17], #0x4\n"
- "ldr s21, [x14], #0x4\n"
- "sub x16, x16, #0x4\n"
- "ldr s20, [x13], #0x4\n"
- "ldr s19, [x12], #0x4\n"
- "cmp x16, #0x4\n"
- "ldr s22, [x11], #0x4\n"
- "ldr s18, [x10], #0x4\n"
- "ldr s17, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s27, [x27], #0x4\n"
- "ldr s26, [x26], #0x4\n"
- "zip1 v25.16b, v23.16b, v20.16b\n"
- "zip1 v21.16b, v21.16b, v19.16b\n"
+ "ldr s19, [x17], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x13], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x11], #0x4\n"
+ "ldr s16, [x10], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
"ldr s20, [x25], #0x4\n"
- "ldr s19, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "zip1 v22.16b, v22.16b, v17.16b\n"
- "zip1 v17.16b, v18.16b, v16.16b\n"
- "ldr s18, [x21], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
"ldr s16, [x20], #0x4\n"
- "zip1 v21.16b, v25.16b, v21.16b\n"
- "zip1 v20.16b, v27.16b, v20.16b\n"
- "zip1 v19.16b, v26.16b, v19.16b\n"
- "zip1 v17.16b, v22.16b, v17.16b\n"
- "zip1 v18.16b, v24.16b, v18.16b\n"
- "zip1 v16.16b, v23.16b, v16.16b\n"
- "str q21, [x15, #0x0]\n"
- "str q17, [x15, #0x40]\n"
- "zip1 v17.16b, v20.16b, v19.16b\n"
- "zip1 v16.16b, v18.16b, v16.16b\n"
- "str q17, [x15, #0x80]\n"
- "str q16, [x15, #0xc0]\n"
- "add x15, x15, #0x10\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str q22, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q21, [x21, #0x40]\n"
+ "str q18, [x21, #0x80]\n"
+ "str q16, [x21, #0xc0]\n"
+ "add x21, x21, #0x10\n"
"bge 4b\n"
"5:" // Main row loop: width 4 loop: skip
- "cmp x16, #0x1\n"
+ "cmp x24, #0x1\n"
"blt 7f\n"
"6:" // Main row loop: width 1 loop: loop
- "ldr b23, [x17], #0x1\n"
- "ldr b21, [x14], #0x1\n"
- "sub x16, x16, #0x1\n"
- "ldr b20, [x13], #0x1\n"
- "ldr b19, [x12], #0x1\n"
- "cmp x16, #0x1\n"
- "ldr b22, [x11], #0x1\n"
- "ldr b18, [x10], #0x1\n"
- "ldr b17, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b27, [x27], #0x1\n"
- "ldr b26, [x26], #0x1\n"
- "zip1 v25.16b, v23.16b, v20.16b\n"
- "zip1 v21.16b, v21.16b, v19.16b\n"
+ "ldr b19, [x17], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x13], #0x1\n"
+ "ldr b18, [x12], #0x1\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x11], #0x1\n"
+ "ldr b16, [x10], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
"ldr b20, [x25], #0x1\n"
- "ldr b19, [x24], #0x1\n"
- "ldr b24, [x23], #0x1\n"
- "ldr b23, [x22], #0x1\n"
- "zip1 v22.16b, v22.16b, v17.16b\n"
- "zip1 v17.16b, v18.16b, v16.16b\n"
- "ldr b18, [x21], #0x1\n"
+ "ldr b19, [x23], #0x1\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x22], #0x1\n"
"ldr b16, [x20], #0x1\n"
- "zip1 v21.16b, v25.16b, v21.16b\n"
- "zip1 v20.16b, v27.16b, v20.16b\n"
- "zip1 v19.16b, v26.16b, v19.16b\n"
- "zip1 v17.16b, v22.16b, v17.16b\n"
- "zip1 v18.16b, v24.16b, v18.16b\n"
- "zip1 v16.16b, v23.16b, v16.16b\n"
- "str s21, [x15, #0x0]\n"
- "str s17, [x15, #0x40]\n"
- "zip1 v17.16b, v20.16b, v19.16b\n"
- "zip1 v16.16b, v18.16b, v16.16b\n"
- "str s17, [x15, #0x80]\n"
- "str s16, [x15, #0xc0]\n"
- "add x15, x15, #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s22, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s21, [x21, #0x40]\n"
+ "str s18, [x21, #0x80]\n"
+ "str s16, [x21, #0xc0]\n"
+ "add x21, x21, #0x4\n"
"bge 6b\n"
"7:" // Main row loop: width 1 loop: skip
- "8:" // Main row loop: odd col skip
"cmp %x[height], #0x10\n"
"add %x[out], %x[out], #0x100\n"
"bge 1b\n"
- "cbz %x[height], 18f\n"
- "9:" // Main loop skip
- "10:" // Tail row loop: Head
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+ "9:" // Tail row loop: Head
"mov x17, %x[in]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
"mov x20, %x[width]\n"
+ "add x14, x15, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x15, %x[out]\n"
- "add x14, x17, %x[in_stride]\n"
- "add x13, x14, %x[in_stride]\n"
- "add x12, x13, %x[in_stride]\n"
- "csel x13, x13, %x[pad_row], GE\n"
- "add %x[in], x12, %x[in_stride]\n"
- "csel x12, x12, %x[pad_row], GT\n"
- "cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add %x[in], x14, %x[in_stride]\n"
"csel x14, x14, %x[pad_row], GT\n"
+ "csel x15, x15, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x16, x16, %x[pad_row], GT\n"
"cmp x20, #0x10\n"
- "blt 12f\n"
- "11:" // Tail row loop: Column loop
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Column loop
"ldr q20, [x17], #0x10\n"
- "ldr q21, [x14], #0x10\n"
+ "ldr q21, [x16], #0x10\n"
"sub x20, x20, #0x10\n"
- "ldr q19, [x13], #0x10\n"
- "ldr q16, [x12], #0x10\n"
"cmp x20, #0x10\n"
+ "ldr q19, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
"zip1 v18.16b, v20.16b, v19.16b\n"
"zip1 v17.16b, v21.16b, v16.16b\n"
"zip2 v20.16b, v20.16b, v19.16b\n"
- "zip2 v16.16b, v21.16b, v16.16b\n"
- "zip1 v19.16b, v18.16b, v17.16b\n"
+ "zip2 v19.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
"zip2 v18.16b, v18.16b, v17.16b\n"
- "zip1 v17.16b, v20.16b, v16.16b\n"
- "zip2 v16.16b, v20.16b, v16.16b\n"
- "str q19, [x15, #0x0]\n"
- "str q18, [x15, #0x10]\n"
- "str q17, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
- "bge 11b\n"
- "12:" // Tail row loop: Column loop skip
- "cbz x20, 17f\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip2 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x15, #0x0]\n"
- "str q16, [x15, #0x10]\n"
- "str q16, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
- "blt 14f\n"
- "13:" // Tail row loop: width 4 loop: loop
+ "blt 13f\n"
+ "12:" // Tail row loop: width 4 loop: loop
"ldr s19, [x17], #0x4\n"
- "ldr s18, [x14], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
"sub x20, x20, #0x4\n"
- "ldr s17, [x13], #0x4\n"
- "ldr s16, [x12], #0x4\n"
"cmp x20, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str q16, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
- "bge 13b\n"
- "14:" // Tail row loop: width 4 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 16f\n"
- "15:" // Tail row loop: width 1 loop: loop
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
"ldr b19, [x17], #0x1\n"
- "ldr b18, [x14], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
"sub x20, x20, #0x1\n"
- "ldr b17, [x13], #0x1\n"
- "ldr b16, [x12], #0x1\n"
"cmp x20, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str s16, [x15, #0x0]\n"
- "add x15, x15, #0x4\n"
- "bge 15b\n"
- "16:" // Tail row loop: width 1 loop: skip
- "17:" // Tail row loop: odd col skip
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x40\n"
- "bge 10b\n"
- "18:" // Done
+ "bge 9b\n"
+ "16:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
index 586696fcc5..93c95048a8 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
@@ -41,232 +41,221 @@ void a64_transpose_interleave_16_1x8(uint8_t *out, const uint8_t *in, size_t wid
__asm__ __volatile__(
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
- "cmp %x[height], #0x7\n"
- "mov x28, %x[width]\n"
- "mov x27, %x[out]\n"
- "add x26, x9, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
"add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
- "csel x21, x21, %x[pad_row], GE\n"
- "add %x[in], x20, %x[in_stride]\n"
- "csel x20, x20, %x[pad_row], GT\n"
- "cmp %x[height], #0x5\n"
+ "cmp %x[height], #0x7\n"
+ "add %x[in], x22, %x[in_stride]\n"
"csel x22, x22, %x[pad_row], GT\n"
"csel x23, x23, %x[pad_row], GE\n"
- "cmp %x[height], #0x3\n"
+ "cmp %x[height], #0x5\n"
+ "mov x21, %x[width]\n"
"csel x24, x24, %x[pad_row], GT\n"
"csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x21, #0x20\n"
+ "mov x20, %x[out]\n"
"sub %x[height], %x[height], #0x8\n"
- "csel x26, x26, %x[pad_row], GT\n"
- "cmp x28, #0x20\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q24, [x9], #0x10\n"
- "ldr q0, [x26], #0x10\n"
- "sub x28, x28, #0x20\n"
- "ldr q31, [x25], #0x10\n"
- "ldr q30, [x24], #0x10\n"
- "cmp x28, #0x20\n"
- "ldr q23, [x23], #0x10\n"
- "ldr q29, [x22], #0x10\n"
- "ldr q22, [x21], #0x10\n"
- "ldr q21, [x20], #0x10\n"
- "ldr q28, [x9], #0x10\n"
- "ldr q4, [x26], #0x10\n"
- "ldr q27, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "zip1 v3.16b, v24.16b, v23.16b\n"
- "zip1 v2.16b, v0.16b, v29.16b\n"
- "ldr q19, [x23], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v26.16b, v31.16b, v22.16b\n"
- "zip1 v25.16b, v30.16b, v21.16b\n"
- "ldr q17, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip2 v24.16b, v24.16b, v23.16b\n"
- "zip2 v23.16b, v31.16b, v22.16b\n"
- "zip2 v22.16b, v0.16b, v29.16b\n"
- "zip2 v21.16b, v30.16b, v21.16b\n"
- "zip1 v0.16b, v28.16b, v19.16b\n"
- "zip1 v31.16b, v4.16b, v18.16b\n"
- "zip1 v30.16b, v27.16b, v17.16b\n"
- "zip1 v29.16b, v20.16b, v16.16b\n"
- "zip2 v1.16b, v28.16b, v19.16b\n"
- "zip2 v28.16b, v27.16b, v17.16b\n"
- "zip2 v27.16b, v4.16b, v18.16b\n"
- "zip2 v20.16b, v20.16b, v16.16b\n"
- "zip1 v19.16b, v3.16b, v26.16b\n"
- "zip1 v18.16b, v2.16b, v25.16b\n"
- "zip2 v17.16b, v3.16b, v26.16b\n"
- "zip2 v16.16b, v2.16b, v25.16b\n"
- "zip1 v26.16b, v24.16b, v23.16b\n"
- "zip1 v25.16b, v22.16b, v21.16b\n"
- "zip2 v24.16b, v24.16b, v23.16b\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q22, [x28], #0x10\n"
+ "sub x21, x21, #0x20\n"
+ "cmp x21, #0x20\n"
+ "ldr q20, [x27], #0x10\n"
+ "ldr q21, [x26], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "zip1 v5.16b, v23.16b, v19.16b\n"
+ "zip1 v4.16b, v22.16b, v18.16b\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v3.16b, v20.16b, v17.16b\n"
+ "zip1 v31.16b, v21.16b, v16.16b\n"
+ "ldr q25, [x9], #0x10\n"
+ "ldr q24, [x28], #0x10\n"
+ "zip2 v2.16b, v23.16b, v19.16b\n"
+ "zip2 v30.16b, v20.16b, v17.16b\n"
+ "ldr q23, [x27], #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "zip2 v22.16b, v22.16b, v18.16b\n"
+ "zip2 v21.16b, v21.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "zip1 v29.16b, v25.16b, v19.16b\n"
+ "zip1 v28.16b, v24.16b, v18.16b\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v27.16b, v23.16b, v17.16b\n"
+ "zip1 v26.16b, v20.16b, v16.16b\n"
+ "zip2 v1.16b, v25.16b, v19.16b\n"
+ "zip2 v25.16b, v23.16b, v17.16b\n"
+ "zip2 v24.16b, v24.16b, v18.16b\n"
+ "zip2 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v0.16b, v5.16b, v3.16b\n"
+ "zip1 v17.16b, v4.16b, v31.16b\n"
+ "zip2 v20.16b, v5.16b, v3.16b\n"
+ "zip2 v19.16b, v4.16b, v31.16b\n"
+ "zip1 v31.16b, v2.16b, v30.16b\n"
+ "zip1 v18.16b, v22.16b, v21.16b\n"
+ "zip2 v30.16b, v2.16b, v30.16b\n"
"zip2 v23.16b, v22.16b, v21.16b\n"
- "zip1 v22.16b, v0.16b, v30.16b\n"
- "zip1 v21.16b, v31.16b, v29.16b\n"
- "zip2 v0.16b, v0.16b, v30.16b\n"
- "zip2 v31.16b, v31.16b, v29.16b\n"
- "zip1 v30.16b, v1.16b, v28.16b\n"
- "zip1 v29.16b, v27.16b, v20.16b\n"
- "zip2 v28.16b, v1.16b, v28.16b\n"
- "zip2 v27.16b, v27.16b, v20.16b\n"
- "zip1 v20.16b, v19.16b, v18.16b\n"
- "zip2 v19.16b, v19.16b, v18.16b\n"
- "zip1 v18.16b, v17.16b, v16.16b\n"
- "zip2 v17.16b, v17.16b, v16.16b\n"
- "zip1 v16.16b, v26.16b, v25.16b\n"
- "zip2 v26.16b, v26.16b, v25.16b\n"
- "zip1 v25.16b, v24.16b, v23.16b\n"
- "zip2 v24.16b, v24.16b, v23.16b\n"
- "str q20, [x27, #0x0]\n"
- "str q19, [x27, #0x10]\n"
+ "zip1 v22.16b, v29.16b, v27.16b\n"
+ "zip1 v21.16b, v28.16b, v26.16b\n"
+ "zip2 v29.16b, v29.16b, v27.16b\n"
+ "zip2 v28.16b, v28.16b, v26.16b\n"
+ "zip1 v27.16b, v1.16b, v25.16b\n"
+ "zip1 v26.16b, v24.16b, v16.16b\n"
+ "zip2 v25.16b, v1.16b, v25.16b\n"
+ "zip2 v24.16b, v24.16b, v16.16b\n"
+ "zip1 v16.16b, v0.16b, v17.16b\n"
+ "zip2 v17.16b, v0.16b, v17.16b\n"
+ "str q16, [x20, #0x0]\n"
+ "zip1 v16.16b, v20.16b, v19.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "str q17, [x20, #0x10]\n"
+ "zip1 v19.16b, v31.16b, v18.16b\n"
+ "zip2 v18.16b, v31.16b, v18.16b\n"
+ "str q16, [x20, #0x20]\n"
+ "zip1 v17.16b, v30.16b, v23.16b\n"
+ "zip2 v16.16b, v30.16b, v23.16b\n"
+ "str q20, [x20, #0x30]\n"
+ "str q19, [x20, #0x40]\n"
"zip1 v23.16b, v22.16b, v21.16b\n"
"zip2 v22.16b, v22.16b, v21.16b\n"
- "str q18, [x27, #0x20]\n"
- "zip1 v21.16b, v0.16b, v31.16b\n"
- "zip2 v20.16b, v0.16b, v31.16b\n"
- "str q17, [x27, #0x30]\n"
- "zip1 v19.16b, v30.16b, v29.16b\n"
- "zip2 v18.16b, v30.16b, v29.16b\n"
- "str q16, [x27, #0x40]\n"
- "zip1 v17.16b, v28.16b, v27.16b\n"
- "zip2 v16.16b, v28.16b, v27.16b\n"
- "str q26, [x27, #0x50]\n"
- "str q25, [x27, #0x60]\n"
- "str q24, [x27, #0x70]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q23, [x27, #0x0]\n"
- "str q22, [x27, #0x10]\n"
- "str q21, [x27, #0x20]\n"
- "str q20, [x27, #0x30]\n"
- "str q19, [x27, #0x40]\n"
- "str q18, [x27, #0x50]\n"
- "str q17, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "add x27, x27, %x[out_stride]\n"
+ "str q18, [x20, #0x50]\n"
+ "zip1 v21.16b, v29.16b, v28.16b\n"
+ "zip2 v20.16b, v29.16b, v28.16b\n"
+ "str q17, [x20, #0x60]\n"
+ "zip1 v19.16b, v27.16b, v26.16b\n"
+ "zip2 v18.16b, v27.16b, v26.16b\n"
+ "str q16, [x20, #0x70]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 v17.16b, v25.16b, v24.16b\n"
+ "zip2 v16.16b, v25.16b, v24.16b\n"
+ "str q23, [x20, #0x0]\n"
+ "str q22, [x20, #0x10]\n"
+ "str q21, [x20, #0x20]\n"
+ "str q20, [x20, #0x30]\n"
+ "str q19, [x20, #0x40]\n"
+ "str q18, [x20, #0x50]\n"
+ "str q17, [x20, #0x60]\n"
+ "str q16, [x20, #0x70]\n"
+ "add x20, x20, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cmp x28, #0x10\n"
+ "cmp x21, #0x10\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr q24, [x9], #0x10\n"
- "ldr q25, [x26], #0x10\n"
- "sub x28, x28, #0x10\n"
- "ldr q23, [x25], #0x10\n"
- "ldr q22, [x24], #0x10\n"
- "cmp x28, #0x10\n"
- "ldr q18, [x23], #0x10\n"
- "ldr q17, [x22], #0x10\n"
- "ldr q21, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v20.16b, v24.16b, v18.16b\n"
- "zip1 v19.16b, v25.16b, v17.16b\n"
- "zip2 v24.16b, v24.16b, v18.16b\n"
- "zip2 v25.16b, v25.16b, v17.16b\n"
- "zip1 v18.16b, v23.16b, v21.16b\n"
- "zip1 v17.16b, v22.16b, v16.16b\n"
- "zip2 v23.16b, v23.16b, v21.16b\n"
- "zip2 v16.16b, v22.16b, v16.16b\n"
- "zip1 v22.16b, v20.16b, v18.16b\n"
- "zip1 v21.16b, v19.16b, v17.16b\n"
- "zip2 v20.16b, v20.16b, v18.16b\n"
- "zip2 v19.16b, v19.16b, v17.16b\n"
- "zip1 v18.16b, v24.16b, v23.16b\n"
- "zip1 v17.16b, v25.16b, v16.16b\n"
- "zip2 v24.16b, v24.16b, v23.16b\n"
- "zip2 v16.16b, v25.16b, v16.16b\n"
- "zip1 v23.16b, v22.16b, v21.16b\n"
- "zip2 v22.16b, v22.16b, v21.16b\n"
- "zip1 v21.16b, v20.16b, v19.16b\n"
- "zip2 v20.16b, v20.16b, v19.16b\n"
- "zip1 v19.16b, v18.16b, v17.16b\n"
- "zip2 v18.16b, v18.16b, v17.16b\n"
- "zip1 v17.16b, v24.16b, v16.16b\n"
+ "ldr q25, [x9], #0x10\n"
+ "ldr q27, [x28], #0x10\n"
+ "sub x21, x21, #0x10\n"
+ "cmp x21, #0x10\n"
+ "ldr q26, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "ldr q22, [x25], #0x10\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip1 v20.16b, v25.16b, v22.16b\n"
+ "zip1 v23.16b, v27.16b, v21.16b\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v19.16b, v26.16b, v17.16b\n"
+ "zip1 v18.16b, v24.16b, v16.16b\n"
+ "zip2 v25.16b, v25.16b, v22.16b\n"
+ "zip2 v22.16b, v26.16b, v17.16b\n"
+ "zip2 v21.16b, v27.16b, v21.16b\n"
"zip2 v16.16b, v24.16b, v16.16b\n"
- "str q23, [x27, #0x0]\n"
- "str q22, [x27, #0x10]\n"
- "str q21, [x27, #0x20]\n"
- "str q20, [x27, #0x30]\n"
- "str q19, [x27, #0x40]\n"
- "str q18, [x27, #0x50]\n"
- "str q17, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip1 v24.16b, v20.16b, v19.16b\n"
+ "zip1 v17.16b, v23.16b, v18.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "zip2 v19.16b, v23.16b, v18.16b\n"
+ "zip1 v23.16b, v25.16b, v22.16b\n"
+ "zip1 v18.16b, v21.16b, v16.16b\n"
+ "zip2 v22.16b, v25.16b, v22.16b\n"
+ "zip2 v21.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v24.16b, v17.16b\n"
+ "zip2 v17.16b, v24.16b, v17.16b\n"
+ "str q16, [x20, #0x0]\n"
+ "zip1 v16.16b, v20.16b, v19.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "str q17, [x20, #0x10]\n"
+ "zip1 v19.16b, v23.16b, v18.16b\n"
+ "zip2 v18.16b, v23.16b, v18.16b\n"
+ "str q16, [x20, #0x20]\n"
+ "zip1 v17.16b, v22.16b, v21.16b\n"
+ "zip2 v16.16b, v22.16b, v21.16b\n"
+ "str q20, [x20, #0x30]\n"
+ "str q19, [x20, #0x40]\n"
+ "str q18, [x20, #0x50]\n"
+ "str q17, [x20, #0x60]\n"
+ "str q16, [x20, #0x70]\n"
+ "add x20, x20, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x28, 10f\n"
- "cmp x28, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "str q16, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
+ "cmp x21, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
- "ldr s23, [x9], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "sub x28, x28, #0x4\n"
- "ldr s21, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "cmp x28, #0x4\n"
- "ldr s18, [x23], #0x4\n"
- "ldr s19, [x22], #0x4\n"
- "ldr s17, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
- "zip1 v18.16b, v23.16b, v18.16b\n"
- "zip1 v19.16b, v22.16b, v19.16b\n"
+ "ldr s18, [x9], #0x4\n"
+ "ldr s19, [x28], #0x4\n"
+ "sub x21, x21, #0x4\n"
+ "cmp x21, #0x4\n"
+ "ldr s21, [x27], #0x4\n"
+ "ldr s20, [x26], #0x4\n"
+ "ldr s17, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "zip1 v19.16b, v19.16b, v16.16b\n"
+ "ldr s17, [x23], #0x4\n"
+ "ldr s16, [x22], #0x4\n"
"zip1 v17.16b, v21.16b, v17.16b\n"
"zip1 v16.16b, v20.16b, v16.16b\n"
"zip1 v18.16b, v18.16b, v17.16b\n"
"zip1 v16.16b, v19.16b, v16.16b\n"
"zip1 v17.16b, v18.16b, v16.16b\n"
"zip2 v16.16b, v18.16b, v16.16b\n"
- "str q17, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
+ "str q17, [x20, #0x0]\n"
+ "str q16, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
- "cmp x28, #0x1\n"
+ "cmp x21, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
- "ldr b23, [x9], #0x1\n"
- "ldr b22, [x26], #0x1\n"
- "sub x28, x28, #0x1\n"
- "ldr b21, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "cmp x28, #0x1\n"
- "ldr b19, [x23], #0x1\n"
- "ldr b18, [x22], #0x1\n"
- "ldr b17, [x21], #0x1\n"
- "ldr b16, [x20], #0x1\n"
- "zip1 v19.16b, v23.16b, v19.16b\n"
- "zip1 v18.16b, v22.16b, v18.16b\n"
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "cmp x21, #0x1\n"
+ "ldr b21, [x27], #0x1\n"
+ "ldr b20, [x26], #0x1\n"
+ "ldr b17, [x25], #0x1\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v19.16b, v19.16b, v17.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b17, [x23], #0x1\n"
+ "ldr b16, [x22], #0x1\n"
"zip1 v17.16b, v21.16b, v17.16b\n"
"zip1 v16.16b, v20.16b, v16.16b\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str d16, [x27, #0x0]\n"
- "add x27, x27, #0x8\n"
+ "str d16, [x20, #0x0]\n"
+ "add x20, x20, #0x8\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x80\n"
"bge 1b\n"
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
index 8186b1f475..b1efe81b35 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
@@ -40,215 +40,189 @@ void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t w
__asm__ __volatile__(
"cmp %x[height], #0x8\n"
- "blt 9f\n"
+ "blt 8f\n"
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
- "mov x28, %x[width]\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x9, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "cmp x28, #0x10\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x10\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Column loop
- "ldr q22, [x9], #0x10\n"
- "ldr q21, [x26], #0x10\n"
- "sub x28, x28, #0x10\n"
- "ldr q20, [x25], #0x10\n"
- "ldr q19, [x24], #0x10\n"
- "cmp x28, #0x10\n"
- "ldr q18, [x23], #0x10\n"
- "ldr q17, [x22], #0x10\n"
- "ldr q23, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v0.8h, v22.8h, v21.8h\n"
- "zip2 v31.8h, v22.8h, v21.8h\n"
- "ldr q22, [x9], #0x10\n"
- "ldr q21, [x26], #0x10\n"
- "zip1 v30.8h, v20.8h, v19.8h\n"
- "zip2 v29.8h, v20.8h, v19.8h\n"
- "ldr q20, [x25], #0x10\n"
- "ldr q19, [x24], #0x10\n"
- "zip1 v28.8h, v18.8h, v17.8h\n"
- "zip2 v27.8h, v18.8h, v17.8h\n"
- "ldr q18, [x23], #0x10\n"
- "ldr q17, [x22], #0x10\n"
- "zip1 v26.8h, v23.8h, v16.8h\n"
- "zip2 v25.8h, v23.8h, v16.8h\n"
- "ldr q24, [x21], #0x10\n"
+ "ldr q17, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q19, [x27], #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "zip1 v1.8h, v17.8h, v16.8h\n"
+ "zip2 v0.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v31.8h, v19.8h, v18.8h\n"
+ "zip2 v30.8h, v19.8h, v18.8h\n"
+ "ldr q29, [x22], #0x10\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v28.8h, v17.8h, v16.8h\n"
+ "zip2 v27.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "zip1 v26.8h, v17.8h, v16.8h\n"
+ "zip2 v25.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v24.8h, v17.8h, v16.8h\n"
+ "zip2 v23.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "zip2 v21.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "zip1 v23.8h, v22.8h, v21.8h\n"
- "zip2 v22.8h, v22.8h, v21.8h\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "str q0, [x27, #0x0]\n"
- "zip1 v19.8h, v18.8h, v17.8h\n"
- "zip2 v18.8h, v18.8h, v17.8h\n"
- "str q31, [x27, #0x10]\n"
- "zip1 v17.8h, v24.8h, v16.8h\n"
- "zip2 v16.8h, v24.8h, v16.8h\n"
- "str q23, [x27, #0x20]\n"
- "str q22, [x27, #0x30]\n"
- "str q30, [x27, #0x40]\n"
- "str q29, [x27, #0x50]\n"
- "str q21, [x27, #0x60]\n"
- "str q20, [x27, #0x70]\n"
- "str q28, [x27, #0x80]\n"
- "str q27, [x27, #0x90]\n"
- "str q19, [x27, #0xa0]\n"
- "str q18, [x27, #0xb0]\n"
- "str q26, [x27, #0xc0]\n"
- "str q25, [x27, #0xd0]\n"
- "str q17, [x27, #0xe0]\n"
- "str q16, [x27, #0xf0]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip1 v19.8h, v29.8h, v18.8h\n"
+ "zip2 v18.8h, v29.8h, v18.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q1, [x21, #0x0]\n"
+ "str q0, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q25, [x21, #0x30]\n"
+ "str q31, [x21, #0x40]\n"
+ "str q30, [x21, #0x50]\n"
+ "str q24, [x21, #0x60]\n"
+ "str q23, [x21, #0x70]\n"
+ "str q28, [x21, #0x80]\n"
+ "str q27, [x21, #0x90]\n"
+ "str q22, [x21, #0xa0]\n"
+ "str q21, [x21, #0xb0]\n"
+ "str q19, [x21, #0xc0]\n"
+ "str q18, [x21, #0xd0]\n"
+ "str q17, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Column loop skip
- "cbz x28, 8f\n"
- "cmp x28, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "str q16, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "str q16, [x27, #0x80]\n"
- "str q16, [x27, #0x90]\n"
- "str q16, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
- "str q16, [x27, #0xc0]\n"
- "str q16, [x27, #0xd0]\n"
- "str q16, [x27, #0xe0]\n"
- "str q16, [x27, #0xf0]\n"
+ "cmp x24, #0x4\n"
"blt 5f\n"
"4:" // Main row loop: width 4 loop: loop
- "ldr d23, [x9], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "sub x28, x28, #0x4\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d16, [x24], #0x8\n"
- "cmp x28, #0x4\n"
- "ldr d21, [x23], #0x8\n"
+ "ldr d19, [x9], #0x8\n"
+ "ldr d18, [x28], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v20.8h, v19.8h, v18.8h\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
"ldr d17, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d19, [x20], #0x8\n"
- "zip1 v18.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v16.8h\n"
- "zip1 v17.8h, v21.8h, v17.8h\n"
- "str q18, [x27, #0x0]\n"
- "str q16, [x27, #0x40]\n"
- "zip1 v16.8h, v20.8h, v19.8h\n"
- "str q17, [x27, #0x80]\n"
- "str q16, [x27, #0xc0]\n"
- "add x27, x27, #0x10\n"
+ "ldr d16, [x20], #0x8\n"
+ "str q20, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x80]\n"
+ "str q16, [x21, #0xc0]\n"
+ "add x21, x21, #0x10\n"
"bge 4b\n"
"5:" // Main row loop: width 4 loop: skip
- "cmp x28, #0x1\n"
+ "cmp x24, #0x1\n"
"blt 7f\n"
"6:" // Main row loop: width 1 loop: loop
- "ldr h23, [x9], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "sub x28, x28, #0x1\n"
- "ldr h22, [x25], #0x2\n"
- "ldr h16, [x24], #0x2\n"
- "cmp x28, #0x1\n"
- "ldr h21, [x23], #0x2\n"
+ "ldr h19, [x9], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v20.8h, v19.8h, v18.8h\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x25], #0x2\n"
+ "ldr h16, [x23], #0x2\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
"ldr h17, [x22], #0x2\n"
- "ldr h20, [x21], #0x2\n"
- "ldr h19, [x20], #0x2\n"
- "zip1 v18.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v16.8h\n"
- "zip1 v17.8h, v21.8h, v17.8h\n"
- "str s18, [x27, #0x0]\n"
- "str s16, [x27, #0x40]\n"
- "zip1 v16.8h, v20.8h, v19.8h\n"
- "str s17, [x27, #0x80]\n"
- "str s16, [x27, #0xc0]\n"
- "add x27, x27, #0x4\n"
+ "ldr h16, [x20], #0x2\n"
+ "str s20, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s19, [x21, #0x40]\n"
+ "str s18, [x21, #0x80]\n"
+ "str s16, [x21, #0xc0]\n"
+ "add x21, x21, #0x4\n"
"bge 6b\n"
"7:" // Main row loop: width 1 loop: skip
- "8:" // Main row loop: odd col skip
"cmp %x[height], #0x8\n"
"add %x[out], %x[out], #0x100\n"
"bge 1b\n"
- "cbz %x[height], 18f\n"
- "9:" // Main loop skip
- "10:" // Tail row loop: Head
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+ "9:" // Tail row loop: Head
"mov x9, %x[in]\n"
"mov x20, %x[width]\n"
+ "add x28, x9, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x2\n"
- "add x26, x9, %x[in_stride]\n"
- "add %x[in], x26, %x[in_stride]\n"
- "csel x26, x26, %x[pad_row], GT\n"
+ "add %x[in], x28, %x[in_stride]\n"
+ "csel x28, x28, %x[pad_row], GT\n"
"cmp x20, #0x10\n"
- "blt 12f\n"
- "11:" // Tail row loop: Column loop
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Column loop
"ldr q18, [x9], #0x10\n"
- "ldr q17, [x26], #0x10\n"
+ "ldr q17, [x28], #0x10\n"
"sub x20, x20, #0x10\n"
- "ldr q20, [x9], #0x10\n"
"cmp x20, #0x10\n"
- "ldr q16, [x26], #0x10\n"
+ "ldr q20, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
"zip1 v19.8h, v18.8h, v17.8h\n"
"zip2 v18.8h, v18.8h, v17.8h\n"
"zip1 v17.8h, v20.8h, v16.8h\n"
"zip2 v16.8h, v20.8h, v16.8h\n"
- "str q19, [x27, #0x0]\n"
- "str q18, [x27, #0x10]\n"
- "str q17, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 11b\n"
- "12:" // Tail row loop: Column loop skip
- "cbz x20, 17f\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "blt 14f\n"
- "13:" // Tail row loop: width 4 loop: loop
+ "blt 13f\n"
+ "12:" // Tail row loop: width 4 loop: loop
"ldr d17, [x9], #0x8\n"
- "ldr d16, [x26], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
"zip1 v16.8h, v17.8h, v16.8h\n"
- "str q16, [x27, #0x0]\n"
- "add x27, x27, #0x10\n"
- "bge 13b\n"
- "14:" // Tail row loop: width 4 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 16f\n"
- "15:" // Tail row loop: width 1 loop: loop
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
"ldr h17, [x9], #0x2\n"
- "ldr h16, [x26], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
"zip1 v16.8h, v17.8h, v16.8h\n"
- "str s16, [x27, #0x0]\n"
- "add x27, x27, #0x4\n"
- "bge 15b\n"
- "16:" // Tail row loop: width 1 loop: skip
- "17:" // Tail row loop: odd col skip
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x40\n"
- "bge 10b\n"
- "18:" // Done
+ "bge 9b\n"
+ "16:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
index 1ed8708f4f..02ae1ade30 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
@@ -40,481 +40,451 @@ void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t w
__asm__ __volatile__(
"cmp %x[height], #0x8\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
- "mov x28, %x[width]\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x9, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "cmp x28, #0x20\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x20\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q0, [x9], #0x10\n"
- "ldr q18, [x26], #0x10\n"
- "sub x28, x28, #0x20\n"
- "ldr q22, [x25], #0x10\n"
- "ldr q27, [x24], #0x10\n"
- "cmp x28, #0x20\n"
- "ldr q29, [x23], #0x10\n"
- "ldr q12, [x22], #0x10\n"
- "ldr q17, [x21], #0x10\n"
- "ldr q5, [x20], #0x10\n"
- "ldr q7, [x9], #0x10\n"
- "ldr q16, [x26], #0x10\n"
- "zip1 v26.8h, v0.8h, v22.8h\n"
- "zip1 v3.8h, v18.8h, v27.8h\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q28, [x24], #0x10\n"
- "zip2 v25.8h, v0.8h, v22.8h\n"
- "zip2 v10.8h, v18.8h, v27.8h\n"
- "ldr q1, [x23], #0x10\n"
- "ldr q20, [x22], #0x10\n"
- "zip1 v14.8h, v29.8h, v17.8h\n"
- "zip1 v19.8h, v12.8h, v5.8h\n"
- "ldr q0, [x21], #0x10\n"
- "ldr q13, [x20], #0x10\n"
- "zip2 v22.8h, v29.8h, v17.8h\n"
- "zip2 v15.8h, v12.8h, v5.8h\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q29, [x28], #0x10\n"
+ "sub x24, x24, #0x20\n"
+ "cmp x24, #0x20\n"
+ "ldr q13, [x27], #0x10\n"
+ "ldr q12, [x26], #0x10\n"
+ "zip1 v20.8h, v23.8h, v13.8h\n"
+ "zip1 v28.8h, v29.8h, v12.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q9, [x23], #0x10\n"
+ "zip2 v22.8h, v23.8h, v13.8h\n"
+ "zip2 v1.8h, v29.8h, v12.8h\n"
+ "ldr q27, [x22], #0x10\n"
+ "ldr q3, [x20], #0x10\n"
+ "zip1 v4.8h, v18.8h, v27.8h\n"
+ "zip1 v26.8h, v9.8h, v3.8h\n"
"ldr q17, [x9], #0x10\n"
- "ldr q23, [x26], #0x10\n"
- "zip1 v18.8h, v7.8h, v21.8h\n"
- "zip1 v5.8h, v16.8h, v28.8h\n"
- "ldr q31, [x25], #0x10\n"
- "ldr q12, [x24], #0x10\n"
- "zip2 v24.8h, v7.8h, v21.8h\n"
- "zip2 v28.8h, v16.8h, v28.8h\n"
- "ldr q4, [x23], #0x10\n"
- "ldr q6, [x22], #0x10\n"
- "zip1 v30.8h, v1.8h, v0.8h\n"
- "zip1 v21.8h, v20.8h, v13.8h\n"
- "ldr q7, [x21], #0x10\n"
- "ldr q9, [x20], #0x10\n"
- "zip2 v27.8h, v1.8h, v0.8h\n"
- "zip2 v0.8h, v20.8h, v13.8h\n"
- "ldr q2, [x9], #0x10\n"
- "ldr q8, [x26], #0x10\n"
- "zip1 v29.8h, v17.8h, v31.8h\n"
- "zip1 v16.8h, v23.8h, v12.8h\n"
- "ldr q13, [x25], #0x10\n"
- "ldr q1, [x24], #0x10\n"
- "zip2 v20.8h, v17.8h, v31.8h\n"
- "zip2 v23.8h, v23.8h, v12.8h\n"
- "ldr q17, [x23], #0x10\n"
- "ldr q31, [x22], #0x10\n"
- "zip1 v12.8h, v4.8h, v7.8h\n"
- "zip1 v11.8h, v6.8h, v9.8h\n"
- "zip2 v7.8h, v4.8h, v7.8h\n"
- "ldr q4, [x21], #0x10\n"
- "zip2 v6.8h, v6.8h, v9.8h\n"
- "zip1 v9.8h, v2.8h, v13.8h\n"
- "zip2 v13.8h, v2.8h, v13.8h\n"
- "zip1 v2.8h, v8.8h, v1.8h\n"
- "zip2 v1.8h, v8.8h, v1.8h\n"
- "zip1 v8.8h, v17.8h, v4.8h\n"
- "zip2 v4.8h, v17.8h, v4.8h\n"
- "zip1 v17.8h, v26.8h, v3.8h\n"
- "zip2 v26.8h, v26.8h, v3.8h\n"
+ "ldr q2, [x28], #0x10\n"
+ "zip2 v15.8h, v18.8h, v27.8h\n"
+ "zip2 v12.8h, v9.8h, v3.8h\n"
+ "ldr q23, [x27], #0x10\n"
+ "ldr q14, [x26], #0x10\n"
+ "zip1 v19.8h, v17.8h, v23.8h\n"
+ "zip1 v21.8h, v2.8h, v14.8h\n"
+ "ldr q6, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v27.8h, v17.8h, v23.8h\n"
+ "zip2 v17.8h, v2.8h, v14.8h\n"
+ "ldr q0, [x22], #0x10\n"
"ldr q3, [x20], #0x10\n"
- "str q17, [x27, #0x0]\n"
- "zip1 v17.8h, v25.8h, v10.8h\n"
- "zip2 v25.8h, v25.8h, v10.8h\n"
- "zip1 v10.8h, v18.8h, v5.8h\n"
- "zip2 v5.8h, v18.8h, v5.8h\n"
- "zip1 v18.8h, v31.8h, v3.8h\n"
- "zip2 v3.8h, v31.8h, v3.8h\n"
- "str q26, [x27, #0x10]\n"
- "zip1 v31.8h, v24.8h, v28.8h\n"
- "zip2 v26.8h, v24.8h, v28.8h\n"
- "str q17, [x27, #0x20]\n"
- "zip1 v28.8h, v14.8h, v19.8h\n"
- "zip2 v19.8h, v14.8h, v19.8h\n"
- "str q25, [x27, #0x30]\n"
- "zip1 v24.8h, v22.8h, v15.8h\n"
- "zip2 v15.8h, v22.8h, v15.8h\n"
- "str q10, [x27, #0x40]\n"
- "zip1 v25.8h, v30.8h, v21.8h\n"
- "zip2 v14.8h, v30.8h, v21.8h\n"
- "str q5, [x27, #0x50]\n"
- "zip1 v5.8h, v27.8h, v0.8h\n"
- "zip2 v0.8h, v27.8h, v0.8h\n"
- "str q31, [x27, #0x60]\n"
- "str q26, [x27, #0x70]\n"
- "zip1 v31.8h, v29.8h, v16.8h\n"
- "zip2 v22.8h, v29.8h, v16.8h\n"
- "str q28, [x27, #0x80]\n"
- "zip1 v10.8h, v20.8h, v23.8h\n"
- "zip2 v28.8h, v20.8h, v23.8h\n"
- "str q19, [x27, #0x90]\n"
- "zip1 v27.8h, v9.8h, v2.8h\n"
- "zip2 v20.8h, v9.8h, v2.8h\n"
- "str q24, [x27, #0xa0]\n"
- "zip1 v2.8h, v13.8h, v1.8h\n"
- "zip2 v24.8h, v13.8h, v1.8h\n"
- "str q15, [x27, #0xb0]\n"
- "zip1 v23.8h, v12.8h, v11.8h\n"
- "zip2 v26.8h, v12.8h, v11.8h\n"
- "str q25, [x27, #0xc0]\n"
- "zip1 v21.8h, v7.8h, v6.8h\n"
- "zip2 v29.8h, v7.8h, v6.8h\n"
- "str q14, [x27, #0xd0]\n"
- "zip1 v19.8h, v8.8h, v18.8h\n"
- "zip2 v18.8h, v8.8h, v18.8h\n"
- "str q5, [x27, #0xe0]\n"
- "zip1 v17.8h, v4.8h, v3.8h\n"
- "zip2 v16.8h, v4.8h, v3.8h\n"
- "str q0, [x27, #0xf0]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q31, [x27, #0x0]\n"
- "str q22, [x27, #0x10]\n"
- "str q10, [x27, #0x20]\n"
- "str q28, [x27, #0x30]\n"
- "str q27, [x27, #0x40]\n"
- "str q20, [x27, #0x50]\n"
- "str q2, [x27, #0x60]\n"
- "str q24, [x27, #0x70]\n"
- "str q23, [x27, #0x80]\n"
- "str q26, [x27, #0x90]\n"
- "str q21, [x27, #0xa0]\n"
- "str q29, [x27, #0xb0]\n"
- "str q19, [x27, #0xc0]\n"
- "str q18, [x27, #0xd0]\n"
- "str q17, [x27, #0xe0]\n"
- "str q16, [x27, #0xf0]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip1 v16.8h, v6.8h, v0.8h\n"
+ "zip1 v30.8h, v18.8h, v3.8h\n"
+ "ldr q2, [x9], #0x10\n"
+ "ldr q13, [x28], #0x10\n"
+ "zip2 v31.8h, v6.8h, v0.8h\n"
+ "zip2 v8.8h, v18.8h, v3.8h\n"
+ "ldr q14, [x27], #0x10\n"
+ "ldr q3, [x26], #0x10\n"
+ "zip1 v11.8h, v2.8h, v14.8h\n"
+ "zip1 v29.8h, v13.8h, v3.8h\n"
+ "ldr q25, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v23.8h, v2.8h, v14.8h\n"
+ "zip2 v10.8h, v13.8h, v3.8h\n"
+ "ldr q7, [x22], #0x10\n"
+ "ldr q6, [x20], #0x10\n"
+ "zip1 v14.8h, v25.8h, v7.8h\n"
+ "zip1 v13.8h, v18.8h, v6.8h\n"
+ "ldr q2, [x9], #0x10\n"
+ "ldr q5, [x28], #0x10\n"
+ "zip2 v9.8h, v25.8h, v7.8h\n"
+ "zip2 v7.8h, v18.8h, v6.8h\n"
+ "ldr q6, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v25.8h, v2.8h, v6.8h\n"
+ "zip1 v3.8h, v5.8h, v24.8h\n"
+ "ldr q0, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v2.8h, v2.8h, v6.8h\n"
+ "zip2 v24.8h, v5.8h, v24.8h\n"
+ "ldr q5, [x22], #0x10\n"
+ "zip1 v6.8h, v0.8h, v5.8h\n"
+ "zip2 v5.8h, v0.8h, v5.8h\n"
+ "zip1 v0.8h, v20.8h, v28.8h\n"
+ "zip2 v28.8h, v20.8h, v28.8h\n"
+ "ldr q20, [x20], #0x10\n"
+ "str q0, [x21, #0x0]\n"
+ "zip1 v0.8h, v18.8h, v20.8h\n"
+ "zip2 v20.8h, v18.8h, v20.8h\n"
+ "str q28, [x21, #0x10]\n"
+ "zip1 v18.8h, v22.8h, v1.8h\n"
+ "zip2 v28.8h, v22.8h, v1.8h\n"
+ "str q18, [x21, #0x20]\n"
+ "zip1 v22.8h, v19.8h, v21.8h\n"
+ "zip2 v19.8h, v19.8h, v21.8h\n"
+ "str q28, [x21, #0x30]\n"
+ "zip1 v18.8h, v27.8h, v17.8h\n"
+ "zip2 v17.8h, v27.8h, v17.8h\n"
+ "str q22, [x21, #0x40]\n"
+ "zip1 v27.8h, v4.8h, v26.8h\n"
+ "zip2 v26.8h, v4.8h, v26.8h\n"
+ "str q19, [x21, #0x50]\n"
+ "zip1 v22.8h, v15.8h, v12.8h\n"
+ "zip2 v21.8h, v15.8h, v12.8h\n"
+ "str q18, [x21, #0x60]\n"
+ "zip1 v19.8h, v16.8h, v30.8h\n"
+ "zip2 v18.8h, v16.8h, v30.8h\n"
+ "str q17, [x21, #0x70]\n"
+ "zip1 v17.8h, v31.8h, v8.8h\n"
+ "zip2 v16.8h, v31.8h, v8.8h\n"
+ "str q27, [x21, #0x80]\n"
+ "str q26, [x21, #0x90]\n"
+ "zip1 v31.8h, v11.8h, v29.8h\n"
+ "zip2 v30.8h, v11.8h, v29.8h\n"
+ "str q22, [x21, #0xa0]\n"
+ "zip1 v29.8h, v23.8h, v10.8h\n"
+ "zip2 v28.8h, v23.8h, v10.8h\n"
+ "str q21, [x21, #0xb0]\n"
+ "zip1 v27.8h, v25.8h, v3.8h\n"
+ "zip2 v26.8h, v25.8h, v3.8h\n"
+ "str q19, [x21, #0xc0]\n"
+ "zip1 v25.8h, v2.8h, v24.8h\n"
+ "zip2 v24.8h, v2.8h, v24.8h\n"
+ "str q18, [x21, #0xd0]\n"
+ "zip1 v23.8h, v14.8h, v13.8h\n"
+ "zip2 v22.8h, v14.8h, v13.8h\n"
+ "str q17, [x21, #0xe0]\n"
+ "zip1 v21.8h, v9.8h, v7.8h\n"
+ "zip2 v19.8h, v9.8h, v7.8h\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v2.8h, v6.8h, v0.8h\n"
+ "zip2 v18.8h, v6.8h, v0.8h\n"
+ "zip1 v17.8h, v5.8h, v20.8h\n"
+ "zip2 v16.8h, v5.8h, v20.8h\n"
+ "str q31, [x21, #0x0]\n"
+ "str q30, [x21, #0x10]\n"
+ "str q29, [x21, #0x20]\n"
+ "str q28, [x21, #0x30]\n"
+ "str q27, [x21, #0x40]\n"
+ "str q26, [x21, #0x50]\n"
+ "str q25, [x21, #0x60]\n"
+ "str q24, [x21, #0x70]\n"
+ "str q23, [x21, #0x80]\n"
+ "str q22, [x21, #0x90]\n"
+ "str q21, [x21, #0xa0]\n"
+ "str q19, [x21, #0xb0]\n"
+ "str q2, [x21, #0xc0]\n"
+ "str q18, [x21, #0xd0]\n"
+ "str q17, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cmp x28, #0x10\n"
+ "cmp x24, #0x10\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr q19, [x9], #0x10\n"
- "ldr q18, [x26], #0x10\n"
- "sub x28, x28, #0x10\n"
- "ldr q17, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "cmp x28, #0x10\n"
- "ldr q31, [x23], #0x10\n"
- "ldr q30, [x22], #0x10\n"
- "ldr q23, [x21], #0x10\n"
- "ldr q22, [x20], #0x10\n"
- "ldr q29, [x9], #0x10\n"
- "ldr q28, [x26], #0x10\n"
- "zip1 v27.8h, v19.8h, v17.8h\n"
- "zip1 v26.8h, v18.8h, v16.8h\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "zip2 v25.8h, v19.8h, v17.8h\n"
- "zip2 v24.8h, v18.8h, v16.8h\n"
- "ldr q19, [x23], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v3.8h, v31.8h, v23.8h\n"
- "zip1 v2.8h, v30.8h, v22.8h\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v3.8h, v21.8h, v17.8h\n"
+ "zip1 v2.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v1.8h, v21.8h, v17.8h\n"
+ "zip2 v24.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "zip2 v1.8h, v31.8h, v23.8h\n"
- "zip2 v0.8h, v30.8h, v22.8h\n"
- "zip1 v23.8h, v29.8h, v21.8h\n"
- "zip1 v22.8h, v28.8h, v20.8h\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
- "zip2 v20.8h, v28.8h, v20.8h\n"
- "zip1 v31.8h, v19.8h, v17.8h\n"
- "zip1 v30.8h, v18.8h, v16.8h\n"
- "zip2 v29.8h, v19.8h, v17.8h\n"
- "zip2 v28.8h, v18.8h, v16.8h\n"
- "zip1 v19.8h, v27.8h, v26.8h\n"
- "zip2 v18.8h, v27.8h, v26.8h\n"
- "zip1 v17.8h, v25.8h, v24.8h\n"
- "zip2 v16.8h, v25.8h, v24.8h\n"
- "zip1 v27.8h, v23.8h, v22.8h\n"
- "zip2 v26.8h, v23.8h, v22.8h\n"
- "zip1 v25.8h, v21.8h, v20.8h\n"
+ "zip1 v0.8h, v19.8h, v17.8h\n"
+ "zip1 v31.8h, v18.8h, v16.8h\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v30.8h, v19.8h, v17.8h\n"
+ "zip2 v29.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v23.8h, v21.8h, v17.8h\n"
+ "zip1 v22.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v21.8h, v21.8h, v17.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v28.8h, v19.8h, v17.8h\n"
+ "zip1 v27.8h, v18.8h, v16.8h\n"
+ "zip2 v26.8h, v19.8h, v17.8h\n"
+ "zip2 v25.8h, v18.8h, v16.8h\n"
+ "zip1 v16.8h, v3.8h, v2.8h\n"
+ "zip2 v17.8h, v3.8h, v2.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.8h, v1.8h, v24.8h\n"
+ "zip2 v19.8h, v1.8h, v24.8h\n"
+ "str q17, [x21, #0x10]\n"
+ "zip1 v18.8h, v23.8h, v22.8h\n"
+ "zip2 v17.8h, v23.8h, v22.8h\n"
+ "str q16, [x21, #0x20]\n"
+ "zip1 v16.8h, v21.8h, v20.8h\n"
"zip2 v24.8h, v21.8h, v20.8h\n"
- "str q19, [x27, #0x0]\n"
- "zip1 v23.8h, v3.8h, v2.8h\n"
- "zip2 v22.8h, v3.8h, v2.8h\n"
- "str q18, [x27, #0x10]\n"
- "zip1 v21.8h, v1.8h, v0.8h\n"
- "zip2 v20.8h, v1.8h, v0.8h\n"
- "str q17, [x27, #0x20]\n"
- "zip1 v19.8h, v31.8h, v30.8h\n"
- "zip2 v18.8h, v31.8h, v30.8h\n"
- "str q16, [x27, #0x30]\n"
- "zip1 v17.8h, v29.8h, v28.8h\n"
- "zip2 v16.8h, v29.8h, v28.8h\n"
- "str q27, [x27, #0x40]\n"
- "str q26, [x27, #0x50]\n"
- "str q25, [x27, #0x60]\n"
- "str q24, [x27, #0x70]\n"
- "str q23, [x27, #0x80]\n"
- "str q22, [x27, #0x90]\n"
- "str q21, [x27, #0xa0]\n"
- "str q20, [x27, #0xb0]\n"
- "str q19, [x27, #0xc0]\n"
- "str q18, [x27, #0xd0]\n"
- "str q17, [x27, #0xe0]\n"
- "str q16, [x27, #0xf0]\n"
- "add x27, x27, %x[out_stride]\n"
+ "str q19, [x21, #0x30]\n"
+ "zip1 v23.8h, v0.8h, v31.8h\n"
+ "zip2 v22.8h, v0.8h, v31.8h\n"
+ "str q18, [x21, #0x40]\n"
+ "zip1 v21.8h, v30.8h, v29.8h\n"
+ "zip2 v20.8h, v30.8h, v29.8h\n"
+ "str q17, [x21, #0x50]\n"
+ "zip1 v19.8h, v28.8h, v27.8h\n"
+ "zip2 v18.8h, v28.8h, v27.8h\n"
+ "str q16, [x21, #0x60]\n"
+ "zip1 v17.8h, v26.8h, v25.8h\n"
+ "zip2 v16.8h, v26.8h, v25.8h\n"
+ "str q24, [x21, #0x70]\n"
+ "str q23, [x21, #0x80]\n"
+ "str q22, [x21, #0x90]\n"
+ "str q21, [x21, #0xa0]\n"
+ "str q20, [x21, #0xb0]\n"
+ "str q19, [x21, #0xc0]\n"
+ "str q18, [x21, #0xd0]\n"
+ "str q17, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x28, 10f\n"
- "cmp x28, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "str q16, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "str q16, [x27, #0x80]\n"
- "str q16, [x27, #0x90]\n"
- "str q16, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
- "str q16, [x27, #0xc0]\n"
- "str q16, [x27, #0xd0]\n"
- "str q16, [x27, #0xe0]\n"
- "str q16, [x27, #0xf0]\n"
+ "cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
- "ldr d23, [x9], #0x8\n"
- "ldr d22, [x26], #0x8\n"
- "sub x28, x28, #0x4\n"
+ "ldr d19, [x9], #0x8\n"
+ "ldr d18, [x28], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v17.8h, v19.8h, v17.8h\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
"ldr d18, [x25], #0x8\n"
- "ldr d17, [x24], #0x8\n"
- "cmp x28, #0x4\n"
- "ldr d20, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "zip2 v19.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x22], #0x8\n"
"ldr d16, [x20], #0x8\n"
- "zip1 v18.8h, v23.8h, v18.8h\n"
- "zip1 v17.8h, v22.8h, v17.8h\n"
- "zip1 v20.8h, v20.8h, v19.8h\n"
+ "zip1 v18.8h, v18.8h, v17.8h\n"
"zip1 v16.8h, v21.8h, v16.8h\n"
- "zip1 v19.8h, v18.8h, v17.8h\n"
- "zip2 v18.8h, v18.8h, v17.8h\n"
- "zip1 v17.8h, v20.8h, v16.8h\n"
- "zip2 v16.8h, v20.8h, v16.8h\n"
- "str q19, [x27, #0x0]\n"
- "str q18, [x27, #0x10]\n"
- "str q17, [x27, #0x80]\n"
- "str q16, [x27, #0x90]\n"
- "add x27, x27, #0x20\n"
+ "str q20, [x21, #0x0]\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q19, [x21, #0x10]\n"
+ "str q17, [x21, #0x80]\n"
+ "str q16, [x21, #0x90]\n"
+ "add x21, x21, #0x20\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
- "cmp x28, #0x1\n"
+ "cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
- "ldr h23, [x9], #0x2\n"
- "ldr h22, [x26], #0x2\n"
- "sub x28, x28, #0x1\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h17, [x24], #0x2\n"
- "cmp x28, #0x1\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h20, [x22], #0x2\n"
- "ldr h18, [x21], #0x2\n"
- "ldr h16, [x20], #0x2\n"
- "zip1 v19.8h, v23.8h, v19.8h\n"
- "zip1 v17.8h, v22.8h, v17.8h\n"
- "zip1 v18.8h, v21.8h, v18.8h\n"
- "zip1 v16.8h, v20.8h, v16.8h\n"
+ "ldr h19, [x9], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
"zip1 v17.8h, v19.8h, v17.8h\n"
"zip1 v16.8h, v18.8h, v16.8h\n"
- "str d17, [x27, #0x0]\n"
- "str d16, [x27, #0x80]\n"
- "add x27, x27, #0x8\n"
+ "ldr h20, [x25], #0x2\n"
+ "ldr h19, [x23], #0x2\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "zip1 v17.8h, v20.8h, v17.8h\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "str d18, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str d16, [x21, #0x80]\n"
+ "add x21, x21, #0x8\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x8\n"
"add %x[out], %x[out], #0x100\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
"mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x27, %x[out]\n"
- "add x26, x9, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "csel x25, x25, %x[pad_row], GE\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add %x[in], x26, %x[in_stride]\n"
"csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
"cmp x20, #0x20\n"
- "blt 14f\n"
- "13:" // Tail row loop: Unroll column loop
- "ldr q20, [x9], #0x10\n"
- "ldr q19, [x26], #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
"sub x20, x20, #0x20\n"
- "ldr q18, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0x20\n"
- "ldr q23, [x9], #0x10\n"
- "ldr q22, [x26], #0x10\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "ldr q29, [x9], #0x10\n"
- "zip1 v4.8h, v20.8h, v18.8h\n"
- "zip1 v3.8h, v19.8h, v17.8h\n"
- "ldr q28, [x26], #0x10\n"
- "ldr q27, [x25], #0x10\n"
- "zip2 v2.8h, v20.8h, v18.8h\n"
- "zip2 v1.8h, v19.8h, v17.8h\n"
- "ldr q20, [x24], #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v4.8h, v21.8h, v17.8h\n"
+ "zip1 v3.8h, v20.8h, v16.8h\n"
"ldr q19, [x9], #0x10\n"
- "zip1 v26.8h, v23.8h, v21.8h\n"
- "zip1 v25.8h, v22.8h, v16.8h\n"
- "ldr q18, [x26], #0x10\n"
- "ldr q17, [x25], #0x10\n"
- "zip2 v24.8h, v23.8h, v21.8h\n"
- "zip2 v23.8h, v22.8h, v16.8h\n"
- "ldr q16, [x24], #0x10\n"
- "zip1 v22.8h, v29.8h, v27.8h\n"
- "zip1 v21.8h, v28.8h, v20.8h\n"
- "zip2 v0.8h, v29.8h, v27.8h\n"
- "zip2 v31.8h, v28.8h, v20.8h\n"
- "zip1 v30.8h, v19.8h, v17.8h\n"
- "zip1 v29.8h, v18.8h, v16.8h\n"
- "zip2 v28.8h, v19.8h, v17.8h\n"
- "zip2 v27.8h, v18.8h, v16.8h\n"
- "zip1 v20.8h, v4.8h, v3.8h\n"
- "zip2 v19.8h, v4.8h, v3.8h\n"
- "zip1 v18.8h, v2.8h, v1.8h\n"
- "zip2 v17.8h, v2.8h, v1.8h\n"
- "zip1 v16.8h, v26.8h, v25.8h\n"
- "zip2 v26.8h, v26.8h, v25.8h\n"
- "zip1 v25.8h, v24.8h, v23.8h\n"
- "zip2 v24.8h, v24.8h, v23.8h\n"
- "str q20, [x27, #0x0]\n"
- "str q19, [x27, #0x10]\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v2.8h, v21.8h, v17.8h\n"
+ "zip2 v1.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v0.8h, v19.8h, v17.8h\n"
+ "zip1 v31.8h, v18.8h, v16.8h\n"
+ "ldr q24, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v30.8h, v19.8h, v17.8h\n"
+ "zip2 v23.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.8h, v24.8h, v17.8h\n"
+ "zip1 v21.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v29.8h, v24.8h, v17.8h\n"
+ "zip2 v28.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v27.8h, v19.8h, v17.8h\n"
+ "zip1 v26.8h, v18.8h, v16.8h\n"
+ "zip2 v25.8h, v19.8h, v17.8h\n"
+ "zip2 v24.8h, v18.8h, v16.8h\n"
+ "zip1 v16.8h, v4.8h, v3.8h\n"
+ "zip2 v17.8h, v4.8h, v3.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.8h, v2.8h, v1.8h\n"
+ "zip2 v20.8h, v2.8h, v1.8h\n"
+ "str q17, [x21, #0x10]\n"
+ "zip1 v19.8h, v0.8h, v31.8h\n"
+ "zip2 v18.8h, v0.8h, v31.8h\n"
+ "str q16, [x21, #0x20]\n"
+ "zip1 v17.8h, v30.8h, v23.8h\n"
+ "zip2 v16.8h, v30.8h, v23.8h\n"
+ "str q20, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
"zip1 v23.8h, v22.8h, v21.8h\n"
"zip2 v22.8h, v22.8h, v21.8h\n"
- "str q18, [x27, #0x20]\n"
- "zip1 v21.8h, v0.8h, v31.8h\n"
- "zip2 v20.8h, v0.8h, v31.8h\n"
- "str q17, [x27, #0x30]\n"
- "zip1 v19.8h, v30.8h, v29.8h\n"
- "zip2 v18.8h, v30.8h, v29.8h\n"
- "str q16, [x27, #0x40]\n"
- "zip1 v17.8h, v28.8h, v27.8h\n"
- "zip2 v16.8h, v28.8h, v27.8h\n"
- "str q26, [x27, #0x50]\n"
- "str q25, [x27, #0x60]\n"
- "str q24, [x27, #0x70]\n"
- "add x27, x27, %x[out_stride]\n"
- "str q23, [x27, #0x0]\n"
- "str q22, [x27, #0x10]\n"
- "str q21, [x27, #0x20]\n"
- "str q20, [x27, #0x30]\n"
- "str q19, [x27, #0x40]\n"
- "str q18, [x27, #0x50]\n"
- "str q17, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Unroll column loop skip
+ "str q18, [x21, #0x50]\n"
+ "zip1 v21.8h, v29.8h, v28.8h\n"
+ "zip2 v20.8h, v29.8h, v28.8h\n"
+ "str q17, [x21, #0x60]\n"
+ "zip1 v19.8h, v27.8h, v26.8h\n"
+ "zip2 v18.8h, v27.8h, v26.8h\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v17.8h, v25.8h, v24.8h\n"
+ "zip2 v16.8h, v25.8h, v24.8h\n"
+ "str q23, [x21, #0x0]\n"
+ "str q22, [x21, #0x10]\n"
+ "str q21, [x21, #0x20]\n"
+ "str q20, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
"cmp x20, #0x10\n"
- "blt 16f\n"
- "15:" // Tail row loop: Column loop
- "ldr q20, [x9], #0x10\n"
- "ldr q19, [x26], #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
"sub x20, x20, #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0x10\n"
- "ldr q24, [x9], #0x10\n"
- "ldr q25, [x26], #0x10\n"
- "ldr q23, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "zip1 v22.8h, v20.8h, v18.8h\n"
- "zip1 v21.8h, v19.8h, v17.8h\n"
- "zip2 v20.8h, v20.8h, v18.8h\n"
- "zip2 v19.8h, v19.8h, v17.8h\n"
- "zip1 v18.8h, v24.8h, v23.8h\n"
- "zip1 v17.8h, v25.8h, v16.8h\n"
- "zip2 v24.8h, v24.8h, v23.8h\n"
- "zip2 v16.8h, v25.8h, v16.8h\n"
- "zip1 v23.8h, v22.8h, v21.8h\n"
- "zip2 v22.8h, v22.8h, v21.8h\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v25.8h, v19.8h, v17.8h\n"
+ "zip1 v24.8h, v18.8h, v16.8h\n"
+ "ldr q22, [x9], #0x10\n"
+ "ldr q21, [x28], #0x10\n"
+ "zip2 v20.8h, v19.8h, v17.8h\n"
+ "zip2 v19.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v23.8h, v22.8h, v17.8h\n"
+ "zip1 v18.8h, v21.8h, v16.8h\n"
+ "zip2 v22.8h, v22.8h, v17.8h\n"
+ "zip2 v21.8h, v21.8h, v16.8h\n"
+ "zip1 v16.8h, v25.8h, v24.8h\n"
+ "zip2 v17.8h, v25.8h, v24.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.8h, v20.8h, v19.8h\n"
"zip2 v20.8h, v20.8h, v19.8h\n"
- "zip1 v19.8h, v18.8h, v17.8h\n"
- "zip2 v18.8h, v18.8h, v17.8h\n"
- "zip1 v17.8h, v24.8h, v16.8h\n"
- "zip2 v16.8h, v24.8h, v16.8h\n"
- "str q23, [x27, #0x0]\n"
- "str q22, [x27, #0x10]\n"
- "str q21, [x27, #0x20]\n"
- "str q20, [x27, #0x30]\n"
- "str q19, [x27, #0x40]\n"
- "str q18, [x27, #0x50]\n"
- "str q17, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q17, [x21, #0x10]\n"
+ "zip1 v19.8h, v23.8h, v18.8h\n"
+ "zip2 v18.8h, v23.8h, v18.8h\n"
+ "str q16, [x21, #0x20]\n"
+ "zip1 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v22.8h, v21.8h\n"
+ "str q20, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "str q16, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr d18, [x9], #0x8\n"
- "ldr d19, [x26], #0x8\n"
+ "ldr d19, [x28], #0x8\n"
"sub x20, x20, #0x4\n"
- "ldr d17, [x25], #0x8\n"
- "ldr d16, [x24], #0x8\n"
"cmp x20, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
"zip1 v18.8h, v18.8h, v17.8h\n"
"zip1 v16.8h, v19.8h, v16.8h\n"
"zip1 v17.8h, v18.8h, v16.8h\n"
"zip2 v16.8h, v18.8h, v16.8h\n"
- "str q17, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str q17, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr h19, [x9], #0x2\n"
- "ldr h18, [x26], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
"sub x20, x20, #0x1\n"
- "ldr h17, [x25], #0x2\n"
- "ldr h16, [x24], #0x2\n"
"cmp x20, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
"zip1 v17.8h, v19.8h, v17.8h\n"
"zip1 v16.8h, v18.8h, v16.8h\n"
"zip1 v16.8h, v17.8h, v16.8h\n"
- "str d16, [x27, #0x0]\n"
- "add x27, x27, #0x8\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x80\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
index a2f64768da..435398da0b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
@@ -40,313 +40,294 @@ void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, si
__asm__ __volatile__(
"cmp %x[height], #0x8\n"
- "blt 9f\n"
+ "blt 8f\n"
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
- "mov x28, %x[width]\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x9, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "cmp x28, #0x10\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x10\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Column loop
- "ldr q2, [x9], #0x10\n"
- "ldr q3, [x26], #0x10\n"
- "sub x28, x28, #0x10\n"
- "ldr q17, [x25], #0x10\n"
- "ldr q4, [x24], #0x10\n"
- "cmp x28, #0x10\n"
- "ldr q16, [x23], #0x10\n"
+ "ldr q13, [x9], #0x10\n"
+ "ldr q12, [x28], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q1, [x27], #0x10\n"
+ "ldr q9, [x26], #0x10\n"
+ "zip1 v19.4s, v13.4s, v1.4s\n"
+ "zip1 v14.4s, v12.4s, v9.4s\n"
+ "ldr q15, [x25], #0x10\n"
+ "ldr q4, [x23], #0x10\n"
+ "zip2 v8.4s, v13.4s, v1.4s\n"
+ "zip2 v28.4s, v12.4s, v9.4s\n"
+ "ldr q0, [x22], #0x10\n"
+ "ldr q1, [x20], #0x10\n"
+ "zip1 v16.4s, v15.4s, v0.4s\n"
+ "zip1 v5.4s, v4.4s, v1.4s\n"
+ "ldr q25, [x9], #0x10\n"
+ "ldr q24, [x28], #0x10\n"
+ "zip2 v3.4s, v15.4s, v0.4s\n"
+ "zip2 v2.4s, v4.4s, v1.4s\n"
+ "ldr q21, [x27], #0x10\n"
+ "ldr q30, [x26], #0x10\n"
+ "zip1 v18.4s, v25.4s, v21.4s\n"
+ "zip1 v27.4s, v24.4s, v30.4s\n"
+ "ldr q22, [x25], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip2 v9.4s, v25.4s, v21.4s\n"
+ "zip2 v10.4s, v24.4s, v30.4s\n"
"ldr q1, [x22], #0x10\n"
- "ldr q5, [x21], #0x10\n"
- "ldr q15, [x20], #0x10\n"
- "ldr q19, [x9], #0x10\n"
- "ldr q11, [x26], #0x10\n"
- "zip1 v14.4s, v2.4s, v17.4s\n"
- "zip1 v27.4s, v3.4s, v4.4s\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q18, [x24], #0x10\n"
- "zip2 v12.4s, v2.4s, v17.4s\n"
- "zip2 v7.4s, v3.4s, v4.4s\n"
- "ldr q22, [x23], #0x10\n"
- "ldr q9, [x22], #0x10\n"
- "zip1 v17.4s, v16.4s, v5.4s\n"
- "zip1 v29.4s, v1.4s, v15.4s\n"
- "ldr q30, [x21], #0x10\n"
- "ldr q8, [x20], #0x10\n"
- "zip2 v28.4s, v16.4s, v5.4s\n"
- "zip2 v26.4s, v1.4s, v15.4s\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v25.4s, v22.4s, v1.4s\n"
+ "zip1 v7.4s, v20.4s, v21.4s\n"
"ldr q31, [x9], #0x10\n"
- "ldr q13, [x26], #0x10\n"
- "zip1 v20.4s, v19.4s, v21.4s\n"
- "zip1 v10.4s, v11.4s, v18.4s\n"
- "ldr q2, [x25], #0x10\n"
- "ldr q23, [x24], #0x10\n"
- "zip2 v4.4s, v19.4s, v21.4s\n"
- "zip2 v6.4s, v11.4s, v18.4s\n"
- "ldr q21, [x23], #0x10\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v30.4s, v22.4s, v1.4s\n"
+ "zip2 v20.4s, v20.4s, v21.4s\n"
+ "ldr q15, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v6.4s, v31.4s, v15.4s\n"
+ "zip1 v4.4s, v17.4s, v24.4s\n"
+ "ldr q12, [x25], #0x10\n"
+ "ldr q29, [x23], #0x10\n"
+ "zip2 v22.4s, v31.4s, v15.4s\n"
+ "zip2 v26.4s, v17.4s, v24.4s\n"
+ "ldr q0, [x22], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip1 v17.4s, v12.4s, v0.4s\n"
+ "zip1 v31.4s, v29.4s, v24.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q1, [x28], #0x10\n"
+ "zip2 v23.4s, v12.4s, v0.4s\n"
+ "zip2 v24.4s, v29.4s, v24.4s\n"
+ "ldr q11, [x27], #0x10\n"
+ "ldr q29, [x26], #0x10\n"
+ "zip1 v0.4s, v21.4s, v11.4s\n"
+ "zip1 v13.4s, v1.4s, v29.4s\n"
+ "ldr q15, [x25], #0x10\n"
+ "ldr q12, [x23], #0x10\n"
+ "zip2 v21.4s, v21.4s, v11.4s\n"
+ "zip2 v29.4s, v1.4s, v29.4s\n"
"ldr q1, [x22], #0x10\n"
- "zip1 v24.4s, v22.4s, v30.4s\n"
- "zip1 v16.4s, v9.4s, v8.4s\n"
- "ldr q18, [x21], #0x10\n"
- "ldr q15, [x20], #0x10\n"
- "zip2 v22.4s, v22.4s, v30.4s\n"
- "zip2 v25.4s, v9.4s, v8.4s\n"
- "ldr q3, [x9], #0x10\n"
- "ldr q5, [x26], #0x10\n"
- "zip1 v8.4s, v31.4s, v2.4s\n"
- "zip1 v30.4s, v13.4s, v23.4s\n"
- "ldr q0, [x25], #0x10\n"
- "ldr q19, [x24], #0x10\n"
- "zip2 v31.4s, v31.4s, v2.4s\n"
- "zip2 v2.4s, v13.4s, v23.4s\n"
- "ldr q11, [x23], #0x10\n"
- "ldr q9, [x22], #0x10\n"
- "zip1 v13.4s, v21.4s, v18.4s\n"
- "zip1 v23.4s, v1.4s, v15.4s\n"
- "zip2 v21.4s, v21.4s, v18.4s\n"
- "ldr q18, [x21], #0x10\n"
- "zip2 v15.4s, v1.4s, v15.4s\n"
- "zip1 v1.4s, v3.4s, v0.4s\n"
- "zip2 v0.4s, v3.4s, v0.4s\n"
- "zip1 v3.4s, v5.4s, v19.4s\n"
- "zip2 v5.4s, v5.4s, v19.4s\n"
- "zip1 v19.4s, v11.4s, v18.4s\n"
- "zip2 v18.4s, v11.4s, v18.4s\n"
- "zip1 v11.4s, v14.4s, v27.4s\n"
- "zip2 v14.4s, v14.4s, v27.4s\n"
- "ldr q27, [x20], #0x10\n"
- ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
- ".inst 0x4ea169cb // bfcvtn2 v11.8h, v14.4s\n"
- "zip1 v14.4s, v12.4s, v7.4s\n"
- "zip2 v7.4s, v12.4s, v7.4s\n"
- "zip1 v12.4s, v9.4s, v27.4s\n"
- "zip2 v27.4s, v9.4s, v27.4s\n"
- "zip1 v9.4s, v20.4s, v10.4s\n"
- "zip2 v20.4s, v20.4s, v10.4s\n"
- "str q11, [x27, #0x0]\n"
- "zip1 v10.4s, v4.4s, v6.4s\n"
- "zip1 v11.4s, v8.4s, v30.4s\n"
- ".inst 0x0ea169ce // bfcvtn v14.4h, v14.4s\n"
- "zip2 v4.4s, v4.4s, v6.4s\n"
- "zip1 v6.4s, v31.4s, v2.4s\n"
- ".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
- ".inst 0x0ea1694a // bfcvtn v10.4h, v10.4s\n"
- ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
- "zip2 v30.4s, v8.4s, v30.4s\n"
- "zip1 v8.4s, v1.4s, v3.4s\n"
- ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
- "zip2 v31.4s, v31.4s, v2.4s\n"
- "zip1 v2.4s, v0.4s, v5.4s\n"
- "zip2 v3.4s, v1.4s, v3.4s\n"
- "zip1 v1.4s, v17.4s, v29.4s\n"
+ "zip1 v11.4s, v15.4s, v1.4s\n"
+ "zip2 v1.4s, v15.4s, v1.4s\n"
+ "zip1 v15.4s, v19.4s, v14.4s\n"
+ ".inst 0x0ea169ef // bfcvtn v15.4h, v15.4s\n"
+ "zip2 v14.4s, v19.4s, v14.4s\n"
+ "ldr q19, [x20], #0x10\n"
+ ".inst 0x4ea169cf // bfcvtn2 v15.8h, v14.4s\n"
+ "str q15, [x21, #0x0]\n"
+ "zip1 v14.4s, v12.4s, v19.4s\n"
+ "zip2 v15.4s, v12.4s, v19.4s\n"
+ "zip1 v12.4s, v8.4s, v28.4s\n"
+ "zip1 v19.4s, v18.4s, v27.4s\n"
+ ".inst 0x0ea1698c // bfcvtn v12.4h, v12.4s\n"
+ "zip2 v28.4s, v8.4s, v28.4s\n"
+ "zip1 v8.4s, v9.4s, v10.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ "zip2 v18.4s, v18.4s, v27.4s\n"
+ "zip1 v27.4s, v6.4s, v4.4s\n"
".inst 0x0ea16908 // bfcvtn v8.4h, v8.4s\n"
- "zip2 v5.4s, v0.4s, v5.4s\n"
- "zip1 v0.4s, v28.4s, v26.4s\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "zip2 v17.4s, v17.4s, v29.4s\n"
- "zip1 v29.4s, v24.4s, v16.4s\n"
- ".inst 0x0ea16821 // bfcvtn v1.4h, v1.4s\n"
+ "zip2 v10.4s, v9.4s, v10.4s\n"
+ "zip1 v9.4s, v22.4s, v26.4s\n"
+ ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n"
+ "zip2 v6.4s, v6.4s, v4.4s\n"
+ "zip1 v4.4s, v0.4s, v13.4s\n"
+ ".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
+ "zip2 v22.4s, v22.4s, v26.4s\n"
+ "zip1 v26.4s, v21.4s, v29.4s\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "zip2 v13.4s, v0.4s, v13.4s\n"
+ "zip1 v0.4s, v16.4s, v5.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "zip2 v21.4s, v21.4s, v29.4s\n"
+ "zip1 v29.4s, v3.4s, v2.4s\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- "zip2 v28.4s, v28.4s, v26.4s\n"
- "zip1 v26.4s, v22.4s, v25.4s\n"
- "zip2 v24.4s, v24.4s, v16.4s\n"
- "zip1 v16.4s, v13.4s, v23.4s\n"
+ "zip2 v5.4s, v16.4s, v5.4s\n"
+ "zip1 v16.4s, v25.4s, v7.4s\n"
".inst 0x0ea16bbd // bfcvtn v29.4h, v29.4s\n"
- "zip2 v25.4s, v22.4s, v25.4s\n"
- "zip1 v22.4s, v21.4s, v15.4s\n"
- ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
- "zip2 v13.4s, v13.4s, v23.4s\n"
- "zip1 v23.4s, v19.4s, v12.4s\n"
+ "zip2 v2.4s, v3.4s, v2.4s\n"
+ "zip1 v3.4s, v30.4s, v20.4s\n"
".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
- "zip2 v21.4s, v21.4s, v15.4s\n"
- "zip1 v15.4s, v18.4s, v27.4s\n"
- "zip2 v19.4s, v19.4s, v12.4s\n"
+ "zip2 v7.4s, v25.4s, v7.4s\n"
+ "zip1 v25.4s, v17.4s, v31.4s\n"
+ ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
+ "zip2 v30.4s, v30.4s, v20.4s\n"
+ "zip1 v20.4s, v23.4s, v24.4s\n"
+ ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+ "zip2 v17.4s, v17.4s, v31.4s\n"
+ "zip1 v31.4s, v11.4s, v14.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ "zip2 v24.4s, v23.4s, v24.4s\n"
+ "zip1 v23.4s, v1.4s, v15.4s\n"
+ ".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n"
+ "zip2 v14.4s, v11.4s, v14.4s\n"
".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
- "zip2 v12.4s, v18.4s, v27.4s\n"
- ".inst 0x4ea168ee // bfcvtn2 v14.8h, v7.4s\n"
- ".inst 0x4ea16a89 // bfcvtn2 v9.8h, v20.4s\n"
- ".inst 0x0ea169f2 // bfcvtn v18.4h, v15.4s\n"
- ".inst 0x4ea1688a // bfcvtn2 v10.8h, v4.4s\n"
- ".inst 0x4ea16bcb // bfcvtn2 v11.8h, v30.4s\n"
- ".inst 0x4ea16be6 // bfcvtn2 v6.8h, v31.4s\n"
- ".inst 0x4ea16868 // bfcvtn2 v8.8h, v3.4s\n"
- ".inst 0x4ea168a2 // bfcvtn2 v2.8h, v5.4s\n"
- "str q14, [x27, #0x10]\n"
- ".inst 0x4ea16a21 // bfcvtn2 v1.8h, v17.4s\n"
- ".inst 0x4ea16b80 // bfcvtn2 v0.8h, v28.4s\n"
- "str q9, [x27, #0x20]\n"
- ".inst 0x4ea16b1d // bfcvtn2 v29.8h, v24.4s\n"
- ".inst 0x4ea16b3a // bfcvtn2 v26.8h, v25.4s\n"
- "str q10, [x27, #0x30]\n"
- ".inst 0x4ea169b0 // bfcvtn2 v16.8h, v13.4s\n"
- ".inst 0x4ea16ab6 // bfcvtn2 v22.8h, v21.4s\n"
- "str q11, [x27, #0x40]\n"
- ".inst 0x4ea16a77 // bfcvtn2 v23.8h, v19.4s\n"
- ".inst 0x4ea16992 // bfcvtn2 v18.8h, v12.4s\n"
- "str q6, [x27, #0x50]\n"
- "str q8, [x27, #0x60]\n"
- "str q2, [x27, #0x70]\n"
- "str q1, [x27, #0x80]\n"
- "str q0, [x27, #0x90]\n"
- "str q29, [x27, #0xa0]\n"
- "str q26, [x27, #0xb0]\n"
- "str q16, [x27, #0xc0]\n"
- "str q22, [x27, #0xd0]\n"
- "str q23, [x27, #0xe0]\n"
- "str q18, [x27, #0xf0]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip2 v1.4s, v1.4s, v15.4s\n"
+ ".inst 0x4ea16b8c // bfcvtn2 v12.8h, v28.4s\n"
+ "str q12, [x21, #0x10]\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16948 // bfcvtn2 v8.8h, v10.4s\n"
+ "str q19, [x21, #0x20]\n"
+ ".inst 0x4ea168db // bfcvtn2 v27.8h, v6.4s\n"
+ ".inst 0x4ea16ac9 // bfcvtn2 v9.8h, v22.4s\n"
+ "str q8, [x21, #0x30]\n"
+ ".inst 0x4ea169a4 // bfcvtn2 v4.8h, v13.4s\n"
+ ".inst 0x4ea16aba // bfcvtn2 v26.8h, v21.4s\n"
+ "str q27, [x21, #0x40]\n"
+ ".inst 0x4ea168a0 // bfcvtn2 v0.8h, v5.4s\n"
+ ".inst 0x4ea1685d // bfcvtn2 v29.8h, v2.4s\n"
+ "str q9, [x21, #0x50]\n"
+ ".inst 0x4ea168f0 // bfcvtn2 v16.8h, v7.4s\n"
+ ".inst 0x4ea16bc3 // bfcvtn2 v3.8h, v30.4s\n"
+ "str q4, [x21, #0x60]\n"
+ ".inst 0x4ea16a39 // bfcvtn2 v25.8h, v17.4s\n"
+ ".inst 0x4ea16b14 // bfcvtn2 v20.8h, v24.4s\n"
+ "str q26, [x21, #0x70]\n"
+ ".inst 0x4ea169df // bfcvtn2 v31.8h, v14.4s\n"
+ ".inst 0x4ea16837 // bfcvtn2 v23.8h, v1.4s\n"
+ "str q0, [x21, #0x80]\n"
+ "str q29, [x21, #0x90]\n"
+ "str q16, [x21, #0xa0]\n"
+ "str q3, [x21, #0xb0]\n"
+ "str q25, [x21, #0xc0]\n"
+ "str q20, [x21, #0xd0]\n"
+ "str q31, [x21, #0xe0]\n"
+ "str q23, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Column loop skip
- "cbz x28, 8f\n"
- "cmp x28, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "str q16, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "str q16, [x27, #0x80]\n"
- "str q16, [x27, #0x90]\n"
- "str q16, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
- "str q16, [x27, #0xc0]\n"
- "str q16, [x27, #0xd0]\n"
- "str q16, [x27, #0xe0]\n"
- "str q16, [x27, #0xf0]\n"
+ "cmp x24, #0x4\n"
"blt 5f\n"
"4:" // Main row loop: width 4 loop: loop
- "ldr q25, [x9], #0x10\n"
- "ldr q24, [x26], #0x10\n"
- "sub x28, x28, #0x4\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "cmp x28, #0x4\n"
- "ldr q23, [x23], #0x10\n"
- "ldr q19, [x22], #0x10\n"
- "ldr q18, [x21], #0x10\n"
- "ldr q17, [x20], #0x10\n"
- "zip1 v22.4s, v25.4s, v21.4s\n"
- "zip1 v16.4s, v24.4s, v20.4s\n"
- "zip2 v21.4s, v25.4s, v21.4s\n"
- "zip2 v20.4s, v24.4s, v20.4s\n"
- "zip1 v27.4s, v23.4s, v18.4s\n"
- "zip1 v26.4s, v19.4s, v17.4s\n"
- "zip2 v25.4s, v23.4s, v18.4s\n"
- "zip2 v24.4s, v19.4s, v17.4s\n"
- "zip1 v19.4s, v22.4s, v16.4s\n"
- "zip1 v18.4s, v21.4s, v20.4s\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v23.4s, v17.4s\n"
+ "zip1 v21.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v28.4s, v23.4s, v17.4s\n"
+ "zip2 v20.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v27.4s, v19.4s, v17.4s\n"
+ "zip1 v26.4s, v18.4s, v16.4s\n"
+ "zip2 v25.4s, v19.4s, v17.4s\n"
+ "zip2 v24.4s, v18.4s, v16.4s\n"
+ "zip1 v19.4s, v22.4s, v21.4s\n"
+ "zip1 v18.4s, v28.4s, v20.4s\n"
"zip1 v17.4s, v27.4s, v26.4s\n"
- "zip2 v23.4s, v22.4s, v16.4s\n"
"zip1 v16.4s, v25.4s, v24.4s\n"
- "zip2 v22.4s, v21.4s, v20.4s\n"
- ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
- ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v22.4s, v21.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v28.4s, v20.4s\n"
".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
"zip2 v18.4s, v27.4s, v26.4s\n"
".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
"zip2 v16.4s, v25.4s, v24.4s\n"
- ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n"
- ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q23, [x21, #0x0]\n"
".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "str q19, [x27, #0x80]\n"
- "str q17, [x27, #0x90]\n"
- "add x27, x27, #0x20\n"
+ "str q21, [x21, #0x10]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q17, [x21, #0x90]\n"
+ "add x21, x21, #0x20\n"
"bge 4b\n"
"5:" // Main row loop: width 4 loop: skip
- "cmp x28, #0x1\n"
+ "cmp x24, #0x1\n"
"blt 7f\n"
"6:" // Main row loop: width 1 loop: loop
- "ldr s23, [x9], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s17, [x24], #0x4\n"
- "cmp x28, #0x1\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s18, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
- "zip1 v19.4s, v23.4s, v19.4s\n"
- "zip1 v17.4s, v22.4s, v17.4s\n"
- "zip1 v18.4s, v21.4s, v18.4s\n"
- "zip1 v16.4s, v20.4s, v16.4s\n"
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
"zip1 v17.4s, v19.4s, v17.4s\n"
"zip1 v16.4s, v18.4s, v16.4s\n"
- ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.4s, v20.4s, v17.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- "str d17, [x27, #0x0]\n"
- "str d16, [x27, #0x80]\n"
- "add x27, x27, #0x8\n"
+ "str d18, [x21, #0x0]\n"
+ "str d16, [x21, #0x80]\n"
+ "add x21, x21, #0x8\n"
"bge 6b\n"
"7:" // Main row loop: width 1 loop: skip
- "8:" // Main row loop: odd col skip
"cmp %x[height], #0x8\n"
"add %x[out], %x[out], #0x100\n"
"bge 1b\n"
- "cbz %x[height], 18f\n"
- "9:" // Main loop skip
- "10:" // Tail row loop: Head
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+ "9:" // Tail row loop: Head
"mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
"mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x27, %x[out]\n"
- "add x26, x9, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "csel x25, x25, %x[pad_row], GE\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add %x[in], x26, %x[in_stride]\n"
"csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
"cmp x20, #0x10\n"
- "blt 12f\n"
- "11:" // Tail row loop: Column loop
- "ldr q20, [x9], #0x10\n"
- "ldr q19, [x26], #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
"sub x20, x20, #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0x10\n"
- "ldr q0, [x9], #0x10\n"
- "ldr q31, [x26], #0x10\n"
- "ldr q24, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "ldr q23, [x9], #0x10\n"
- "zip1 v30.4s, v20.4s, v18.4s\n"
- "zip1 v29.4s, v19.4s, v17.4s\n"
- "ldr q22, [x26], #0x10\n"
- "ldr q21, [x25], #0x10\n"
- "zip2 v28.4s, v20.4s, v18.4s\n"
- "zip2 v27.4s, v19.4s, v17.4s\n"
- "ldr q20, [x24], #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v30.4s, v21.4s, v17.4s\n"
+ "zip1 v29.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v28.4s, v21.4s, v17.4s\n"
+ "zip2 v27.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v26.4s, v19.4s, v17.4s\n"
+ "zip1 v25.4s, v18.4s, v16.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v8.4s, v19.4s, v17.4s\n"
+ "zip2 v24.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v7.4s, v21.4s, v17.4s\n"
+ "zip1 v6.4s, v20.4s, v16.4s\n"
"ldr q19, [x9], #0x10\n"
- "zip1 v26.4s, v0.4s, v24.4s\n"
- "zip1 v25.4s, v31.4s, v16.4s\n"
- "ldr q18, [x26], #0x10\n"
- "ldr q17, [x25], #0x10\n"
- "zip2 v8.4s, v0.4s, v24.4s\n"
- "zip2 v24.4s, v31.4s, v16.4s\n"
- "ldr q16, [x24], #0x10\n"
- "zip1 v7.4s, v23.4s, v21.4s\n"
- "zip1 v6.4s, v22.4s, v20.4s\n"
- "zip2 v5.4s, v23.4s, v21.4s\n"
- "zip2 v4.4s, v22.4s, v20.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v5.4s, v21.4s, v17.4s\n"
+ "zip2 v4.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
"zip1 v3.4s, v19.4s, v17.4s\n"
"zip1 v2.4s, v18.4s, v16.4s\n"
"zip2 v1.4s, v19.4s, v17.4s\n"
@@ -377,81 +358,70 @@ void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, si
"zip2 v16.4s, v1.4s, v0.4s\n"
".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n"
".inst 0x4ea16b9d // bfcvtn2 v29.8h, v28.4s\n"
+ "str q31, [x21, #0x0]\n"
".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
+ "str q29, [x21, #0x10]\n"
".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q27, [x21, #0x20]\n"
".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
- "str q31, [x27, #0x0]\n"
- "str q29, [x27, #0x10]\n"
- "str q27, [x27, #0x20]\n"
- "str q25, [x27, #0x30]\n"
- "str q23, [x27, #0x40]\n"
- "str q21, [x27, #0x50]\n"
- "str q19, [x27, #0x60]\n"
- "str q17, [x27, #0x70]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 11b\n"
- "12:" // Tail row loop: Column loop skip
- "cbz x20, 17f\n"
+ "str q25, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q21, [x21, #0x50]\n"
+ "str q19, [x21, #0x60]\n"
+ "str q17, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "str q16, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "blt 14f\n"
- "13:" // Tail row loop: width 4 loop: loop
- "ldr q21, [x9], #0x10\n"
- "ldr q20, [x26], #0x10\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: width 4 loop: loop
+ "ldr q20, [x9], #0x10\n"
+ "ldr q19, [x28], #0x10\n"
"sub x20, x20, #0x4\n"
- "ldr q19, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0x4\n"
- "zip1 v18.4s, v21.4s, v19.4s\n"
- "zip1 v16.4s, v20.4s, v17.4s\n"
- "zip2 v21.4s, v21.4s, v19.4s\n"
- "zip2 v20.4s, v20.4s, v17.4s\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "zip2 v19.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v20.4s, v17.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "zip2 v21.4s, v20.4s, v17.4s\n"
+ "zip2 v20.4s, v19.4s, v16.4s\n"
+ "zip1 v17.4s, v22.4s, v18.4s\n"
"zip1 v16.4s, v21.4s, v20.4s\n"
- ".inst 0x0ea16a32 // bfcvtn v18.4h, v17.4s\n"
- "zip2 v17.4s, v21.4s, v20.4s\n"
- ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
- ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
- "str q18, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- "bge 13b\n"
- "14:" // Tail row loop: width 4 loop: skip
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v22.4s, v18.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q19, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 16f\n"
- "15:" // Tail row loop: width 1 loop: loop
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
"ldr s19, [x9], #0x4\n"
- "ldr s18, [x26], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
"sub x20, x20, #0x1\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s16, [x24], #0x4\n"
"cmp x20, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
"zip1 v17.4s, v19.4s, v17.4s\n"
"zip1 v16.4s, v18.4s, v16.4s\n"
"zip1 v16.4s, v17.4s, v16.4s\n"
".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- "str d16, [x27, #0x0]\n"
- "add x27, x27, #0x8\n"
- "bge 15b\n"
- "16:" // Tail row loop: width 1 loop: skip
- "17:" // Tail row loop: odd col skip
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x80\n"
- "bge 10b\n"
- "18:" // Done
+ "bge 9b\n"
+ "16:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
index 0f00300c54..a49acf1449 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
@@ -34,206 +34,192 @@ void a64_transpose_interleave_24(uint16_t *out, const uint16_t *in, size_t width
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
"mov x24, %x[width]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x25, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
"cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
"ldr q1, [x25], #0x10\n"
- "ldr q0, [x21], #0x10\n"
+ "ldr q0, [x22], #0x10\n"
"sub x24, x24, #0x18\n"
- "ldr q17, [x25], #0x10\n"
- "ldr q31, [x22], #0x10\n"
"cmp x24, #0x18\n"
- "ldr q16, [x21], #0x10\n"
- "ldr q30, [x20], #0x10\n"
- "ldr q29, [x25], #0x10\n"
- "ldr q28, [x21], #0x10\n"
- "ldr q27, [x22], #0x10\n"
- "dup v26.2d, v17.d[0]\n"
- "dup v25.2d, v31.d[1]\n"
- "ldr q24, [x20], #0x10\n"
- "ldr q23, [x22], #0x10\n"
- "dup v22.2d, v16.d[0]\n"
- "dup v21.2d, v30.d[1]\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q31, [x23], #0x10\n"
+ "dup v30.2d, v17.d[0]\n"
+ "dup v29.2d, v31.d[1]\n"
+ "ldr q16, [x22], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "dup v27.2d, v16.d[0]\n"
+ "dup v26.2d, v28.d[1]\n"
+ "ldr q25, [x25], #0x10\n"
+ "ldr q24, [x22], #0x10\n"
+ "dup v23.2d, v17.d[1]\n"
+ "dup v22.2d, v25.d[1]\n"
+ "ldr q21, [x23], #0x10\n"
"ldr q20, [x20], #0x10\n"
- "dup v19.2d, v17.d[1]\n"
- "dup v18.2d, v29.d[1]\n"
- "str q1, [x23, #0x0]\n"
- "dup v17.2d, v16.d[1]\n"
- "dup v16.2d, v28.d[1]\n"
- "mov v26.d[1], v31.d[0]\n"
- "mov v25.d[1], v27.d[0]\n"
- "mov v22.d[1], v30.d[0]\n"
- "mov v21.d[1], v24.d[0]\n"
- "str q26, [x23, #0x10]\n"
- "str q25, [x23, #0x20]\n"
- "mov v19.d[1], v29.d[0]\n"
- "mov v18.d[1], v27.d[1]\n"
- "str q0, [x23, #0x30]\n"
- "mov v17.d[1], v28.d[0]\n"
- "mov v16.d[1], v24.d[1]\n"
- "str q22, [x23, #0x40]\n"
- "str q21, [x23, #0x50]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q19, [x23, #0x0]\n"
- "str q18, [x23, #0x10]\n"
- "str q23, [x23, #0x20]\n"
- "str q17, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q20, [x23, #0x50]\n"
- "add x23, x23, %x[out_stride]\n"
+ "dup v19.2d, v16.d[1]\n"
+ "dup v18.2d, v24.d[1]\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "mov v30.d[1], v31.d[0]\n"
+ "mov v29.d[1], v21.d[0]\n"
+ "mov v27.d[1], v28.d[0]\n"
+ "mov v26.d[1], v20.d[0]\n"
+ "str q1, [x21, #0x0]\n"
+ "str q30, [x21, #0x10]\n"
+ "mov v23.d[1], v25.d[0]\n"
+ "mov v22.d[1], v21.d[1]\n"
+ "str q29, [x21, #0x20]\n"
+ "mov v19.d[1], v24.d[0]\n"
+ "mov v18.d[1], v20.d[1]\n"
+ "str q0, [x21, #0x30]\n"
+ "str q27, [x21, #0x40]\n"
+ "str q26, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q23, [x21, #0x0]\n"
+ "str q22, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q18, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
"cmp x24, #0xc\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr q25, [x22], #0x10\n"
- "ldr q24, [x20], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q23, [x20], #0x10\n"
+ "dup v22.2d, v17.d[1]\n"
+ "dup v21.2d, v23.d[1]\n"
+ "ldr q20, [x25], #0x10\n"
+ "ldr q19, [x22], #0x10\n"
"sub x24, x24, #0xc\n"
- "ldr q23, [x25], #0x10\n"
- "ldr q22, [x21], #0x10\n"
"cmp x24, #0xc\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "ldr d18, [x20], #0x8\n"
- "dup v17.2d, v25.d[1]\n"
- "dup v16.2d, v24.d[1]\n"
- "str q23, [x23, #0x0]\n"
- "mov v21.d[1], v25.d[0]\n"
- "mov v17.d[1], v20.d[0]\n"
- "mov v19.d[1], v24.d[0]\n"
- "mov v16.d[1], v18.d[0]\n"
- "str q21, [x23, #0x10]\n"
- "str q17, [x23, #0x20]\n"
- "str q22, [x23, #0x30]\n"
- "str q19, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "add x23, x23, %x[out_stride]\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov v18.d[1], v17.d[0]\n"
+ "mov v22.d[1], v16.d[0]\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "mov v17.d[1], v23.d[0]\n"
+ "mov v21.d[1], v16.d[0]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q21, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x24, 10f\n"
"cmp x24, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
"ldr d19, [x25], #0x8\n"
- "ldr d18, [x22], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"sub x24, x24, #0x4\n"
- "ldr d17, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
"cmp x24, #0x4\n"
- "str d19, [x23, #0x0]\n"
- "str d18, [x23, #0x18]\n"
- "str d17, [x23, #0x30]\n"
- "str d16, [x23, #0x48]\n"
- "add x23, x23, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str d19, [x21, #0x0]\n"
+ "str d18, [x21, #0x18]\n"
+ "str d17, [x21, #0x30]\n"
+ "str d16, [x21, #0x48]\n"
+ "add x21, x21, #0x8\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
"cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
"ldr h19, [x25], #0x2\n"
- "ldr h18, [x22], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
"sub x24, x24, #0x1\n"
- "ldr h17, [x21], #0x2\n"
- "ldr h16, [x20], #0x2\n"
"cmp x24, #0x1\n"
- "str h19, [x23, #0x0]\n"
- "str h18, [x23, #0x18]\n"
- "str h17, [x23, #0x30]\n"
- "str h16, [x23, #0x48]\n"
- "add x23, x23, #0x2\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str h19, [x21, #0x0]\n"
+ "str h18, [x21, #0x18]\n"
+ "str h17, [x21, #0x30]\n"
+ "str h16, [x21, #0x48]\n"
+ "add x21, x21, #0x2\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x4\n"
"add %x[out], %x[out], #0x60\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x20, #0x18\n"
"add %x[in], x25, %x[in_stride]\n"
- "blt 14f\n"
- "13:" // Tail row loop: Unroll column loop
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
"ldr q19, [x25], #0x10\n"
- "sub x20, x20, #0x18\n"
"ldr q16, [x25], #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "cmp x20, #0x18\n"
- "dup v17.2d, v16.d[1]\n"
+ "dup v18.2d, v16.d[1]\n"
+ "sub x20, x20, #0x18\n"
+ "ldr q17, [x25], #0x10\n"
"dup v16.2d, v16.d[0]\n"
- "str q19, [x23, #0x0]\n"
- "str d16, [x23, #0x10]\n"
- "add x23, x23, %x[out_stride]\n"
- "mov v17.d[1], v18.d[0]\n"
- "dup v16.2d, v18.d[1]\n"
- "str q17, [x23, #0x0]\n"
- "str d16, [x23, #0x10]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Unroll column loop skip
+ "str q19, [x21, #0x0]\n"
+ "cmp x20, #0x18\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "mov v18.d[1], v17.d[0]\n"
+ "dup v16.2d, v17.d[1]\n"
+ "str q18, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
"cmp x20, #0xc\n"
- "blt 16f\n"
- "15:" // Tail row loop: Column loop
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
"ldr q17, [x25], #0x10\n"
- "sub x20, x20, #0xc\n"
"ldr d16, [x25], #0x8\n"
+ "sub x20, x20, #0xc\n"
"cmp x20, #0xc\n"
- "str q17, [x23, #0x0]\n"
- "str d16, [x23, #0x10]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q17, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str d16, [x23, #0x10]\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr d16, [x25], #0x8\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
- "str d16, [x23, #0x0]\n"
- "add x23, x23, #0x8\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr h16, [x25], #0x2\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
- "str h16, [x23, #0x0]\n"
- "add x23, x23, #0x2\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x18\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
index 3e0ab6d955..d8edd806eb 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
@@ -40,675 +40,634 @@ void a64_transpose_interleave_24_2x4_fp32bf16(bfloat16 *out, const float *in, si
__asm__ __volatile__(
"cmp %x[height], #0x8\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
- "mov x28, %x[width]\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x9, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "cmp x28, #0x18\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Column loop
- "ldr q28, [x9], #0x10\n"
- "ldr q17, [x26], #0x10\n"
- "sub x28, x28, #0x18\n"
- "ldr q19, [x25], #0x10\n"
- "ldr q15, [x24], #0x10\n"
- "cmp x28, #0x18\n"
- "ldr q20, [x23], #0x10\n"
- "ldr q30, [x22], #0x10\n"
- "ldr q9, [x21], #0x10\n"
- "ldr q14, [x20], #0x10\n"
+ "ldr q15, [x9], #0x10\n"
+ "ldr q1, [x28], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "ldr q0, [x27], #0x10\n"
+ "ldr q27, [x26], #0x10\n"
+ "zip1 v18.4s, v15.4s, v0.4s\n"
+ "zip1 v20.4s, v1.4s, v27.4s\n"
+ "ldr q13, [x25], #0x10\n"
+ "ldr q14, [x23], #0x10\n"
+ "zip2 v16.4s, v15.4s, v0.4s\n"
+ "zip2 v3.4s, v1.4s, v27.4s\n"
+ "ldr q12, [x22], #0x10\n"
+ "ldr q11, [x20], #0x10\n"
+ "zip1 v4.4s, v13.4s, v12.4s\n"
+ "zip1 v28.4s, v14.4s, v11.4s\n"
+ "ldr q5, [x9], #0x10\n"
+ "ldr q30, [x28], #0x10\n"
+ "zip2 v23.4s, v13.4s, v12.4s\n"
+ "zip2 v19.4s, v14.4s, v11.4s\n"
+ "ldr q25, [x27], #0x10\n"
+ "ldr q11, [x26], #0x10\n"
+ "zip1 v21.4s, v5.4s, v25.4s\n"
+ "zip1 v14.4s, v30.4s, v11.4s\n"
+ "ldr q6, [x25], #0x10\n"
+ "ldr q27, [x23], #0x10\n"
+ "zip2 v29.4s, v5.4s, v25.4s\n"
+ "zip2 v17.4s, v30.4s, v11.4s\n"
+ "ldr q2, [x22], #0x10\n"
+ "ldr q10, [x20], #0x10\n"
+ "zip1 v11.4s, v6.4s, v2.4s\n"
+ "zip1 v1.4s, v27.4s, v10.4s\n"
+ "ldr q8, [x9], #0x10\n"
+ "ldr q5, [x28], #0x10\n"
+ "zip2 v24.4s, v6.4s, v2.4s\n"
+ "zip2 v0.4s, v27.4s, v10.4s\n"
+ "ldr q6, [x27], #0x10\n"
+ "ldr q31, [x26], #0x10\n"
+ "zip1 v12.4s, v8.4s, v6.4s\n"
+ "zip1 v10.4s, v5.4s, v31.4s\n"
+ "ldr q30, [x25], #0x10\n"
+ "ldr q2, [x23], #0x10\n"
+ "zip2 v9.4s, v8.4s, v6.4s\n"
+ "zip2 v13.4s, v5.4s, v31.4s\n"
+ "ldr q7, [x22], #0x10\n"
+ "ldr q8, [x20], #0x10\n"
+ "zip1 v27.4s, v30.4s, v7.4s\n"
+ "zip1 v31.4s, v2.4s, v8.4s\n"
+ "ldr q5, [x9], #0x10\n"
+ "ldr q26, [x28], #0x10\n"
+ "zip2 v22.4s, v30.4s, v7.4s\n"
+ "zip2 v8.4s, v2.4s, v8.4s\n"
+ "ldr q2, [x27], #0x10\n"
+ "ldr q6, [x26], #0x10\n"
+ "zip1 v25.4s, v5.4s, v2.4s\n"
+ "zip1 v15.4s, v26.4s, v6.4s\n"
+ "ldr q7, [x25], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
+ "zip2 v5.4s, v5.4s, v2.4s\n"
+ "zip2 v26.4s, v26.4s, v6.4s\n"
+ "ldr q2, [x22], #0x10\n"
+ "zip1 v6.4s, v7.4s, v2.4s\n"
+ "zip2 v7.4s, v7.4s, v2.4s\n"
+ "zip1 v2.4s, v18.4s, v20.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "zip2 v20.4s, v18.4s, v20.4s\n"
+ "ldr q18, [x20], #0x10\n"
+ ".inst 0x4ea16a82 // bfcvtn2 v2.8h, v20.4s\n"
+ "zip1 v20.4s, v30.4s, v18.4s\n"
+ "zip2 v18.4s, v30.4s, v18.4s\n"
+ "zip1 v30.4s, v16.4s, v3.4s\n"
+ ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
+ "zip2 v3.4s, v16.4s, v3.4s\n"
"ldr q16, [x9], #0x10\n"
- "ldr q22, [x26], #0x10\n"
- "zip1 v27.4s, v28.4s, v19.4s\n"
- "zip1 v7.4s, v17.4s, v15.4s\n"
- "ldr q12, [x25], #0x10\n"
- "ldr q24, [x24], #0x10\n"
- "zip2 v6.4s, v28.4s, v19.4s\n"
- "zip2 v26.4s, v17.4s, v15.4s\n"
- "ldr q18, [x23], #0x10\n"
- "ldr q23, [x22], #0x10\n"
- "zip1 v25.4s, v20.4s, v9.4s\n"
- "zip1 v11.4s, v30.4s, v14.4s\n"
- "ldr q2, [x21], #0x10\n"
- "ldr q0, [x20], #0x10\n"
- "zip2 v28.4s, v20.4s, v9.4s\n"
- "zip2 v31.4s, v30.4s, v14.4s\n"
- "ldr q13, [x9], #0x10\n"
- "ldr q10, [x26], #0x10\n"
- "zip1 v14.4s, v16.4s, v12.4s\n"
- "zip1 v15.4s, v22.4s, v24.4s\n"
- "ldr q1, [x25], #0x10\n"
- "ldr q30, [x24], #0x10\n"
- "zip2 v3.4s, v16.4s, v12.4s\n"
- "zip2 v20.4s, v22.4s, v24.4s\n"
- "ldr q17, [x23], #0x10\n"
- "ldr q9, [x22], #0x10\n"
- "zip1 v21.4s, v18.4s, v2.4s\n"
- "zip1 v22.4s, v23.4s, v0.4s\n"
- "ldr q5, [x21], #0x10\n"
- "ldr q4, [x20], #0x10\n"
- "zip2 v16.4s, v18.4s, v2.4s\n"
- "zip2 v12.4s, v23.4s, v0.4s\n"
- "ldr q23, [x9], #0x10\n"
- "ldr q24, [x26], #0x10\n"
- "zip1 v2.4s, v13.4s, v1.4s\n"
- "zip1 v18.4s, v10.4s, v30.4s\n"
- "ldr q29, [x25], #0x10\n"
- "ldr q0, [x24], #0x10\n"
- "zip2 v1.4s, v13.4s, v1.4s\n"
- "zip2 v30.4s, v10.4s, v30.4s\n"
- "ldr q13, [x23], #0x10\n"
- "ldr q10, [x22], #0x10\n"
- "zip1 v8.4s, v17.4s, v5.4s\n"
- "zip1 v19.4s, v9.4s, v4.4s\n"
- "zip2 v5.4s, v17.4s, v5.4s\n"
- "ldr q17, [x21], #0x10\n"
- "zip2 v9.4s, v9.4s, v4.4s\n"
- "zip1 v4.4s, v23.4s, v29.4s\n"
- "zip2 v23.4s, v23.4s, v29.4s\n"
- "zip1 v29.4s, v24.4s, v0.4s\n"
- "zip2 v24.4s, v24.4s, v0.4s\n"
- "zip1 v0.4s, v13.4s, v17.4s\n"
- "zip2 v17.4s, v13.4s, v17.4s\n"
- "zip1 v13.4s, v27.4s, v7.4s\n"
- "zip2 v7.4s, v27.4s, v7.4s\n"
- "ldr q27, [x20], #0x10\n"
+ ".inst 0x4ea1687e // bfcvtn2 v30.8h, v3.4s\n"
+ "zip1 v3.4s, v21.4s, v14.4s\n"
+ ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
+ "zip2 v21.4s, v21.4s, v14.4s\n"
+ "ldr q14, [x28], #0x10\n"
+ ".inst 0x4ea16aa3 // bfcvtn2 v3.8h, v21.4s\n"
+ "zip1 v21.4s, v29.4s, v17.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ "zip2 v29.4s, v29.4s, v17.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ ".inst 0x4ea16bb5 // bfcvtn2 v21.8h, v29.4s\n"
+ "zip1 v29.4s, v16.4s, v17.4s\n"
+ "zip2 v16.4s, v16.4s, v17.4s\n"
+ "zip1 v17.4s, v12.4s, v10.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "zip2 v10.4s, v12.4s, v10.4s\n"
+ "ldr q12, [x26], #0x10\n"
+ ".inst 0x4ea16951 // bfcvtn2 v17.8h, v10.4s\n"
+ "zip1 v10.4s, v14.4s, v12.4s\n"
+ "zip2 v14.4s, v14.4s, v12.4s\n"
+ "zip1 v12.4s, v9.4s, v13.4s\n"
+ ".inst 0x0ea1698c // bfcvtn v12.4h, v12.4s\n"
+ "zip2 v13.4s, v9.4s, v13.4s\n"
+ "ldr q9, [x25], #0x10\n"
+ ".inst 0x4ea169ac // bfcvtn2 v12.8h, v13.4s\n"
+ "zip1 v13.4s, v25.4s, v15.4s\n"
".inst 0x0ea169ad // bfcvtn v13.4h, v13.4s\n"
- ".inst 0x4ea168ed // bfcvtn2 v13.8h, v7.4s\n"
- "zip1 v7.4s, v6.4s, v26.4s\n"
- "zip2 v26.4s, v6.4s, v26.4s\n"
- "ldr q6, [x9], #0x10\n"
- ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n"
- ".inst 0x4ea16b47 // bfcvtn2 v7.8h, v26.4s\n"
- "zip1 v26.4s, v10.4s, v27.4s\n"
- "zip2 v27.4s, v10.4s, v27.4s\n"
- "zip1 v10.4s, v14.4s, v15.4s\n"
- "zip2 v14.4s, v14.4s, v15.4s\n"
- "ldr q15, [x26], #0x10\n"
- ".inst 0x0ea1694a // bfcvtn v10.4h, v10.4s\n"
- ".inst 0x4ea169ca // bfcvtn2 v10.8h, v14.4s\n"
- "zip1 v14.4s, v3.4s, v20.4s\n"
- "zip2 v20.4s, v3.4s, v20.4s\n"
- "ldr q3, [x25], #0x10\n"
+ "zip2 v25.4s, v25.4s, v15.4s\n"
+ "ldr q15, [x23], #0x10\n"
+ ".inst 0x4ea16b2d // bfcvtn2 v13.8h, v25.4s\n"
+ "zip1 v25.4s, v5.4s, v26.4s\n"
+ ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+ "zip2 v5.4s, v5.4s, v26.4s\n"
+ "ldr q26, [x22], #0x10\n"
+ ".inst 0x4ea168b9 // bfcvtn2 v25.8h, v5.4s\n"
+ "zip1 v5.4s, v9.4s, v26.4s\n"
+ "zip2 v9.4s, v9.4s, v26.4s\n"
+ "zip1 v26.4s, v29.4s, v10.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "zip2 v10.4s, v29.4s, v10.4s\n"
+ "ldr q29, [x20], #0x10\n"
+ ".inst 0x4ea1695a // bfcvtn2 v26.8h, v10.4s\n"
+ "zip1 v10.4s, v15.4s, v29.4s\n"
+ "zip2 v15.4s, v15.4s, v29.4s\n"
+ "zip1 v29.4s, v16.4s, v14.4s\n"
+ ".inst 0x0ea16bbd // bfcvtn v29.4h, v29.4s\n"
+ "zip2 v14.4s, v16.4s, v14.4s\n"
+ "ldr q16, [x9], #0x10\n"
+ ".inst 0x4ea169dd // bfcvtn2 v29.8h, v14.4s\n"
+ "zip1 v14.4s, v4.4s, v28.4s\n"
".inst 0x0ea169ce // bfcvtn v14.4h, v14.4s\n"
- ".inst 0x4ea16a8e // bfcvtn2 v14.8h, v20.4s\n"
- "zip1 v20.4s, v2.4s, v18.4s\n"
- "zip2 v18.4s, v2.4s, v18.4s\n"
- "ldr q2, [x24], #0x10\n"
- ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
- ".inst 0x4ea16a54 // bfcvtn2 v20.8h, v18.4s\n"
- "zip1 v18.4s, v6.4s, v3.4s\n"
- "zip2 v3.4s, v6.4s, v3.4s\n"
- "zip1 v6.4s, v15.4s, v2.4s\n"
- "zip2 v15.4s, v15.4s, v2.4s\n"
- "zip1 v2.4s, v1.4s, v30.4s\n"
- "zip2 v30.4s, v1.4s, v30.4s\n"
- "ldr q1, [x23], #0x10\n"
- ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- ".inst 0x4ea16bc2 // bfcvtn2 v2.8h, v30.4s\n"
- "zip1 v30.4s, v4.4s, v29.4s\n"
- "zip2 v4.4s, v4.4s, v29.4s\n"
- "ldr q29, [x22], #0x10\n"
- ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
- ".inst 0x4ea1689e // bfcvtn2 v30.8h, v4.4s\n"
- "zip1 v4.4s, v23.4s, v24.4s\n"
- "zip2 v24.4s, v23.4s, v24.4s\n"
- "ldr q23, [x21], #0x10\n"
+ "zip2 v4.4s, v4.4s, v28.4s\n"
+ "ldr q28, [x28], #0x10\n"
+ ".inst 0x4ea1688e // bfcvtn2 v14.8h, v4.4s\n"
+ "zip1 v4.4s, v23.4s, v19.4s\n"
".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
- ".inst 0x4ea16b04 // bfcvtn2 v4.8h, v24.4s\n"
- "zip1 v24.4s, v18.4s, v6.4s\n"
- "zip2 v18.4s, v18.4s, v6.4s\n"
- "ldr q6, [x20], #0x10\n"
- ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
- ".inst 0x4ea16a58 // bfcvtn2 v24.8h, v18.4s\n"
- "zip1 v18.4s, v1.4s, v23.4s\n"
- "zip2 v1.4s, v1.4s, v23.4s\n"
- "zip1 v23.4s, v29.4s, v6.4s\n"
- "zip2 v29.4s, v29.4s, v6.4s\n"
- "zip1 v6.4s, v3.4s, v15.4s\n"
- "zip2 v3.4s, v3.4s, v15.4s\n"
- "ldr q15, [x9], #0x10\n"
- ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
- ".inst 0x4ea16866 // bfcvtn2 v6.8h, v3.4s\n"
- "zip1 v3.4s, v25.4s, v11.4s\n"
- "zip2 v11.4s, v25.4s, v11.4s\n"
- "ldr q25, [x26], #0x10\n"
- ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
- ".inst 0x4ea16963 // bfcvtn2 v3.8h, v11.4s\n"
- "zip1 v11.4s, v28.4s, v31.4s\n"
- "zip2 v28.4s, v28.4s, v31.4s\n"
- "ldr q31, [x25], #0x10\n"
+ "zip2 v19.4s, v23.4s, v19.4s\n"
+ "ldr q23, [x27], #0x10\n"
+ ".inst 0x4ea16a64 // bfcvtn2 v4.8h, v19.4s\n"
+ "zip1 v19.4s, v16.4s, v23.4s\n"
+ "zip2 v16.4s, v16.4s, v23.4s\n"
+ "zip1 v23.4s, v11.4s, v1.4s\n"
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ "zip2 v1.4s, v11.4s, v1.4s\n"
+ "ldr q11, [x26], #0x10\n"
+ ".inst 0x4ea16837 // bfcvtn2 v23.8h, v1.4s\n"
+ "zip1 v1.4s, v28.4s, v11.4s\n"
+ "zip2 v28.4s, v28.4s, v11.4s\n"
+ "zip1 v11.4s, v19.4s, v1.4s\n"
".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
- ".inst 0x4ea16b8b // bfcvtn2 v11.8h, v28.4s\n"
- "zip1 v28.4s, v21.4s, v22.4s\n"
- "zip2 v21.4s, v21.4s, v22.4s\n"
- "ldr q22, [x24], #0x10\n"
- ".inst 0x0ea16b9c // bfcvtn v28.4h, v28.4s\n"
- ".inst 0x4ea16abc // bfcvtn2 v28.8h, v21.4s\n"
- "zip1 v21.4s, v15.4s, v31.4s\n"
- "zip2 v31.4s, v15.4s, v31.4s\n"
- "zip1 v15.4s, v25.4s, v22.4s\n"
- "zip2 v22.4s, v25.4s, v22.4s\n"
- "zip1 v25.4s, v16.4s, v12.4s\n"
- "zip2 v16.4s, v16.4s, v12.4s\n"
- "ldr q12, [x23], #0x10\n"
- ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
- ".inst 0x4ea16a19 // bfcvtn2 v25.8h, v16.4s\n"
- "zip1 v16.4s, v21.4s, v15.4s\n"
- "zip2 v21.4s, v21.4s, v15.4s\n"
- "ldr q15, [x22], #0x10\n"
+ "zip2 v19.4s, v19.4s, v1.4s\n"
+ "ldr q1, [x25], #0x10\n"
+ ".inst 0x4ea16a6b // bfcvtn2 v11.8h, v19.4s\n"
+ "zip1 v19.4s, v16.4s, v28.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ "zip2 v16.4s, v16.4s, v28.4s\n"
+ "ldr q28, [x23], #0x10\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "zip1 v16.4s, v24.4s, v0.4s\n"
".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- ".inst 0x4ea16ab0 // bfcvtn2 v16.8h, v21.4s\n"
- "zip1 v21.4s, v31.4s, v22.4s\n"
- "zip2 v22.4s, v31.4s, v22.4s\n"
- "ldr q31, [x21], #0x10\n"
- ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
- ".inst 0x4ea16ad5 // bfcvtn2 v21.8h, v22.4s\n"
- "ldr q22, [x20], #0x10\n"
- "str q13, [x27, #0x0]\n"
- "zip1 v13.4s, v8.4s, v19.4s\n"
- "zip2 v19.4s, v8.4s, v19.4s\n"
- "str q7, [x27, #0x10]\n"
- "zip1 v8.4s, v12.4s, v31.4s\n"
- "zip2 v12.4s, v12.4s, v31.4s\n"
- "str q10, [x27, #0x20]\n"
- "zip1 v7.4s, v15.4s, v22.4s\n"
- "zip2 v15.4s, v15.4s, v22.4s\n"
- "str q14, [x27, #0x30]\n"
- "zip1 v14.4s, v5.4s, v9.4s\n"
- "zip1 v22.4s, v0.4s, v26.4s\n"
- "str q20, [x27, #0x40]\n"
- "zip1 v31.4s, v17.4s, v27.4s\n"
- "zip1 v10.4s, v18.4s, v23.4s\n"
- "str q2, [x27, #0x50]\n"
- "zip1 v20.4s, v1.4s, v29.4s\n"
- "zip1 v2.4s, v8.4s, v7.4s\n"
- "str q30, [x27, #0x60]\n"
- "zip1 v30.4s, v12.4s, v15.4s\n"
- ".inst 0x0ea169ad // bfcvtn v13.4h, v13.4s\n"
- "str q4, [x27, #0x70]\n"
- ".inst 0x0ea169c4 // bfcvtn v4.4h, v14.4s\n"
- "zip2 v9.4s, v5.4s, v9.4s\n"
- "str q24, [x27, #0x80]\n"
- ".inst 0x0ea16ad8 // bfcvtn v24.4h, v22.4s\n"
- "zip2 v26.4s, v0.4s, v26.4s\n"
- "str q6, [x27, #0x90]\n"
- ".inst 0x0ea16bee // bfcvtn v14.4h, v31.4s\n"
- "zip2 v5.4s, v17.4s, v27.4s\n"
- "str q16, [x27, #0xa0]\n"
- ".inst 0x0ea1695b // bfcvtn v27.4h, v10.4s\n"
- "zip2 v17.4s, v18.4s, v23.4s\n"
- "str q21, [x27, #0xb0]\n"
- ".inst 0x0ea16a9f // bfcvtn v31.4h, v20.4s\n"
- "zip2 v23.4s, v1.4s, v29.4s\n"
- "str q3, [x27, #0xc0]\n"
- ".inst 0x0ea16852 // bfcvtn v18.4h, v2.4s\n"
- "zip2 v20.4s, v8.4s, v7.4s\n"
- "str q11, [x27, #0xd0]\n"
- ".inst 0x0ea16bd0 // bfcvtn v16.4h, v30.4s\n"
- "zip2 v15.4s, v12.4s, v15.4s\n"
- "str q28, [x27, #0xe0]\n"
- ".inst 0x4ea16a6d // bfcvtn2 v13.8h, v19.4s\n"
- ".inst 0x4ea16924 // bfcvtn2 v4.8h, v9.4s\n"
- "str q25, [x27, #0xf0]\n"
- ".inst 0x4ea16b58 // bfcvtn2 v24.8h, v26.4s\n"
- ".inst 0x4ea168ae // bfcvtn2 v14.8h, v5.4s\n"
- ".inst 0x4ea16a3b // bfcvtn2 v27.8h, v17.4s\n"
- ".inst 0x4ea16aff // bfcvtn2 v31.8h, v23.4s\n"
- ".inst 0x4ea16a92 // bfcvtn2 v18.8h, v20.4s\n"
- ".inst 0x4ea169f0 // bfcvtn2 v16.8h, v15.4s\n"
- "str q13, [x27, #0x100]\n"
- "str q4, [x27, #0x110]\n"
- "str q24, [x27, #0x120]\n"
- "str q14, [x27, #0x130]\n"
- "str q27, [x27, #0x140]\n"
- "str q31, [x27, #0x150]\n"
- "str q18, [x27, #0x160]\n"
- "str q16, [x27, #0x170]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip2 v24.4s, v24.4s, v0.4s\n"
+ "ldr q0, [x22], #0x10\n"
+ ".inst 0x4ea16b10 // bfcvtn2 v16.8h, v24.4s\n"
+ "ldr q24, [x20], #0x10\n"
+ "str q2, [x21, #0x0]\n"
+ "zip1 v2.4s, v1.4s, v0.4s\n"
+ "zip2 v0.4s, v1.4s, v0.4s\n"
+ "zip1 v1.4s, v28.4s, v24.4s\n"
+ "zip2 v28.4s, v28.4s, v24.4s\n"
+ "str q30, [x21, #0x10]\n"
+ "zip1 v24.4s, v27.4s, v31.4s\n"
+ "zip1 v30.4s, v22.4s, v8.4s\n"
+ "str q3, [x21, #0x20]\n"
+ "zip1 v3.4s, v6.4s, v20.4s\n"
+ ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
+ "str q21, [x21, #0x30]\n"
+ "zip1 v21.4s, v7.4s, v18.4s\n"
+ "zip2 v31.4s, v27.4s, v31.4s\n"
+ "str q17, [x21, #0x40]\n"
+ "zip1 v17.4s, v5.4s, v10.4s\n"
+ "zip1 v27.4s, v9.4s, v15.4s\n"
+ "str q12, [x21, #0x50]\n"
+ "zip1 v12.4s, v2.4s, v1.4s\n"
+ ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
+ "str q13, [x21, #0x60]\n"
+ "zip1 v13.4s, v0.4s, v28.4s\n"
+ "zip2 v22.4s, v22.4s, v8.4s\n"
+ "str q25, [x21, #0x70]\n"
+ ".inst 0x0ea16879 // bfcvtn v25.4h, v3.4s\n"
+ "zip2 v8.4s, v6.4s, v20.4s\n"
+ "str q26, [x21, #0x80]\n"
+ ".inst 0x0ea16aa3 // bfcvtn v3.4h, v21.4s\n"
+ "zip2 v18.4s, v7.4s, v18.4s\n"
+ "str q29, [x21, #0x90]\n"
+ ".inst 0x0ea16a27 // bfcvtn v7.4h, v17.4s\n"
+ "zip2 v21.4s, v5.4s, v10.4s\n"
+ "str q11, [x21, #0xa0]\n"
+ ".inst 0x0ea16b65 // bfcvtn v5.4h, v27.4s\n"
+ "zip2 v15.4s, v9.4s, v15.4s\n"
+ "str q19, [x21, #0xb0]\n"
+ ".inst 0x0ea16991 // bfcvtn v17.4h, v12.4s\n"
+ "zip2 v20.4s, v2.4s, v1.4s\n"
+ "str q14, [x21, #0xc0]\n"
+ ".inst 0x0ea169bb // bfcvtn v27.4h, v13.4s\n"
+ "zip2 v29.4s, v0.4s, v28.4s\n"
+ "str q4, [x21, #0xd0]\n"
+ ".inst 0x4ea16bf8 // bfcvtn2 v24.8h, v31.4s\n"
+ ".inst 0x4ea16ade // bfcvtn2 v30.8h, v22.4s\n"
+ "str q23, [x21, #0xe0]\n"
+ ".inst 0x4ea16919 // bfcvtn2 v25.8h, v8.4s\n"
+ ".inst 0x4ea16a43 // bfcvtn2 v3.8h, v18.4s\n"
+ "str q16, [x21, #0xf0]\n"
+ ".inst 0x4ea16aa7 // bfcvtn2 v7.8h, v21.4s\n"
+ ".inst 0x4ea169e5 // bfcvtn2 v5.8h, v15.4s\n"
+ "str q24, [x21, #0x100]\n"
+ ".inst 0x4ea16a91 // bfcvtn2 v17.8h, v20.4s\n"
+ ".inst 0x4ea16bbb // bfcvtn2 v27.8h, v29.4s\n"
+ "str q30, [x21, #0x110]\n"
+ "str q25, [x21, #0x120]\n"
+ "str q3, [x21, #0x130]\n"
+ "str q7, [x21, #0x140]\n"
+ "str q5, [x21, #0x150]\n"
+ "str q17, [x21, #0x160]\n"
+ "str q27, [x21, #0x170]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Column loop skip
- "cbz x28, 10f\n"
- "cmp x28, #0x10\n"
- "movi v6.16b, #0x0\n"
- "str q6, [x27, #0x0]\n"
- "str q6, [x27, #0x10]\n"
- "str q6, [x27, #0x20]\n"
- "str q6, [x27, #0x30]\n"
- "str q6, [x27, #0x40]\n"
- "str q6, [x27, #0x50]\n"
- "str q6, [x27, #0x60]\n"
- "str q6, [x27, #0x70]\n"
- "str q6, [x27, #0x80]\n"
- "str q6, [x27, #0x90]\n"
- "str q6, [x27, #0xa0]\n"
- "str q6, [x27, #0xb0]\n"
- "str q6, [x27, #0xc0]\n"
- "str q6, [x27, #0xd0]\n"
- "str q6, [x27, #0xe0]\n"
- "str q6, [x27, #0xf0]\n"
- "str q6, [x27, #0x100]\n"
- "str q6, [x27, #0x110]\n"
- "str q6, [x27, #0x120]\n"
- "str q6, [x27, #0x130]\n"
- "str q6, [x27, #0x140]\n"
- "str q6, [x27, #0x150]\n"
- "str q6, [x27, #0x160]\n"
- "str q6, [x27, #0x170]\n"
+ "cmp x24, #0x10\n"
"blt 5f\n"
"4:" // Main row loop: width 16 loop: loop
- "ldr q19, [x9], #0x10\n"
- "ldr q24, [x26], #0x10\n"
- "sub x28, x28, #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "ldr q21, [x24], #0x10\n"
- "cmp x28, #0x10\n"
- "ldr q15, [x23], #0x10\n"
- "ldr q20, [x22], #0x10\n"
- "ldr q30, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "ldr q6, [x9], #0x10\n"
- "ldr q10, [x26], #0x10\n"
- "zip1 v4.4s, v19.4s, v18.4s\n"
- "zip1 v17.4s, v24.4s, v21.4s\n"
- "ldr q13, [x25], #0x10\n"
- "ldr q31, [x24], #0x10\n"
- "zip2 v29.4s, v19.4s, v18.4s\n"
- "zip2 v26.4s, v24.4s, v21.4s\n"
- "ldr q21, [x23], #0x10\n"
- "ldr q25, [x22], #0x10\n"
- "zip1 v18.4s, v15.4s, v30.4s\n"
- "zip1 v7.4s, v20.4s, v16.4s\n"
- "ldr q19, [x21], #0x10\n"
+ "ldr q9, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q15, [x27], #0x10\n"
+ "ldr q17, [x26], #0x10\n"
+ "zip1 v14.4s, v9.4s, v15.4s\n"
+ "zip1 v11.4s, v18.4s, v17.4s\n"
+ "ldr q7, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip2 v12.4s, v9.4s, v15.4s\n"
+ "zip2 v6.4s, v18.4s, v17.4s\n"
+ "ldr q15, [x22], #0x10\n"
"ldr q3, [x20], #0x10\n"
- "zip2 v24.4s, v15.4s, v30.4s\n"
- "zip2 v15.4s, v20.4s, v16.4s\n"
- "ldr q27, [x9], #0x10\n"
- "ldr q9, [x26], #0x10\n"
- "zip1 v2.4s, v6.4s, v13.4s\n"
- "zip1 v30.4s, v10.4s, v31.4s\n"
- "ldr q0, [x25], #0x10\n"
- "ldr q11, [x24], #0x10\n"
- "zip2 v28.4s, v6.4s, v13.4s\n"
- "zip2 v22.4s, v10.4s, v31.4s\n"
- "ldr q8, [x23], #0x10\n"
- "ldr q1, [x22], #0x10\n"
- "zip1 v23.4s, v21.4s, v19.4s\n"
- "zip1 v6.4s, v25.4s, v3.4s\n"
- "ldr q31, [x21], #0x10\n"
- "ldr q5, [x20], #0x10\n"
- "zip2 v20.4s, v21.4s, v19.4s\n"
- "zip2 v21.4s, v25.4s, v3.4s\n"
- "ldr q16, [x9], #0x10\n"
+ "zip1 v30.4s, v7.4s, v15.4s\n"
+ "zip1 v20.4s, v16.4s, v3.4s\n"
+ "ldr q17, [x9], #0x10\n"
+ "ldr q9, [x28], #0x10\n"
+ "zip2 v1.4s, v7.4s, v15.4s\n"
+ "zip2 v24.4s, v16.4s, v3.4s\n"
+ "ldr q10, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v0.4s, v17.4s, v10.4s\n"
+ "zip1 v8.4s, v9.4s, v16.4s\n"
+ "ldr q7, [x25], #0x10\n"
+ "ldr q2, [x23], #0x10\n"
+ "zip2 v17.4s, v17.4s, v10.4s\n"
+ "zip2 v3.4s, v9.4s, v16.4s\n"
+ "ldr q9, [x22], #0x10\n"
+ "ldr q10, [x20], #0x10\n"
+ "zip1 v25.4s, v7.4s, v9.4s\n"
+ "zip1 v23.4s, v2.4s, v10.4s\n"
+ "ldr q31, [x9], #0x10\n"
+ "ldr q21, [x28], #0x10\n"
+ "zip2 v16.4s, v7.4s, v9.4s\n"
+ "zip2 v27.4s, v2.4s, v10.4s\n"
+ "ldr q26, [x27], #0x10\n"
+ "ldr q19, [x26], #0x10\n"
+ "zip1 v2.4s, v31.4s, v26.4s\n"
+ "zip1 v7.4s, v21.4s, v19.4s\n"
+ "ldr q29, [x25], #0x10\n"
+ "ldr q13, [x23], #0x10\n"
+ "zip2 v31.4s, v31.4s, v26.4s\n"
+ "zip2 v19.4s, v21.4s, v19.4s\n"
+ "ldr q4, [x22], #0x10\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v26.4s, v29.4s, v4.4s\n"
+ "zip1 v15.4s, v13.4s, v18.4s\n"
+ "ldr q9, [x9], #0x10\n"
+ "ldr q22, [x28], #0x10\n"
+ "zip2 v4.4s, v29.4s, v4.4s\n"
+ "zip2 v18.4s, v13.4s, v18.4s\n"
+ "ldr q29, [x27], #0x10\n"
"ldr q10, [x26], #0x10\n"
- "zip1 v14.4s, v27.4s, v0.4s\n"
- "zip1 v3.4s, v9.4s, v11.4s\n"
- "ldr q19, [x25], #0x10\n"
- "ldr q13, [x24], #0x10\n"
- "zip2 v25.4s, v27.4s, v0.4s\n"
- "zip2 v12.4s, v9.4s, v11.4s\n"
- "ldr q11, [x23], #0x10\n"
- "ldr q0, [x22], #0x10\n"
- "zip1 v27.4s, v8.4s, v31.4s\n"
- "zip1 v9.4s, v1.4s, v5.4s\n"
- "zip2 v31.4s, v8.4s, v31.4s\n"
- "ldr q8, [x21], #0x10\n"
- "zip2 v5.4s, v1.4s, v5.4s\n"
- "zip1 v1.4s, v16.4s, v19.4s\n"
- "zip2 v19.4s, v16.4s, v19.4s\n"
- "zip1 v16.4s, v10.4s, v13.4s\n"
- "zip2 v13.4s, v10.4s, v13.4s\n"
- "zip1 v10.4s, v11.4s, v8.4s\n"
- "zip2 v11.4s, v11.4s, v8.4s\n"
- "zip1 v8.4s, v4.4s, v17.4s\n"
- "zip2 v17.4s, v4.4s, v17.4s\n"
- "ldr q4, [x20], #0x10\n"
+ "zip1 v21.4s, v9.4s, v29.4s\n"
+ "zip1 v5.4s, v22.4s, v10.4s\n"
+ "ldr q28, [x25], #0x10\n"
+ "ldr q13, [x23], #0x10\n"
+ "zip2 v29.4s, v9.4s, v29.4s\n"
+ "zip2 v9.4s, v22.4s, v10.4s\n"
+ "ldr q22, [x22], #0x10\n"
+ "zip1 v10.4s, v28.4s, v22.4s\n"
+ "zip2 v28.4s, v28.4s, v22.4s\n"
+ "zip1 v22.4s, v14.4s, v11.4s\n"
+ ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
+ "zip2 v11.4s, v14.4s, v11.4s\n"
+ "ldr q14, [x20], #0x10\n"
+ ".inst 0x4ea16976 // bfcvtn2 v22.8h, v11.4s\n"
+ "str q22, [x21, #0x0]\n"
+ "zip1 v22.4s, v13.4s, v14.4s\n"
+ "zip2 v14.4s, v13.4s, v14.4s\n"
+ "zip1 v13.4s, v12.4s, v6.4s\n"
+ "zip1 v11.4s, v0.4s, v8.4s\n"
+ ".inst 0x0ea169ad // bfcvtn v13.4h, v13.4s\n"
+ "zip2 v12.4s, v12.4s, v6.4s\n"
+ "zip1 v6.4s, v17.4s, v3.4s\n"
+ ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
+ "zip2 v0.4s, v0.4s, v8.4s\n"
+ "zip1 v8.4s, v2.4s, v7.4s\n"
+ ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
+ "zip2 v3.4s, v17.4s, v3.4s\n"
+ "zip1 v17.4s, v31.4s, v19.4s\n"
".inst 0x0ea16908 // bfcvtn v8.4h, v8.4s\n"
- ".inst 0x4ea16a28 // bfcvtn2 v8.8h, v17.4s\n"
- "zip1 v17.4s, v29.4s, v26.4s\n"
- "zip2 v29.4s, v29.4s, v26.4s\n"
- "zip1 v26.4s, v0.4s, v4.4s\n"
- "zip2 v0.4s, v0.4s, v4.4s\n"
- "zip1 v4.4s, v2.4s, v30.4s\n"
- "zip2 v2.4s, v2.4s, v30.4s\n"
- "str q8, [x27, #0x0]\n"
- "zip1 v8.4s, v28.4s, v22.4s\n"
- "zip1 v30.4s, v14.4s, v3.4s\n"
+ "zip2 v2.4s, v2.4s, v7.4s\n"
+ "zip1 v7.4s, v21.4s, v5.4s\n"
".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip1 v28.4s, v25.4s, v12.4s\n"
- ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
- ".inst 0x0ea16908 // bfcvtn v8.4h, v8.4s\n"
- ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
- "zip2 v14.4s, v14.4s, v3.4s\n"
- "zip1 v3.4s, v1.4s, v16.4s\n"
- ".inst 0x0ea16b9c // bfcvtn v28.4h, v28.4s\n"
- "zip2 v25.4s, v25.4s, v12.4s\n"
- "zip1 v12.4s, v19.4s, v13.4s\n"
- "zip2 v16.4s, v1.4s, v16.4s\n"
- "zip1 v1.4s, v18.4s, v7.4s\n"
- ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
- "zip2 v13.4s, v19.4s, v13.4s\n"
- "zip1 v19.4s, v24.4s, v15.4s\n"
- ".inst 0x0ea1698c // bfcvtn v12.4h, v12.4s\n"
- "zip2 v7.4s, v18.4s, v7.4s\n"
- "zip1 v18.4s, v23.4s, v6.4s\n"
- ".inst 0x0ea16821 // bfcvtn v1.4h, v1.4s\n"
+ "zip2 v31.4s, v31.4s, v19.4s\n"
+ "zip1 v19.4s, v29.4s, v9.4s\n"
+ ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n"
+ "zip2 v21.4s, v21.4s, v5.4s\n"
+ "zip1 v5.4s, v30.4s, v20.4s\n"
".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
- "zip2 v15.4s, v24.4s, v15.4s\n"
- "zip1 v24.4s, v20.4s, v21.4s\n"
- "zip2 v23.4s, v23.4s, v6.4s\n"
- "zip1 v6.4s, v27.4s, v9.4s\n"
- ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
- "zip2 v21.4s, v20.4s, v21.4s\n"
- "zip1 v20.4s, v31.4s, v5.4s\n"
+ "zip2 v29.4s, v29.4s, v9.4s\n"
+ "zip1 v9.4s, v1.4s, v24.4s\n"
+ ".inst 0x0ea168a5 // bfcvtn v5.4h, v5.4s\n"
+ "zip2 v20.4s, v30.4s, v20.4s\n"
+ "zip1 v30.4s, v25.4s, v23.4s\n"
+ ".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
+ "zip2 v1.4s, v1.4s, v24.4s\n"
+ "zip1 v24.4s, v16.4s, v27.4s\n"
+ ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
+ "zip2 v23.4s, v25.4s, v23.4s\n"
+ "zip1 v25.4s, v26.4s, v15.4s\n"
".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
- "zip2 v27.4s, v27.4s, v9.4s\n"
- "zip1 v9.4s, v10.4s, v26.4s\n"
- ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
- ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
- "zip2 v31.4s, v31.4s, v5.4s\n"
- "zip1 v5.4s, v11.4s, v0.4s\n"
- "zip2 v26.4s, v10.4s, v26.4s\n"
- ".inst 0x0ea1692a // bfcvtn v10.4h, v9.4s\n"
- "zip2 v0.4s, v11.4s, v0.4s\n"
- ".inst 0x4ea16bb1 // bfcvtn2 v17.8h, v29.4s\n"
- ".inst 0x4ea16844 // bfcvtn2 v4.8h, v2.4s\n"
- ".inst 0x0ea168a9 // bfcvtn v9.4h, v5.4s\n"
- ".inst 0x4ea16ac8 // bfcvtn2 v8.8h, v22.4s\n"
- ".inst 0x4ea169de // bfcvtn2 v30.8h, v14.4s\n"
- ".inst 0x4ea16b3c // bfcvtn2 v28.8h, v25.4s\n"
- ".inst 0x4ea16a03 // bfcvtn2 v3.8h, v16.4s\n"
- ".inst 0x4ea169ac // bfcvtn2 v12.8h, v13.4s\n"
- "str q17, [x27, #0x10]\n"
- ".inst 0x4ea168e1 // bfcvtn2 v1.8h, v7.4s\n"
- ".inst 0x4ea169f3 // bfcvtn2 v19.8h, v15.4s\n"
- "str q4, [x27, #0x20]\n"
- ".inst 0x4ea16af2 // bfcvtn2 v18.8h, v23.4s\n"
- ".inst 0x4ea16ab8 // bfcvtn2 v24.8h, v21.4s\n"
- "str q8, [x27, #0x30]\n"
- ".inst 0x4ea16b66 // bfcvtn2 v6.8h, v27.4s\n"
- ".inst 0x4ea16bf4 // bfcvtn2 v20.8h, v31.4s\n"
- "str q30, [x27, #0x40]\n"
- ".inst 0x4ea16b4a // bfcvtn2 v10.8h, v26.4s\n"
- ".inst 0x4ea16809 // bfcvtn2 v9.8h, v0.4s\n"
- "str q28, [x27, #0x50]\n"
- "str q3, [x27, #0x60]\n"
- "str q12, [x27, #0x70]\n"
- "str q1, [x27, #0xc0]\n"
- "str q19, [x27, #0xd0]\n"
- "str q18, [x27, #0xe0]\n"
- "str q24, [x27, #0xf0]\n"
- "str q6, [x27, #0x100]\n"
- "str q20, [x27, #0x110]\n"
- "str q10, [x27, #0x120]\n"
- "str q9, [x27, #0x130]\n"
- "add x27, x27, #0x80\n"
+ "zip2 v27.4s, v16.4s, v27.4s\n"
+ "zip1 v16.4s, v4.4s, v18.4s\n"
+ ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+ "zip2 v15.4s, v26.4s, v15.4s\n"
+ "zip1 v26.4s, v10.4s, v22.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v4.4s, v18.4s\n"
+ "zip1 v4.4s, v28.4s, v14.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "zip2 v10.4s, v10.4s, v22.4s\n"
+ ".inst 0x0ea16896 // bfcvtn v22.4h, v4.4s\n"
+ "zip2 v4.4s, v28.4s, v14.4s\n"
+ ".inst 0x4ea1698d // bfcvtn2 v13.8h, v12.4s\n"
+ "str q13, [x21, #0x10]\n"
+ ".inst 0x4ea1680b // bfcvtn2 v11.8h, v0.4s\n"
+ ".inst 0x4ea16866 // bfcvtn2 v6.8h, v3.4s\n"
+ "str q11, [x21, #0x20]\n"
+ ".inst 0x4ea16848 // bfcvtn2 v8.8h, v2.4s\n"
+ ".inst 0x4ea16bf1 // bfcvtn2 v17.8h, v31.4s\n"
+ "str q6, [x21, #0x30]\n"
+ ".inst 0x4ea16aa7 // bfcvtn2 v7.8h, v21.4s\n"
+ ".inst 0x4ea16bb3 // bfcvtn2 v19.8h, v29.4s\n"
+ "str q8, [x21, #0x40]\n"
+ ".inst 0x4ea16a85 // bfcvtn2 v5.8h, v20.4s\n"
+ ".inst 0x4ea16829 // bfcvtn2 v9.8h, v1.4s\n"
+ "str q17, [x21, #0x50]\n"
+ ".inst 0x4ea16afe // bfcvtn2 v30.8h, v23.4s\n"
+ ".inst 0x4ea16b78 // bfcvtn2 v24.8h, v27.4s\n"
+ "str q7, [x21, #0x60]\n"
+ ".inst 0x4ea169f9 // bfcvtn2 v25.8h, v15.4s\n"
+ ".inst 0x4ea16a50 // bfcvtn2 v16.8h, v18.4s\n"
+ "str q19, [x21, #0x70]\n"
+ ".inst 0x4ea1695a // bfcvtn2 v26.8h, v10.4s\n"
+ ".inst 0x4ea16896 // bfcvtn2 v22.8h, v4.4s\n"
+ "str q5, [x21, #0xc0]\n"
+ "str q9, [x21, #0xd0]\n"
+ "str q30, [x21, #0xe0]\n"
+ "str q24, [x21, #0xf0]\n"
+ "str q25, [x21, #0x100]\n"
+ "str q16, [x21, #0x110]\n"
+ "str q26, [x21, #0x120]\n"
+ "str q22, [x21, #0x130]\n"
+ "add x21, x21, #0x80\n"
"bge 4b\n"
"5:" // Main row loop: width 16 loop: skip
- "cmp x28, #0x4\n"
+ "cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
- "ldr q25, [x9], #0x10\n"
- "ldr q24, [x26], #0x10\n"
- "sub x28, x28, #0x4\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "cmp x28, #0x4\n"
- "ldr q23, [x23], #0x10\n"
- "ldr q19, [x22], #0x10\n"
- "ldr q18, [x21], #0x10\n"
- "ldr q17, [x20], #0x10\n"
- "zip1 v22.4s, v25.4s, v21.4s\n"
- "zip1 v16.4s, v24.4s, v20.4s\n"
- "zip2 v21.4s, v25.4s, v21.4s\n"
- "zip2 v20.4s, v24.4s, v20.4s\n"
- "zip1 v27.4s, v23.4s, v18.4s\n"
- "zip1 v26.4s, v19.4s, v17.4s\n"
- "zip2 v25.4s, v23.4s, v18.4s\n"
- "zip2 v24.4s, v19.4s, v17.4s\n"
- "zip1 v19.4s, v22.4s, v16.4s\n"
- "zip1 v18.4s, v21.4s, v20.4s\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v23.4s, v17.4s\n"
+ "zip1 v21.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v28.4s, v23.4s, v17.4s\n"
+ "zip2 v20.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v27.4s, v19.4s, v17.4s\n"
+ "zip1 v26.4s, v18.4s, v16.4s\n"
+ "zip2 v25.4s, v19.4s, v17.4s\n"
+ "zip2 v24.4s, v18.4s, v16.4s\n"
+ "zip1 v19.4s, v22.4s, v21.4s\n"
+ "zip1 v18.4s, v28.4s, v20.4s\n"
"zip1 v17.4s, v27.4s, v26.4s\n"
- "zip2 v23.4s, v22.4s, v16.4s\n"
"zip1 v16.4s, v25.4s, v24.4s\n"
- "zip2 v22.4s, v21.4s, v20.4s\n"
- ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
- ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v22.4s, v21.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v28.4s, v20.4s\n"
".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
"zip2 v18.4s, v27.4s, v26.4s\n"
".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
"zip2 v16.4s, v25.4s, v24.4s\n"
- ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n"
- ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q23, [x21, #0x0]\n"
".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
- "str q21, [x27, #0x0]\n"
- "str q20, [x27, #0x10]\n"
- "str q19, [x27, #0xc0]\n"
- "str q17, [x27, #0xd0]\n"
- "add x27, x27, #0x20\n"
+ "str q21, [x21, #0x10]\n"
+ "str q19, [x21, #0xc0]\n"
+ "str q17, [x21, #0xd0]\n"
+ "add x21, x21, #0x20\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
- "cmp x28, #0x1\n"
+ "cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
- "ldr s23, [x9], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s17, [x24], #0x4\n"
- "cmp x28, #0x1\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s18, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
- "zip1 v19.4s, v23.4s, v19.4s\n"
- "zip1 v17.4s, v22.4s, v17.4s\n"
- "zip1 v18.4s, v21.4s, v18.4s\n"
- "zip1 v16.4s, v20.4s, v16.4s\n"
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
"zip1 v17.4s, v19.4s, v17.4s\n"
"zip1 v16.4s, v18.4s, v16.4s\n"
- ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.4s, v20.4s, v17.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- "str d17, [x27, #0x0]\n"
- "str d16, [x27, #0xc0]\n"
- "add x27, x27, #0x8\n"
+ "str d18, [x21, #0x0]\n"
+ "str d16, [x21, #0xc0]\n"
+ "add x21, x21, #0x8\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x8\n"
"add %x[out], %x[out], #0x180\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
"mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x27, %x[out]\n"
- "add x26, x9, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "csel x25, x25, %x[pad_row], GE\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add %x[in], x26, %x[in_stride]\n"
"csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
"cmp x20, #0x18\n"
- "blt 14f\n"
- "13:" // Tail row loop: Column loop
- "ldr q24, [x9], #0x10\n"
- "ldr q22, [x26], #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q20, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
"sub x20, x20, #0x18\n"
- "ldr q19, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0x18\n"
- "ldr q26, [x9], #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v9.4s, v20.4s, v17.4s\n"
+ "zip1 v30.4s, v18.4s, v16.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q19, [x28], #0x10\n"
+ "zip2 v17.4s, v20.4s, v17.4s\n"
+ "zip2 v5.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v0.4s, v21.4s, v18.4s\n"
+ "zip1 v3.4s, v19.4s, v16.4s\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v1.4s, v21.4s, v18.4s\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
+ "ldr q19, [x27], #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "zip1 v4.4s, v23.4s, v19.4s\n"
+ "zip1 v2.4s, v20.4s, v18.4s\n"
+ "ldr q22, [x9], #0x10\n"
+ "ldr q21, [x28], #0x10\n"
+ "zip2 v27.4s, v23.4s, v19.4s\n"
+ "zip2 v28.4s, v20.4s, v18.4s\n"
+ "ldr q20, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v25.4s, v22.4s, v20.4s\n"
+ "zip1 v26.4s, v21.4s, v24.4s\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v14.4s, v22.4s, v20.4s\n"
+ "zip2 v12.4s, v21.4s, v24.4s\n"
+ "ldr q31, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v15.4s, v19.4s, v31.4s\n"
+ "zip1 v13.4s, v18.4s, v24.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q11, [x28], #0x10\n"
+ "zip2 v20.4s, v19.4s, v31.4s\n"
+ "zip2 v10.4s, v18.4s, v24.4s\n"
+ "ldr q22, [x27], #0x10\n"
"ldr q23, [x26], #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "ldr q25, [x9], #0x10\n"
- "zip1 v6.4s, v24.4s, v19.4s\n"
- "zip1 v11.4s, v22.4s, v17.4s\n"
- "ldr q21, [x26], #0x10\n"
- "ldr q20, [x25], #0x10\n"
- "zip2 v2.4s, v24.4s, v19.4s\n"
- "zip2 v22.4s, v22.4s, v17.4s\n"
- "ldr q17, [x24], #0x10\n"
- "ldr q7, [x9], #0x10\n"
- "zip1 v19.4s, v26.4s, v18.4s\n"
- "zip1 v5.4s, v23.4s, v16.4s\n"
- "ldr q3, [x26], #0x10\n"
- "ldr q24, [x25], #0x10\n"
- "zip2 v0.4s, v26.4s, v18.4s\n"
- "zip2 v4.4s, v23.4s, v16.4s\n"
- "ldr q16, [x24], #0x10\n"
- "ldr q1, [x9], #0x10\n"
- "zip1 v31.4s, v25.4s, v20.4s\n"
- "zip1 v23.4s, v21.4s, v17.4s\n"
- "ldr q30, [x26], #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "zip2 v29.4s, v25.4s, v20.4s\n"
- "zip2 v28.4s, v21.4s, v17.4s\n"
- "ldr q25, [x24], #0x10\n"
- "ldr q20, [x9], #0x10\n"
- "zip1 v21.4s, v7.4s, v24.4s\n"
- "zip1 v26.4s, v3.4s, v16.4s\n"
- "ldr q17, [x26], #0x10\n"
- "ldr q27, [x25], #0x10\n"
- "zip2 v24.4s, v7.4s, v24.4s\n"
- "zip2 v14.4s, v3.4s, v16.4s\n"
- "ldr q10, [x24], #0x10\n"
- "zip1 v16.4s, v1.4s, v18.4s\n"
- "zip1 v13.4s, v30.4s, v25.4s\n"
- "zip2 v18.4s, v1.4s, v18.4s\n"
- "zip2 v12.4s, v30.4s, v25.4s\n"
- "zip1 v15.4s, v20.4s, v27.4s\n"
- "zip1 v9.4s, v17.4s, v10.4s\n"
- "zip2 v8.4s, v20.4s, v27.4s\n"
- "zip2 v10.4s, v17.4s, v10.4s\n"
- "zip1 v17.4s, v6.4s, v11.4s\n"
- "zip1 v7.4s, v2.4s, v22.4s\n"
- "zip1 v3.4s, v19.4s, v5.4s\n"
- "zip1 v1.4s, v0.4s, v4.4s\n"
- "zip1 v30.4s, v31.4s, v23.4s\n"
- "zip1 v25.4s, v29.4s, v28.4s\n"
- "zip1 v27.4s, v21.4s, v26.4s\n"
- "zip1 v20.4s, v24.4s, v14.4s\n"
- ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
- "zip2 v6.4s, v6.4s, v11.4s\n"
- "zip1 v11.4s, v16.4s, v13.4s\n"
+ "zip1 v19.4s, v21.4s, v22.4s\n"
+ "zip1 v18.4s, v11.4s, v23.4s\n"
+ "zip2 v6.4s, v21.4s, v22.4s\n"
+ "zip2 v11.4s, v11.4s, v23.4s\n"
+ "zip1 v8.4s, v9.4s, v30.4s\n"
+ "zip1 v21.4s, v17.4s, v5.4s\n"
+ "zip1 v7.4s, v0.4s, v3.4s\n"
+ "zip1 v31.4s, v1.4s, v16.4s\n"
+ "zip1 v29.4s, v4.4s, v2.4s\n"
+ "zip1 v22.4s, v27.4s, v28.4s\n"
+ "zip1 v24.4s, v25.4s, v26.4s\n"
+ "zip1 v23.4s, v14.4s, v12.4s\n"
+ ".inst 0x0ea16908 // bfcvtn v8.4h, v8.4s\n"
+ "zip2 v9.4s, v9.4s, v30.4s\n"
+ "zip1 v30.4s, v15.4s, v13.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ "zip2 v5.4s, v17.4s, v5.4s\n"
+ "zip1 v17.4s, v20.4s, v10.4s\n"
".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n"
- "zip2 v22.4s, v2.4s, v22.4s\n"
- "zip1 v2.4s, v18.4s, v12.4s\n"
- ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
- "zip2 v5.4s, v19.4s, v5.4s\n"
- "zip1 v19.4s, v15.4s, v9.4s\n"
- ".inst 0x0ea16821 // bfcvtn v1.4h, v1.4s\n"
- "zip2 v4.4s, v0.4s, v4.4s\n"
- "zip1 v0.4s, v8.4s, v10.4s\n"
- ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
- "zip2 v31.4s, v31.4s, v23.4s\n"
- ".inst 0x0ea16b37 // bfcvtn v23.4h, v25.4s\n"
- "zip2 v29.4s, v29.4s, v28.4s\n"
- ".inst 0x0ea16b7c // bfcvtn v28.4h, v27.4s\n"
- "zip2 v27.4s, v21.4s, v26.4s\n"
- ".inst 0x0ea16a9a // bfcvtn v26.4h, v20.4s\n"
- "zip2 v25.4s, v24.4s, v14.4s\n"
- ".inst 0x0ea16978 // bfcvtn v24.4h, v11.4s\n"
- "zip2 v14.4s, v16.4s, v13.4s\n"
- ".inst 0x0ea16855 // bfcvtn v21.4h, v2.4s\n"
- "zip2 v20.4s, v18.4s, v12.4s\n"
- ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
- "zip2 v18.4s, v15.4s, v9.4s\n"
- ".inst 0x0ea1680b // bfcvtn v11.4h, v0.4s\n"
- "zip2 v16.4s, v8.4s, v10.4s\n"
- ".inst 0x4ea168d1 // bfcvtn2 v17.8h, v6.4s\n"
- ".inst 0x4ea16ac7 // bfcvtn2 v7.8h, v22.4s\n"
- ".inst 0x4ea168a3 // bfcvtn2 v3.8h, v5.4s\n"
- ".inst 0x4ea16881 // bfcvtn2 v1.8h, v4.4s\n"
- ".inst 0x4ea16bfe // bfcvtn2 v30.8h, v31.4s\n"
- ".inst 0x4ea16bb7 // bfcvtn2 v23.8h, v29.4s\n"
- ".inst 0x4ea16b7c // bfcvtn2 v28.8h, v27.4s\n"
- ".inst 0x4ea16b3a // bfcvtn2 v26.8h, v25.4s\n"
- "str q17, [x27, #0x0]\n"
- ".inst 0x4ea169d8 // bfcvtn2 v24.8h, v14.4s\n"
- ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
- "str q7, [x27, #0x10]\n"
- ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
- ".inst 0x4ea16a0b // bfcvtn2 v11.8h, v16.4s\n"
- "str q3, [x27, #0x20]\n"
- "str q1, [x27, #0x30]\n"
- "str q30, [x27, #0x40]\n"
- "str q23, [x27, #0x50]\n"
- "str q28, [x27, #0x60]\n"
- "str q26, [x27, #0x70]\n"
- "str q24, [x27, #0x80]\n"
- "str q21, [x27, #0x90]\n"
- "str q19, [x27, #0xa0]\n"
- "str q11, [x27, #0xb0]\n"
- "add x27, x27, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "zip2 v0.4s, v0.4s, v3.4s\n"
+ "zip1 v3.4s, v19.4s, v18.4s\n"
+ ".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n"
+ "zip2 v16.4s, v1.4s, v16.4s\n"
+ "zip1 v1.4s, v6.4s, v11.4s\n"
+ ".inst 0x0ea16bbd // bfcvtn v29.4h, v29.4s\n"
+ "zip2 v2.4s, v4.4s, v2.4s\n"
+ ".inst 0x0ea16ac4 // bfcvtn v4.4h, v22.4s\n"
+ "zip2 v27.4s, v27.4s, v28.4s\n"
+ ".inst 0x0ea16b1c // bfcvtn v28.4h, v24.4s\n"
+ "zip2 v25.4s, v25.4s, v26.4s\n"
+ ".inst 0x0ea16afa // bfcvtn v26.4h, v23.4s\n"
+ "zip2 v14.4s, v14.4s, v12.4s\n"
+ ".inst 0x0ea16bd8 // bfcvtn v24.4h, v30.4s\n"
+ "zip2 v13.4s, v15.4s, v13.4s\n"
+ ".inst 0x0ea16a2f // bfcvtn v15.4h, v17.4s\n"
+ "zip2 v12.4s, v20.4s, v10.4s\n"
+ ".inst 0x0ea16874 // bfcvtn v20.4h, v3.4s\n"
+ "zip2 v10.4s, v19.4s, v18.4s\n"
+ ".inst 0x0ea16831 // bfcvtn v17.4h, v1.4s\n"
+ "zip2 v18.4s, v6.4s, v11.4s\n"
+ ".inst 0x4ea16928 // bfcvtn2 v8.8h, v9.4s\n"
+ ".inst 0x4ea168b5 // bfcvtn2 v21.8h, v5.4s\n"
+ "str q8, [x21, #0x0]\n"
+ ".inst 0x4ea16807 // bfcvtn2 v7.8h, v0.4s\n"
+ ".inst 0x4ea16a1f // bfcvtn2 v31.8h, v16.4s\n"
+ "str q21, [x21, #0x10]\n"
+ ".inst 0x4ea1685d // bfcvtn2 v29.8h, v2.4s\n"
+ ".inst 0x4ea16b64 // bfcvtn2 v4.8h, v27.4s\n"
+ "str q7, [x21, #0x20]\n"
+ ".inst 0x4ea16b3c // bfcvtn2 v28.8h, v25.4s\n"
+ ".inst 0x4ea169da // bfcvtn2 v26.8h, v14.4s\n"
+ "str q31, [x21, #0x30]\n"
+ ".inst 0x4ea169b8 // bfcvtn2 v24.8h, v13.4s\n"
+ ".inst 0x4ea1698f // bfcvtn2 v15.8h, v12.4s\n"
+ "str q29, [x21, #0x40]\n"
+ ".inst 0x4ea16954 // bfcvtn2 v20.8h, v10.4s\n"
+ ".inst 0x4ea16a51 // bfcvtn2 v17.8h, v18.4s\n"
+ "str q4, [x21, #0x50]\n"
+ "str q28, [x21, #0x60]\n"
+ "str q26, [x21, #0x70]\n"
+ "str q24, [x21, #0x80]\n"
+ "str q15, [x21, #0x90]\n"
+ "str q20, [x21, #0xa0]\n"
+ "str q17, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
"cmp x20, #0x10\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "str q16, [x27, #0x20]\n"
- "str q16, [x27, #0x30]\n"
- "str q16, [x27, #0x40]\n"
- "str q16, [x27, #0x50]\n"
- "str q16, [x27, #0x60]\n"
- "str q16, [x27, #0x70]\n"
- "str q16, [x27, #0x80]\n"
- "str q16, [x27, #0x90]\n"
- "str q16, [x27, #0xa0]\n"
- "str q16, [x27, #0xb0]\n"
- "blt 16f\n"
- "15:" // Tail row loop: width 16 loop: loop
- "ldr q20, [x9], #0x10\n"
- "ldr q19, [x26], #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
"sub x20, x20, #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0x10\n"
- "ldr q0, [x9], #0x10\n"
- "ldr q31, [x26], #0x10\n"
- "ldr q24, [x25], #0x10\n"
- "ldr q16, [x24], #0x10\n"
- "ldr q23, [x9], #0x10\n"
- "zip1 v30.4s, v20.4s, v18.4s\n"
- "zip1 v29.4s, v19.4s, v17.4s\n"
- "ldr q22, [x26], #0x10\n"
- "ldr q21, [x25], #0x10\n"
- "zip2 v28.4s, v20.4s, v18.4s\n"
- "zip2 v27.4s, v19.4s, v17.4s\n"
- "ldr q20, [x24], #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v30.4s, v21.4s, v17.4s\n"
+ "zip1 v29.4s, v20.4s, v16.4s\n"
"ldr q19, [x9], #0x10\n"
- "zip1 v26.4s, v0.4s, v24.4s\n"
- "zip1 v25.4s, v31.4s, v16.4s\n"
- "ldr q18, [x26], #0x10\n"
- "ldr q17, [x25], #0x10\n"
- "zip2 v8.4s, v0.4s, v24.4s\n"
- "zip2 v24.4s, v31.4s, v16.4s\n"
- "ldr q16, [x24], #0x10\n"
- "zip1 v7.4s, v23.4s, v21.4s\n"
- "zip1 v6.4s, v22.4s, v20.4s\n"
- "zip2 v5.4s, v23.4s, v21.4s\n"
- "zip2 v4.4s, v22.4s, v20.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v28.4s, v21.4s, v17.4s\n"
+ "zip2 v27.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v26.4s, v19.4s, v17.4s\n"
+ "zip1 v25.4s, v18.4s, v16.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v8.4s, v19.4s, v17.4s\n"
+ "zip2 v24.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v7.4s, v21.4s, v17.4s\n"
+ "zip1 v6.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v5.4s, v21.4s, v17.4s\n"
+ "zip2 v4.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
"zip1 v3.4s, v19.4s, v17.4s\n"
"zip1 v2.4s, v18.4s, v16.4s\n"
"zip2 v1.4s, v19.4s, v17.4s\n"
@@ -739,71 +698,70 @@ void a64_transpose_interleave_24_2x4_fp32bf16(bfloat16 *out, const float *in, si
"zip2 v16.4s, v1.4s, v0.4s\n"
".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n"
".inst 0x4ea16b9d // bfcvtn2 v29.8h, v28.4s\n"
+ "str q31, [x21, #0x0]\n"
".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
+ "str q29, [x21, #0x10]\n"
".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q27, [x21, #0x20]\n"
".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
- "str q31, [x27, #0x0]\n"
- "str q29, [x27, #0x10]\n"
- "str q27, [x27, #0x20]\n"
- "str q25, [x27, #0x30]\n"
- "str q23, [x27, #0x40]\n"
- "str q21, [x27, #0x50]\n"
- "str q19, [x27, #0x60]\n"
- "str q17, [x27, #0x70]\n"
- "add x27, x27, #0x80\n"
- "bge 15b\n"
- "16:" // Tail row loop: width 16 loop: skip
+ "str q25, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q21, [x21, #0x50]\n"
+ "str q19, [x21, #0x60]\n"
+ "str q17, [x21, #0x70]\n"
+ "add x21, x21, #0x80\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
"cmp x20, #0x4\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
- "ldr q21, [x9], #0x10\n"
- "ldr q20, [x26], #0x10\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr q20, [x9], #0x10\n"
+ "ldr q19, [x28], #0x10\n"
"sub x20, x20, #0x4\n"
- "ldr q19, [x25], #0x10\n"
- "ldr q17, [x24], #0x10\n"
"cmp x20, #0x4\n"
- "zip1 v18.4s, v21.4s, v19.4s\n"
- "zip1 v16.4s, v20.4s, v17.4s\n"
- "zip2 v21.4s, v21.4s, v19.4s\n"
- "zip2 v20.4s, v20.4s, v17.4s\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "zip2 v19.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v20.4s, v17.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "zip2 v21.4s, v20.4s, v17.4s\n"
+ "zip2 v20.4s, v19.4s, v16.4s\n"
+ "zip1 v17.4s, v22.4s, v18.4s\n"
"zip1 v16.4s, v21.4s, v20.4s\n"
- ".inst 0x0ea16a32 // bfcvtn v18.4h, v17.4s\n"
- "zip2 v17.4s, v21.4s, v20.4s\n"
- ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
- ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
- "str q18, [x27, #0x0]\n"
- "str q16, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v22.4s, v18.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q19, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr s19, [x9], #0x4\n"
- "ldr s18, [x26], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
"sub x20, x20, #0x1\n"
- "ldr s17, [x25], #0x4\n"
- "ldr s16, [x24], #0x4\n"
"cmp x20, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
"zip1 v17.4s, v19.4s, v17.4s\n"
"zip1 v16.4s, v18.4s, v16.4s\n"
"zip1 v16.4s, v17.4s, v16.4s\n"
".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
- "str d16, [x27, #0x0]\n"
- "add x27, x27, #0x8\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0xc0\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
index 94c442b772..e2884ef80b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
@@ -34,266 +34,245 @@ void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
"mov x24, %x[width]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x25, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
"cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
"sub x24, x24, #0x18\n"
- "ldr q17, [x21], #0x10\n"
- "ldr q27, [x20], #0x10\n"
+ "shll v26.4s, v18.4h, #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "ldr q25, [x20], #0x10\n"
+ "shll2 v24.4s, v18.8h, #0x10\n"
+ "shll v5.4s, v17.4h, #0x10\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "shll v21.4s, v23.4h, #0x10\n"
+ "shll2 v4.4s, v17.8h, #0x10\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q3, [x20], #0x10\n"
+ "shll v2.4s, v22.4h, #0x10\n"
+ "shll v1.4s, v16.4h, #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "shll2 v0.4s, v16.8h, #0x10\n"
+ "shll v31.4s, v20.4h, #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "shll v30.4s, v25.4h, #0x10\n"
+ "shll2 v29.4s, v25.8h, #0x10\n"
+ "shll v28.4s, v3.4h, #0x10\n"
+ "str q26, [x21, #0x0]\n"
"cmp x24, #0x18\n"
- "ldr q26, [x25], #0x10\n"
- "ldr q3, [x22], #0x10\n"
- "ldr q2, [x21], #0x10\n"
- "shll v16.4s, v19.4h, #0x10\n"
+ "shll2 v27.4s, v23.8h, #0x10\n"
+ "str q24, [x21, #0x10]\n"
+ "shll v26.4s, v19.4h, #0x10\n"
"shll2 v25.4s, v19.8h, #0x10\n"
- "ldr q1, [x20], #0x10\n"
- "ldr q24, [x25], #0x10\n"
+ "str q21, [x21, #0x20]\n"
+ "shll2 v24.4s, v22.8h, #0x10\n"
"shll v23.4s, v18.4h, #0x10\n"
+ "str q5, [x21, #0x30]\n"
"shll2 v22.4s, v18.8h, #0x10\n"
- "ldr q21, [x22], #0x10\n"
- "ldr q0, [x21], #0x10\n"
- "shll v20.4s, v26.4h, #0x10\n"
- "shll v19.4s, v3.4h, #0x10\n"
- "ldr q31, [x20], #0x10\n"
- "shll v18.4s, v17.4h, #0x10\n"
- "shll2 v17.4s, v17.8h, #0x10\n"
- "str q16, [x23, #0x0]\n"
- "shll v16.4s, v2.4h, #0x10\n"
- "shll v30.4s, v27.4h, #0x10\n"
- "str q25, [x23, #0x10]\n"
- "shll2 v29.4s, v27.8h, #0x10\n"
- "shll v28.4s, v1.4h, #0x10\n"
- "str q20, [x23, #0x20]\n"
- "str q23, [x23, #0x30]\n"
- "shll2 v27.4s, v26.8h, #0x10\n"
- "shll v26.4s, v24.4h, #0x10\n"
- "str q22, [x23, #0x40]\n"
- "shll2 v25.4s, v24.8h, #0x10\n"
- "shll2 v24.4s, v3.8h, #0x10\n"
- "str q19, [x23, #0x50]\n"
- "shll v23.4s, v21.4h, #0x10\n"
- "shll2 v22.4s, v21.8h, #0x10\n"
- "str q18, [x23, #0x60]\n"
- "shll2 v21.4s, v2.8h, #0x10\n"
- "shll v20.4s, v0.4h, #0x10\n"
- "str q17, [x23, #0x70]\n"
- "shll2 v19.4s, v0.8h, #0x10\n"
- "shll2 v18.4s, v1.8h, #0x10\n"
- "str q16, [x23, #0x80]\n"
- "shll v17.4s, v31.4h, #0x10\n"
- "shll2 v16.4s, v31.8h, #0x10\n"
- "str q30, [x23, #0x90]\n"
- "str q29, [x23, #0xa0]\n"
- "str q28, [x23, #0xb0]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q27, [x23, #0x0]\n"
- "str q26, [x23, #0x10]\n"
- "str q25, [x23, #0x20]\n"
- "str q24, [x23, #0x30]\n"
- "str q23, [x23, #0x40]\n"
- "str q22, [x23, #0x50]\n"
- "str q21, [x23, #0x60]\n"
- "str q20, [x23, #0x70]\n"
- "str q19, [x23, #0x80]\n"
- "str q18, [x23, #0x90]\n"
- "str q17, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
- "add x23, x23, %x[out_stride]\n"
+ "shll2 v21.4s, v20.8h, #0x10\n"
+ "str q4, [x21, #0x40]\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "shll2 v19.4s, v17.8h, #0x10\n"
+ "str q2, [x21, #0x50]\n"
+ "shll2 v18.4s, v3.8h, #0x10\n"
+ "shll v17.4s, v16.4h, #0x10\n"
+ "str q1, [x21, #0x60]\n"
+ "shll2 v16.4s, v16.8h, #0x10\n"
+ "str q0, [x21, #0x70]\n"
+ "str q31, [x21, #0x80]\n"
+ "str q30, [x21, #0x90]\n"
+ "str q29, [x21, #0xa0]\n"
+ "str q28, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q27, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
"cmp x24, #0xc\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
"ldr q16, [x25], #0x10\n"
- "ldr q22, [x22], #0x10\n"
+ "ldr q21, [x23], #0x10\n"
"sub x24, x24, #0xc\n"
- "ldr q27, [x21], #0x10\n"
- "ldr q26, [x20], #0x10\n"
"cmp x24, #0xc\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "shll v18.4s, v16.4h, #0x10\n"
- "shll2 v17.4s, v16.8h, #0x10\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q27, [x20], #0x10\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "shll2 v26.4s, v16.8h, #0x10\n"
+ "ldr d16, [x25], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "shll v25.4s, v16.4h, #0x10\n"
+ "shll v24.4s, v21.4h, #0x10\n"
+ "ldr d17, [x22], #0x8\n"
"ldr d16, [x20], #0x8\n"
- "shll v25.4s, v22.4h, #0x10\n"
- "shll2 v24.4s, v22.8h, #0x10\n"
- "shll v23.4s, v21.4h, #0x10\n"
- "shll v22.4s, v20.4h, #0x10\n"
- "shll v21.4s, v27.4h, #0x10\n"
- "shll2 v20.4s, v27.8h, #0x10\n"
- "str q18, [x23, #0x0]\n"
- "shll v19.4s, v19.4h, #0x10\n"
- "shll v18.4s, v26.4h, #0x10\n"
- "str q17, [x23, #0x10]\n"
- "shll2 v17.4s, v26.8h, #0x10\n"
+ "shll2 v23.4s, v21.8h, #0x10\n"
+ "shll v22.4s, v18.4h, #0x10\n"
+ "shll v21.4s, v20.4h, #0x10\n"
+ "shll2 v20.4s, v20.8h, #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "shll v19.4s, v17.4h, #0x10\n"
+ "shll v18.4s, v27.4h, #0x10\n"
+ "str q26, [x21, #0x10]\n"
+ "shll2 v17.4s, v27.8h, #0x10\n"
"shll v16.4s, v16.4h, #0x10\n"
- "str q23, [x23, #0x20]\n"
- "str q25, [x23, #0x30]\n"
- "str q24, [x23, #0x40]\n"
- "str q22, [x23, #0x50]\n"
- "str q21, [x23, #0x60]\n"
- "str q20, [x23, #0x70]\n"
- "str q19, [x23, #0x80]\n"
- "str q18, [x23, #0x90]\n"
- "str q17, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
- "add x23, x23, %x[out_stride]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x24, 10f\n"
"cmp x24, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "str q16, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "str q16, [x23, #0x80]\n"
- "str q16, [x23, #0x90]\n"
- "str q16, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
"ldr d19, [x25], #0x8\n"
- "ldr d18, [x22], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"sub x24, x24, #0x4\n"
- "ldr d17, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
"cmp x24, #0x4\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"shll v19.4s, v19.4h, #0x10\n"
"shll v18.4s, v18.4h, #0x10\n"
"shll v17.4s, v17.4h, #0x10\n"
"shll v16.4s, v16.4h, #0x10\n"
- "str q19, [x23, #0x0]\n"
- "str q18, [x23, #0x30]\n"
- "str q17, [x23, #0x60]\n"
- "str q16, [x23, #0x90]\n"
- "add x23, x23, #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x90]\n"
+ "add x21, x21, #0x10\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
"cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
"ldr h19, [x25], #0x2\n"
- "ldr h18, [x22], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
"sub x24, x24, #0x1\n"
- "ldr h17, [x21], #0x2\n"
- "ldr h16, [x20], #0x2\n"
"cmp x24, #0x1\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
"shll v19.4s, v19.4h, #0x10\n"
"shll v18.4s, v18.4h, #0x10\n"
"shll v17.4s, v17.4h, #0x10\n"
"shll v16.4s, v16.4h, #0x10\n"
- "str s19, [x23, #0x0]\n"
- "str s18, [x23, #0x30]\n"
- "str s17, [x23, #0x60]\n"
- "str s16, [x23, #0x90]\n"
- "add x23, x23, #0x4\n"
+ "str s19, [x21, #0x0]\n"
+ "str s18, [x21, #0x30]\n"
+ "str s17, [x21, #0x60]\n"
+ "str s16, [x21, #0x90]\n"
+ "add x21, x21, #0x4\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x4\n"
"add %x[out], %x[out], #0xc0\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x20, #0x18\n"
"add %x[in], x25, %x[in_stride]\n"
- "blt 14f\n"
- "13:" // Tail row loop: Unroll column loop
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
"ldr q16, [x25], #0x10\n"
- "sub x20, x20, #0x18\n"
- "ldr q18, [x25], #0x10\n"
"ldr q20, [x25], #0x10\n"
+ "sub x20, x20, #0x18\n"
+ "shll v18.4s, v16.4h, #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "shll2 v17.4s, v16.8h, #0x10\n"
+ "shll v16.4s, v20.4h, #0x10\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
"cmp x20, #0x18\n"
- "shll v17.4s, v16.4h, #0x10\n"
- "shll2 v16.4s, v16.8h, #0x10\n"
- "shll v19.4s, v18.4h, #0x10\n"
- "shll2 v18.4s, v18.8h, #0x10\n"
- "str q17, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "shll v17.4s, v20.4h, #0x10\n"
- "shll2 v16.4s, v20.8h, #0x10\n"
- "str q19, [x23, #0x20]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q18, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Unroll column loop skip
+ "shll2 v18.4s, v20.8h, #0x10\n"
+ "shll v17.4s, v19.4h, #0x10\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "shll2 v16.4s, v19.8h, #0x10\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
"cmp x20, #0xc\n"
- "blt 16f\n"
- "15:" // Tail row loop: Column loop
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
"ldr q17, [x25], #0x10\n"
+ "ldr d18, [x25], #0x8\n"
"sub x20, x20, #0xc\n"
- "ldr d16, [x25], #0x8\n"
"cmp x20, #0xc\n"
- "shll v18.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v17.4h, #0x10\n"
"shll2 v17.4s, v17.8h, #0x10\n"
- "shll v16.4s, v16.4h, #0x10\n"
- "str q18, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q16, [x21, #0x0]\n"
+ "shll v16.4s, v18.4h, #0x10\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr d16, [x25], #0x8\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
"shll v16.4s, v16.4h, #0x10\n"
- "str q16, [x23, #0x0]\n"
- "add x23, x23, #0x10\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr h16, [x25], #0x2\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
"shll v16.4s, v16.4h, #0x10\n"
- "str s16, [x23, #0x0]\n"
- "add x23, x23, #0x4\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x30\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
index 4d106f4e6d..5e64f812e1 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
@@ -34,266 +34,245 @@ void a64_transpose_interleave_24_fp16fp32(float *out, const __fp16 *in, size_t w
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
"mov x24, %x[width]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x25, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
"cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
"sub x24, x24, #0x18\n"
- "ldr q17, [x21], #0x10\n"
- "ldr q27, [x20], #0x10\n"
+ "fcvtl v26.4s, v18.4h\n"
+ "ldr q16, [x22], #0x10\n"
+ "ldr q25, [x20], #0x10\n"
+ "fcvtl2 v24.4s, v18.8h\n"
+ "fcvtl v5.4s, v17.4h\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "fcvtl v21.4s, v23.4h\n"
+ "fcvtl2 v4.4s, v17.8h\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q3, [x20], #0x10\n"
+ "fcvtl v2.4s, v22.4h\n"
+ "fcvtl v1.4s, v16.4h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "fcvtl2 v0.4s, v16.8h\n"
+ "fcvtl v31.4s, v20.4h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "fcvtl v30.4s, v25.4h\n"
+ "fcvtl2 v29.4s, v25.8h\n"
+ "fcvtl v28.4s, v3.4h\n"
+ "str q26, [x21, #0x0]\n"
"cmp x24, #0x18\n"
- "ldr q26, [x25], #0x10\n"
- "ldr q3, [x22], #0x10\n"
- "ldr q2, [x21], #0x10\n"
- "fcvtl v16.4s, v19.4h\n"
+ "fcvtl2 v27.4s, v23.8h\n"
+ "str q24, [x21, #0x10]\n"
+ "fcvtl v26.4s, v19.4h\n"
"fcvtl2 v25.4s, v19.8h\n"
- "ldr q1, [x20], #0x10\n"
- "ldr q24, [x25], #0x10\n"
+ "str q21, [x21, #0x20]\n"
+ "fcvtl2 v24.4s, v22.8h\n"
"fcvtl v23.4s, v18.4h\n"
+ "str q5, [x21, #0x30]\n"
"fcvtl2 v22.4s, v18.8h\n"
- "ldr q21, [x22], #0x10\n"
- "ldr q0, [x21], #0x10\n"
- "fcvtl v20.4s, v26.4h\n"
- "fcvtl v19.4s, v3.4h\n"
- "ldr q31, [x20], #0x10\n"
- "fcvtl v18.4s, v17.4h\n"
- "fcvtl2 v17.4s, v17.8h\n"
- "str q16, [x23, #0x0]\n"
- "fcvtl v16.4s, v2.4h\n"
- "fcvtl v30.4s, v27.4h\n"
- "str q25, [x23, #0x10]\n"
- "fcvtl2 v29.4s, v27.8h\n"
- "fcvtl v28.4s, v1.4h\n"
- "str q20, [x23, #0x20]\n"
- "str q23, [x23, #0x30]\n"
- "fcvtl2 v27.4s, v26.8h\n"
- "fcvtl v26.4s, v24.4h\n"
- "str q22, [x23, #0x40]\n"
- "fcvtl2 v25.4s, v24.8h\n"
- "fcvtl2 v24.4s, v3.8h\n"
- "str q19, [x23, #0x50]\n"
- "fcvtl v23.4s, v21.4h\n"
- "fcvtl2 v22.4s, v21.8h\n"
- "str q18, [x23, #0x60]\n"
- "fcvtl2 v21.4s, v2.8h\n"
- "fcvtl v20.4s, v0.4h\n"
- "str q17, [x23, #0x70]\n"
- "fcvtl2 v19.4s, v0.8h\n"
- "fcvtl2 v18.4s, v1.8h\n"
- "str q16, [x23, #0x80]\n"
- "fcvtl v17.4s, v31.4h\n"
- "fcvtl2 v16.4s, v31.8h\n"
- "str q30, [x23, #0x90]\n"
- "str q29, [x23, #0xa0]\n"
- "str q28, [x23, #0xb0]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q27, [x23, #0x0]\n"
- "str q26, [x23, #0x10]\n"
- "str q25, [x23, #0x20]\n"
- "str q24, [x23, #0x30]\n"
- "str q23, [x23, #0x40]\n"
- "str q22, [x23, #0x50]\n"
- "str q21, [x23, #0x60]\n"
- "str q20, [x23, #0x70]\n"
- "str q19, [x23, #0x80]\n"
- "str q18, [x23, #0x90]\n"
- "str q17, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
- "add x23, x23, %x[out_stride]\n"
+ "fcvtl2 v21.4s, v20.8h\n"
+ "str q4, [x21, #0x40]\n"
+ "fcvtl v20.4s, v17.4h\n"
+ "fcvtl2 v19.4s, v17.8h\n"
+ "str q2, [x21, #0x50]\n"
+ "fcvtl2 v18.4s, v3.8h\n"
+ "fcvtl v17.4s, v16.4h\n"
+ "str q1, [x21, #0x60]\n"
+ "fcvtl2 v16.4s, v16.8h\n"
+ "str q0, [x21, #0x70]\n"
+ "str q31, [x21, #0x80]\n"
+ "str q30, [x21, #0x90]\n"
+ "str q29, [x21, #0xa0]\n"
+ "str q28, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q27, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
"cmp x24, #0xc\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
"ldr q16, [x25], #0x10\n"
- "ldr q22, [x22], #0x10\n"
+ "ldr q21, [x23], #0x10\n"
"sub x24, x24, #0xc\n"
- "ldr q27, [x21], #0x10\n"
- "ldr q26, [x20], #0x10\n"
"cmp x24, #0xc\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "fcvtl v18.4s, v16.4h\n"
- "fcvtl2 v17.4s, v16.8h\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q27, [x20], #0x10\n"
+ "fcvtl v19.4s, v16.4h\n"
+ "fcvtl2 v26.4s, v16.8h\n"
+ "ldr d16, [x25], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "fcvtl v25.4s, v16.4h\n"
+ "fcvtl v24.4s, v21.4h\n"
+ "ldr d17, [x22], #0x8\n"
"ldr d16, [x20], #0x8\n"
- "fcvtl v25.4s, v22.4h\n"
- "fcvtl2 v24.4s, v22.8h\n"
- "fcvtl v23.4s, v21.4h\n"
- "fcvtl v22.4s, v20.4h\n"
- "fcvtl v21.4s, v27.4h\n"
- "fcvtl2 v20.4s, v27.8h\n"
- "str q18, [x23, #0x0]\n"
- "fcvtl v19.4s, v19.4h\n"
- "fcvtl v18.4s, v26.4h\n"
- "str q17, [x23, #0x10]\n"
- "fcvtl2 v17.4s, v26.8h\n"
+ "fcvtl2 v23.4s, v21.8h\n"
+ "fcvtl v22.4s, v18.4h\n"
+ "fcvtl v21.4s, v20.4h\n"
+ "fcvtl2 v20.4s, v20.8h\n"
+ "str q19, [x21, #0x0]\n"
+ "fcvtl v19.4s, v17.4h\n"
+ "fcvtl v18.4s, v27.4h\n"
+ "str q26, [x21, #0x10]\n"
+ "fcvtl2 v17.4s, v27.8h\n"
"fcvtl v16.4s, v16.4h\n"
- "str q23, [x23, #0x20]\n"
- "str q25, [x23, #0x30]\n"
- "str q24, [x23, #0x40]\n"
- "str q22, [x23, #0x50]\n"
- "str q21, [x23, #0x60]\n"
- "str q20, [x23, #0x70]\n"
- "str q19, [x23, #0x80]\n"
- "str q18, [x23, #0x90]\n"
- "str q17, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
- "add x23, x23, %x[out_stride]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x24, 10f\n"
"cmp x24, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "str q16, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "str q16, [x23, #0x80]\n"
- "str q16, [x23, #0x90]\n"
- "str q16, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
"ldr d19, [x25], #0x8\n"
- "ldr d18, [x22], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"sub x24, x24, #0x4\n"
- "ldr d17, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
"cmp x24, #0x4\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"fcvtl v19.4s, v19.4h\n"
"fcvtl v18.4s, v18.4h\n"
"fcvtl v17.4s, v17.4h\n"
"fcvtl v16.4s, v16.4h\n"
- "str q19, [x23, #0x0]\n"
- "str q18, [x23, #0x30]\n"
- "str q17, [x23, #0x60]\n"
- "str q16, [x23, #0x90]\n"
- "add x23, x23, #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x90]\n"
+ "add x21, x21, #0x10\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
"cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
"ldr h19, [x25], #0x2\n"
- "ldr h18, [x22], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
"sub x24, x24, #0x1\n"
- "ldr h17, [x21], #0x2\n"
- "ldr h16, [x20], #0x2\n"
"cmp x24, #0x1\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
"fcvtl v19.4s, v19.4h\n"
"fcvtl v18.4s, v18.4h\n"
"fcvtl v17.4s, v17.4h\n"
"fcvtl v16.4s, v16.4h\n"
- "str s19, [x23, #0x0]\n"
- "str s18, [x23, #0x30]\n"
- "str s17, [x23, #0x60]\n"
- "str s16, [x23, #0x90]\n"
- "add x23, x23, #0x4\n"
+ "str s19, [x21, #0x0]\n"
+ "str s18, [x21, #0x30]\n"
+ "str s17, [x21, #0x60]\n"
+ "str s16, [x21, #0x90]\n"
+ "add x21, x21, #0x4\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x4\n"
"add %x[out], %x[out], #0xc0\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x20, #0x18\n"
"add %x[in], x25, %x[in_stride]\n"
- "blt 14f\n"
- "13:" // Tail row loop: Unroll column loop
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
"ldr q16, [x25], #0x10\n"
- "sub x20, x20, #0x18\n"
- "ldr q18, [x25], #0x10\n"
"ldr q20, [x25], #0x10\n"
+ "sub x20, x20, #0x18\n"
+ "fcvtl v18.4s, v16.4h\n"
+ "ldr q19, [x25], #0x10\n"
+ "fcvtl2 v17.4s, v16.8h\n"
+ "fcvtl v16.4s, v20.4h\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
"cmp x20, #0x18\n"
- "fcvtl v17.4s, v16.4h\n"
- "fcvtl2 v16.4s, v16.8h\n"
- "fcvtl v19.4s, v18.4h\n"
- "fcvtl2 v18.4s, v18.8h\n"
- "str q17, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "fcvtl v17.4s, v20.4h\n"
- "fcvtl2 v16.4s, v20.8h\n"
- "str q19, [x23, #0x20]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q18, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Unroll column loop skip
+ "fcvtl2 v18.4s, v20.8h\n"
+ "fcvtl v17.4s, v19.4h\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "fcvtl2 v16.4s, v19.8h\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
"cmp x20, #0xc\n"
- "blt 16f\n"
- "15:" // Tail row loop: Column loop
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
"ldr q17, [x25], #0x10\n"
+ "ldr d18, [x25], #0x8\n"
"sub x20, x20, #0xc\n"
- "ldr d16, [x25], #0x8\n"
"cmp x20, #0xc\n"
- "fcvtl v18.4s, v17.4h\n"
+ "fcvtl v16.4s, v17.4h\n"
"fcvtl2 v17.4s, v17.8h\n"
- "fcvtl v16.4s, v16.4h\n"
- "str q18, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q16, [x21, #0x0]\n"
+ "fcvtl v16.4s, v18.4h\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
"cmp x20, #0x4\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr d16, [x25], #0x8\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
"fcvtl v16.4s, v16.4h\n"
- "str q16, [x23, #0x0]\n"
- "add x23, x23, #0x10\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr h16, [x25], #0x2\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
"fcvtl v16.4s, v16.4h\n"
- "str s16, [x23, #0x0]\n"
- "add x23, x23, #0x4\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x30\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
index 8c16e5ba46..918d3ffaa3 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
@@ -40,16 +40,14 @@ void a64_transpose_interleave_32_1x4(uint8_t *out, const uint8_t *in, size_t wid
__asm__ __volatile__(
"cmp %x[height], #0x10\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x17, %x[in]\n"
- "mov x16, %x[width]\n"
- "mov x15, %x[out]\n"
- "sub %x[height], %x[height], #0x10\n"
- "add x14, x17, %x[in_stride]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "add x14, x15, %x[in_stride]\n"
"add x13, x14, %x[in_stride]\n"
"add x12, x13, %x[in_stride]\n"
- "cmp x16, #0x20\n"
"add x11, x12, %x[in_stride]\n"
"add x10, x11, %x[in_stride]\n"
"add x9, x10, %x[in_stride]\n"
@@ -57,464 +55,420 @@ void a64_transpose_interleave_32_1x4(uint8_t *out, const uint8_t *in, size_t wid
"add x27, x28, %x[in_stride]\n"
"add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x20\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x10\n"
"blt 3f\n"
"2:" // Main row loop: Column loop
- "ldr q28, [x17], #0x10\n"
- "ldr q6, [x14], #0x10\n"
- "sub x16, x16, #0x20\n"
- "ldr q1, [x13], #0x10\n"
- "ldr q30, [x12], #0x10\n"
- "cmp x16, #0x20\n"
- "ldr q21, [x11], #0x10\n"
- "ldr q9, [x10], #0x10\n"
- "ldr q15, [x9], #0x10\n"
- "ldr q17, [x28], #0x10\n"
- "ldr q19, [x27], #0x10\n"
- "ldr q22, [x26], #0x10\n"
- "zip1 v4.16b, v28.16b, v1.16b\n"
- "zip1 v3.16b, v6.16b, v30.16b\n"
- "ldr q26, [x25], #0x10\n"
- "ldr q11, [x24], #0x10\n"
- "zip2 v29.16b, v28.16b, v1.16b\n"
- "zip2 v16.16b, v6.16b, v30.16b\n"
- "ldr q25, [x23], #0x10\n"
- "ldr q7, [x22], #0x10\n"
- "zip1 v0.16b, v21.16b, v15.16b\n"
- "zip1 v1.16b, v9.16b, v17.16b\n"
- "ldr q23, [x21], #0x10\n"
- "ldr q28, [x20], #0x10\n"
- "zip2 v20.16b, v21.16b, v15.16b\n"
- "zip2 v24.16b, v9.16b, v17.16b\n"
+ "ldr q6, [x17], #0x10\n"
+ "ldr q31, [x16], #0x10\n"
+ "sub x24, x24, #0x20\n"
+ "cmp x24, #0x20\n"
+ "ldr q7, [x15], #0x10\n"
+ "ldr q0, [x14], #0x10\n"
+ "zip1 v9.16b, v6.16b, v7.16b\n"
+ "zip1 v20.16b, v31.16b, v0.16b\n"
+ "ldr q24, [x13], #0x10\n"
+ "ldr q19, [x12], #0x10\n"
+ "zip2 v30.16b, v6.16b, v7.16b\n"
+ "zip2 v12.16b, v31.16b, v0.16b\n"
+ "ldr q23, [x11], #0x10\n"
+ "ldr q17, [x10], #0x10\n"
+ "zip1 v13.16b, v24.16b, v23.16b\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "ldr q0, [x9], #0x10\n"
+ "ldr q31, [x28], #0x10\n"
+ "zip2 v15.16b, v24.16b, v23.16b\n"
+ "zip2 v11.16b, v19.16b, v17.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q4, [x26], #0x10\n"
+ "zip1 v1.16b, v0.16b, v17.16b\n"
+ "zip1 v21.16b, v31.16b, v4.16b\n"
+ "ldr q28, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v0.16b, v0.16b, v17.16b\n"
+ "zip2 v26.16b, v31.16b, v4.16b\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q19, [x20], #0x10\n"
+ "zip1 v23.16b, v28.16b, v17.16b\n"
+ "zip1 v25.16b, v18.16b, v19.16b\n"
"ldr q2, [x17], #0x10\n"
+ "ldr q3, [x16], #0x10\n"
+ "zip2 v7.16b, v28.16b, v17.16b\n"
+ "zip2 v8.16b, v18.16b, v19.16b\n"
+ "ldr q22, [x15], #0x10\n"
"ldr q27, [x14], #0x10\n"
- "zip1 v15.16b, v19.16b, v26.16b\n"
- "zip1 v21.16b, v22.16b, v11.16b\n"
- "ldr q18, [x13], #0x10\n"
- "ldr q8, [x12], #0x10\n"
- "zip2 v14.16b, v19.16b, v26.16b\n"
- "zip2 v13.16b, v22.16b, v11.16b\n"
- "ldr q19, [x11], #0x10\n"
- "ldr q10, [x10], #0x10\n"
- "zip1 v12.16b, v25.16b, v23.16b\n"
- "zip1 v5.16b, v7.16b, v28.16b\n"
- "ldr q6, [x9], #0x10\n"
- "ldr q30, [x28], #0x10\n"
- "zip2 v31.16b, v25.16b, v23.16b\n"
- "zip2 v11.16b, v7.16b, v28.16b\n"
- "ldr q25, [x27], #0x10\n"
- "ldr q7, [x26], #0x10\n"
- "zip1 v23.16b, v2.16b, v18.16b\n"
- "zip1 v17.16b, v27.16b, v8.16b\n"
- "ldr q26, [x25], #0x10\n"
- "ldr q9, [x24], #0x10\n"
- "zip2 v18.16b, v2.16b, v18.16b\n"
- "zip2 v22.16b, v27.16b, v8.16b\n"
- "ldr q28, [x23], #0x10\n"
- "ldr q8, [x22], #0x10\n"
- "zip1 v2.16b, v19.16b, v6.16b\n"
- "zip1 v27.16b, v10.16b, v30.16b\n"
- "zip2 v6.16b, v19.16b, v6.16b\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v30.16b, v10.16b, v30.16b\n"
- "zip1 v10.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
- "zip1 v25.16b, v7.16b, v9.16b\n"
- "zip2 v9.16b, v7.16b, v9.16b\n"
- "zip1 v7.16b, v28.16b, v19.16b\n"
- "zip2 v19.16b, v28.16b, v19.16b\n"
- "zip1 v28.16b, v4.16b, v3.16b\n"
- "zip2 v3.16b, v4.16b, v3.16b\n"
- "ldr q4, [x20], #0x10\n"
- "str q28, [x15, #0x0]\n"
- "zip1 v28.16b, v29.16b, v16.16b\n"
- "zip2 v16.16b, v29.16b, v16.16b\n"
- "zip1 v29.16b, v23.16b, v17.16b\n"
- "zip2 v23.16b, v23.16b, v17.16b\n"
- "zip1 v17.16b, v8.16b, v4.16b\n"
- "zip2 v4.16b, v8.16b, v4.16b\n"
- "str q3, [x15, #0x10]\n"
- "zip1 v8.16b, v18.16b, v22.16b\n"
- "zip2 v22.16b, v18.16b, v22.16b\n"
- "str q28, [x15, #0x20]\n"
- "zip1 v28.16b, v0.16b, v1.16b\n"
- "zip2 v3.16b, v0.16b, v1.16b\n"
- "str q16, [x15, #0x30]\n"
- "zip1 v16.16b, v20.16b, v24.16b\n"
- "zip2 v18.16b, v20.16b, v24.16b\n"
- "str q29, [x15, #0x40]\n"
- "zip1 v1.16b, v2.16b, v27.16b\n"
- "zip2 v24.16b, v2.16b, v27.16b\n"
- "str q23, [x15, #0x50]\n"
- "zip1 v2.16b, v6.16b, v30.16b\n"
- "zip2 v6.16b, v6.16b, v30.16b\n"
- "str q8, [x15, #0x60]\n"
- "zip1 v0.16b, v15.16b, v21.16b\n"
- "zip2 v30.16b, v15.16b, v21.16b\n"
- "str q22, [x15, #0x70]\n"
- "zip1 v29.16b, v14.16b, v13.16b\n"
- "zip2 v23.16b, v14.16b, v13.16b\n"
- "str q28, [x15, #0x80]\n"
- "zip1 v27.16b, v10.16b, v25.16b\n"
- "zip2 v8.16b, v10.16b, v25.16b\n"
- "str q3, [x15, #0x90]\n"
- "zip1 v15.16b, v26.16b, v9.16b\n"
- "zip2 v9.16b, v26.16b, v9.16b\n"
- "str q16, [x15, #0xa0]\n"
- "zip1 v28.16b, v12.16b, v5.16b\n"
- "zip2 v22.16b, v12.16b, v5.16b\n"
- "str q18, [x15, #0xb0]\n"
- "zip1 v21.16b, v31.16b, v11.16b\n"
- "zip2 v20.16b, v31.16b, v11.16b\n"
- "str q1, [x15, #0xc0]\n"
- "zip1 v25.16b, v7.16b, v17.16b\n"
- "zip2 v18.16b, v7.16b, v17.16b\n"
- "str q24, [x15, #0xd0]\n"
- "zip1 v17.16b, v19.16b, v4.16b\n"
- "zip2 v16.16b, v19.16b, v4.16b\n"
- "str q2, [x15, #0xe0]\n"
- "str q6, [x15, #0xf0]\n"
- "str q0, [x15, #0x100]\n"
- "str q30, [x15, #0x110]\n"
- "str q29, [x15, #0x120]\n"
- "str q23, [x15, #0x130]\n"
- "str q27, [x15, #0x140]\n"
- "str q8, [x15, #0x150]\n"
- "str q15, [x15, #0x160]\n"
- "str q9, [x15, #0x170]\n"
- "str q28, [x15, #0x180]\n"
- "str q22, [x15, #0x190]\n"
- "str q21, [x15, #0x1a0]\n"
- "str q20, [x15, #0x1b0]\n"
- "str q25, [x15, #0x1c0]\n"
- "str q18, [x15, #0x1d0]\n"
- "str q17, [x15, #0x1e0]\n"
- "str q16, [x15, #0x1f0]\n"
- "add x15, x15, %x[out_stride]\n"
+ "zip1 v19.16b, v2.16b, v22.16b\n"
+ "zip1 v17.16b, v3.16b, v27.16b\n"
+ "ldr q6, [x13], #0x10\n"
+ "ldr q4, [x12], #0x10\n"
+ "zip2 v24.16b, v2.16b, v22.16b\n"
+ "zip2 v22.16b, v3.16b, v27.16b\n"
+ "ldr q14, [x11], #0x10\n"
+ "ldr q18, [x10], #0x10\n"
+ "zip1 v29.16b, v6.16b, v14.16b\n"
+ "zip1 v31.16b, v4.16b, v18.16b\n"
+ "ldr q2, [x9], #0x10\n"
+ "ldr q10, [x28], #0x10\n"
+ "zip2 v28.16b, v6.16b, v14.16b\n"
+ "zip2 v27.16b, v4.16b, v18.16b\n"
+ "ldr q6, [x27], #0x10\n"
+ "ldr q5, [x26], #0x10\n"
+ "zip1 v14.16b, v2.16b, v6.16b\n"
+ "zip1 v4.16b, v10.16b, v5.16b\n"
+ "ldr q3, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v6.16b, v2.16b, v6.16b\n"
+ "zip2 v10.16b, v10.16b, v5.16b\n"
+ "ldr q5, [x22], #0x10\n"
+ "zip1 v2.16b, v3.16b, v5.16b\n"
+ "zip2 v3.16b, v3.16b, v5.16b\n"
+ "zip1 v5.16b, v9.16b, v20.16b\n"
+ "zip2 v20.16b, v9.16b, v20.16b\n"
+ "ldr q9, [x20], #0x10\n"
+ "str q5, [x21, #0x0]\n"
+ "zip1 v5.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "str q20, [x21, #0x10]\n"
+ "zip1 v18.16b, v30.16b, v12.16b\n"
+ "zip2 v30.16b, v30.16b, v12.16b\n"
+ "str q18, [x21, #0x20]\n"
+ "zip1 v20.16b, v19.16b, v17.16b\n"
+ "zip2 v12.16b, v19.16b, v17.16b\n"
+ "str q30, [x21, #0x30]\n"
+ "zip1 v18.16b, v24.16b, v22.16b\n"
+ "zip2 v17.16b, v24.16b, v22.16b\n"
+ "str q20, [x21, #0x40]\n"
+ "zip1 v30.16b, v13.16b, v16.16b\n"
+ "zip2 v24.16b, v13.16b, v16.16b\n"
+ "str q12, [x21, #0x50]\n"
+ "zip1 v22.16b, v15.16b, v11.16b\n"
+ "zip2 v20.16b, v15.16b, v11.16b\n"
+ "str q18, [x21, #0x60]\n"
+ "zip1 v19.16b, v29.16b, v31.16b\n"
+ "zip2 v18.16b, v29.16b, v31.16b\n"
+ "str q17, [x21, #0x70]\n"
+ "zip1 v17.16b, v28.16b, v27.16b\n"
+ "zip2 v16.16b, v28.16b, v27.16b\n"
+ "str q30, [x21, #0x80]\n"
+ "zip1 v31.16b, v1.16b, v21.16b\n"
+ "zip2 v1.16b, v1.16b, v21.16b\n"
+ "str q24, [x21, #0x90]\n"
+ "zip1 v30.16b, v0.16b, v26.16b\n"
+ "zip2 v29.16b, v0.16b, v26.16b\n"
+ "str q22, [x21, #0xa0]\n"
+ "zip1 v28.16b, v14.16b, v4.16b\n"
+ "zip2 v27.16b, v14.16b, v4.16b\n"
+ "str q20, [x21, #0xb0]\n"
+ "zip1 v26.16b, v6.16b, v10.16b\n"
+ "zip2 v24.16b, v6.16b, v10.16b\n"
+ "str q19, [x21, #0xc0]\n"
+ "zip1 v14.16b, v23.16b, v25.16b\n"
+ "zip2 v22.16b, v23.16b, v25.16b\n"
+ "str q18, [x21, #0xd0]\n"
+ "zip1 v21.16b, v7.16b, v8.16b\n"
+ "zip2 v20.16b, v7.16b, v8.16b\n"
+ "str q17, [x21, #0xe0]\n"
+ "zip1 v19.16b, v2.16b, v5.16b\n"
+ "zip2 v18.16b, v2.16b, v5.16b\n"
+ "str q16, [x21, #0xf0]\n"
+ "zip1 v17.16b, v3.16b, v9.16b\n"
+ "zip2 v16.16b, v3.16b, v9.16b\n"
+ "str q31, [x21, #0x100]\n"
+ "str q1, [x21, #0x110]\n"
+ "str q30, [x21, #0x120]\n"
+ "str q29, [x21, #0x130]\n"
+ "str q28, [x21, #0x140]\n"
+ "str q27, [x21, #0x150]\n"
+ "str q26, [x21, #0x160]\n"
+ "str q24, [x21, #0x170]\n"
+ "str q14, [x21, #0x180]\n"
+ "str q22, [x21, #0x190]\n"
+ "str q21, [x21, #0x1a0]\n"
+ "str q20, [x21, #0x1b0]\n"
+ "str q19, [x21, #0x1c0]\n"
+ "str q18, [x21, #0x1d0]\n"
+ "str q17, [x21, #0x1e0]\n"
+ "str q16, [x21, #0x1f0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Column loop skip
- "cbz x16, 10f\n"
- "cmp x16, #0x10\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x15, #0x0]\n"
- "str q16, [x15, #0x10]\n"
- "str q16, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
- "str q16, [x15, #0x40]\n"
- "str q16, [x15, #0x50]\n"
- "str q16, [x15, #0x60]\n"
- "str q16, [x15, #0x70]\n"
- "str q16, [x15, #0x80]\n"
- "str q16, [x15, #0x90]\n"
- "str q16, [x15, #0xa0]\n"
- "str q16, [x15, #0xb0]\n"
- "str q16, [x15, #0xc0]\n"
- "str q16, [x15, #0xd0]\n"
- "str q16, [x15, #0xe0]\n"
- "str q16, [x15, #0xf0]\n"
- "str q16, [x15, #0x100]\n"
- "str q16, [x15, #0x110]\n"
- "str q16, [x15, #0x120]\n"
- "str q16, [x15, #0x130]\n"
- "str q16, [x15, #0x140]\n"
- "str q16, [x15, #0x150]\n"
- "str q16, [x15, #0x160]\n"
- "str q16, [x15, #0x170]\n"
- "str q16, [x15, #0x180]\n"
- "str q16, [x15, #0x190]\n"
- "str q16, [x15, #0x1a0]\n"
- "str q16, [x15, #0x1b0]\n"
- "str q16, [x15, #0x1c0]\n"
- "str q16, [x15, #0x1d0]\n"
- "str q16, [x15, #0x1e0]\n"
- "str q16, [x15, #0x1f0]\n"
+ "cmp x24, #0x10\n"
"blt 5f\n"
"4:" // Main row loop: width 16 loop: loop
- "ldr q19, [x17], #0x10\n"
- "ldr q18, [x14], #0x10\n"
- "sub x16, x16, #0x10\n"
- "ldr q17, [x13], #0x10\n"
- "ldr q16, [x12], #0x10\n"
- "cmp x16, #0x10\n"
- "ldr q24, [x11], #0x10\n"
- "ldr q28, [x10], #0x10\n"
- "ldr q23, [x9], #0x10\n"
- "ldr q22, [x28], #0x10\n"
- "ldr q27, [x27], #0x10\n"
- "ldr q26, [x26], #0x10\n"
- "zip1 v5.16b, v19.16b, v17.16b\n"
- "zip1 v4.16b, v18.16b, v16.16b\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "zip2 v3.16b, v19.16b, v17.16b\n"
- "zip2 v2.16b, v18.16b, v16.16b\n"
- "ldr q19, [x23], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v1.16b, v24.16b, v23.16b\n"
- "zip1 v25.16b, v28.16b, v22.16b\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q21, [x17], #0x10\n"
+ "ldr q20, [x16], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q17, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v3.16b, v21.16b, v17.16b\n"
+ "zip1 v2.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x13], #0x10\n"
+ "ldr q18, [x12], #0x10\n"
+ "zip2 v1.16b, v21.16b, v17.16b\n"
+ "zip2 v0.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x11], #0x10\n"
+ "ldr q16, [x10], #0x10\n"
+ "zip1 v31.16b, v19.16b, v17.16b\n"
+ "zip1 v30.16b, v18.16b, v16.16b\n"
+ "ldr q24, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v29.16b, v19.16b, v17.16b\n"
+ "zip2 v23.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.16b, v24.16b, v17.16b\n"
+ "zip1 v21.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v28.16b, v24.16b, v17.16b\n"
+ "zip2 v20.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "zip2 v24.16b, v24.16b, v23.16b\n"
- "zip2 v23.16b, v28.16b, v22.16b\n"
- "zip1 v0.16b, v27.16b, v21.16b\n"
- "zip1 v22.16b, v26.16b, v20.16b\n"
- "zip2 v31.16b, v27.16b, v21.16b\n"
- "zip2 v30.16b, v26.16b, v20.16b\n"
- "zip1 v29.16b, v19.16b, v17.16b\n"
- "zip1 v28.16b, v18.16b, v16.16b\n"
- "zip2 v27.16b, v19.16b, v17.16b\n"
- "zip2 v26.16b, v18.16b, v16.16b\n"
- "zip1 v21.16b, v5.16b, v4.16b\n"
- "zip2 v20.16b, v5.16b, v4.16b\n"
- "zip1 v19.16b, v3.16b, v2.16b\n"
+ "zip1 v27.16b, v19.16b, v17.16b\n"
+ "zip1 v26.16b, v18.16b, v16.16b\n"
+ "zip2 v25.16b, v19.16b, v17.16b\n"
+ "zip2 v24.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v3.16b, v2.16b\n"
"zip2 v18.16b, v3.16b, v2.16b\n"
- "zip1 v17.16b, v1.16b, v25.16b\n"
- "zip2 v16.16b, v1.16b, v25.16b\n"
- "str q21, [x15, #0x0]\n"
- "str q20, [x15, #0x10]\n"
- "zip1 v25.16b, v24.16b, v23.16b\n"
- "zip2 v24.16b, v24.16b, v23.16b\n"
- "str q19, [x15, #0x20]\n"
- "zip1 v23.16b, v0.16b, v22.16b\n"
- "zip2 v22.16b, v0.16b, v22.16b\n"
- "str q18, [x15, #0x30]\n"
- "zip1 v21.16b, v31.16b, v30.16b\n"
- "zip2 v20.16b, v31.16b, v30.16b\n"
- "str q17, [x15, #0x80]\n"
- "zip1 v19.16b, v29.16b, v28.16b\n"
- "zip2 v18.16b, v29.16b, v28.16b\n"
- "str q16, [x15, #0x90]\n"
- "zip1 v17.16b, v27.16b, v26.16b\n"
- "zip2 v16.16b, v27.16b, v26.16b\n"
- "str q25, [x15, #0xa0]\n"
- "str q24, [x15, #0xb0]\n"
- "str q23, [x15, #0x100]\n"
- "str q22, [x15, #0x110]\n"
- "str q21, [x15, #0x120]\n"
- "str q20, [x15, #0x130]\n"
- "str q19, [x15, #0x180]\n"
- "str q18, [x15, #0x190]\n"
- "str q17, [x15, #0x1a0]\n"
- "str q16, [x15, #0x1b0]\n"
- "add x15, x15, #0x40\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v1.16b, v0.16b\n"
+ "zip2 v16.16b, v1.16b, v0.16b\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "zip1 v19.16b, v31.16b, v30.16b\n"
+ "zip2 v18.16b, v31.16b, v30.16b\n"
+ "str q16, [x21, #0x30]\n"
+ "zip1 v17.16b, v29.16b, v23.16b\n"
+ "zip2 v16.16b, v29.16b, v23.16b\n"
+ "str q19, [x21, #0x80]\n"
+ "zip1 v23.16b, v22.16b, v21.16b\n"
+ "zip2 v22.16b, v22.16b, v21.16b\n"
+ "str q18, [x21, #0x90]\n"
+ "zip1 v21.16b, v28.16b, v20.16b\n"
+ "zip2 v20.16b, v28.16b, v20.16b\n"
+ "str q17, [x21, #0xa0]\n"
+ "zip1 v19.16b, v27.16b, v26.16b\n"
+ "zip2 v18.16b, v27.16b, v26.16b\n"
+ "str q16, [x21, #0xb0]\n"
+ "zip1 v17.16b, v25.16b, v24.16b\n"
+ "zip2 v16.16b, v25.16b, v24.16b\n"
+ "str q23, [x21, #0x100]\n"
+ "str q22, [x21, #0x110]\n"
+ "str q21, [x21, #0x120]\n"
+ "str q20, [x21, #0x130]\n"
+ "str q19, [x21, #0x180]\n"
+ "str q18, [x21, #0x190]\n"
+ "str q17, [x21, #0x1a0]\n"
+ "str q16, [x21, #0x1b0]\n"
+ "add x21, x21, #0x40\n"
"bge 4b\n"
"5:" // Main row loop: width 16 loop: skip
- "cmp x16, #0x4\n"
+ "cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
- "ldr s23, [x17], #0x4\n"
- "ldr s21, [x14], #0x4\n"
- "sub x16, x16, #0x4\n"
- "ldr s20, [x13], #0x4\n"
- "ldr s19, [x12], #0x4\n"
- "cmp x16, #0x4\n"
- "ldr s22, [x11], #0x4\n"
- "ldr s18, [x10], #0x4\n"
- "ldr s17, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s27, [x27], #0x4\n"
- "ldr s26, [x26], #0x4\n"
- "zip1 v25.16b, v23.16b, v20.16b\n"
- "zip1 v21.16b, v21.16b, v19.16b\n"
+ "ldr s19, [x17], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x13], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x11], #0x4\n"
+ "ldr s16, [x10], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
"ldr s20, [x25], #0x4\n"
- "ldr s19, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "zip1 v22.16b, v22.16b, v17.16b\n"
- "zip1 v17.16b, v18.16b, v16.16b\n"
- "ldr s18, [x21], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
"ldr s16, [x20], #0x4\n"
- "zip1 v21.16b, v25.16b, v21.16b\n"
- "zip1 v20.16b, v27.16b, v20.16b\n"
- "zip1 v19.16b, v26.16b, v19.16b\n"
- "zip1 v17.16b, v22.16b, v17.16b\n"
- "zip1 v18.16b, v24.16b, v18.16b\n"
- "zip1 v16.16b, v23.16b, v16.16b\n"
- "str q21, [x15, #0x0]\n"
- "str q17, [x15, #0x80]\n"
- "zip1 v17.16b, v20.16b, v19.16b\n"
- "zip1 v16.16b, v18.16b, v16.16b\n"
- "str q17, [x15, #0x100]\n"
- "str q16, [x15, #0x180]\n"
- "add x15, x15, #0x10\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str q22, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q21, [x21, #0x80]\n"
+ "str q18, [x21, #0x100]\n"
+ "str q16, [x21, #0x180]\n"
+ "add x21, x21, #0x10\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
- "cmp x16, #0x1\n"
+ "cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
- "ldr b23, [x17], #0x1\n"
- "ldr b21, [x14], #0x1\n"
- "sub x16, x16, #0x1\n"
- "ldr b20, [x13], #0x1\n"
- "ldr b19, [x12], #0x1\n"
- "cmp x16, #0x1\n"
- "ldr b22, [x11], #0x1\n"
- "ldr b18, [x10], #0x1\n"
- "ldr b17, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b27, [x27], #0x1\n"
- "ldr b26, [x26], #0x1\n"
- "zip1 v25.16b, v23.16b, v20.16b\n"
- "zip1 v21.16b, v21.16b, v19.16b\n"
+ "ldr b19, [x17], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x13], #0x1\n"
+ "ldr b18, [x12], #0x1\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x11], #0x1\n"
+ "ldr b16, [x10], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
"ldr b20, [x25], #0x1\n"
- "ldr b19, [x24], #0x1\n"
- "ldr b24, [x23], #0x1\n"
- "ldr b23, [x22], #0x1\n"
- "zip1 v22.16b, v22.16b, v17.16b\n"
- "zip1 v17.16b, v18.16b, v16.16b\n"
- "ldr b18, [x21], #0x1\n"
+ "ldr b19, [x23], #0x1\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x22], #0x1\n"
"ldr b16, [x20], #0x1\n"
- "zip1 v21.16b, v25.16b, v21.16b\n"
- "zip1 v20.16b, v27.16b, v20.16b\n"
- "zip1 v19.16b, v26.16b, v19.16b\n"
- "zip1 v17.16b, v22.16b, v17.16b\n"
- "zip1 v18.16b, v24.16b, v18.16b\n"
- "zip1 v16.16b, v23.16b, v16.16b\n"
- "str s21, [x15, #0x0]\n"
- "str s17, [x15, #0x80]\n"
- "zip1 v17.16b, v20.16b, v19.16b\n"
- "zip1 v16.16b, v18.16b, v16.16b\n"
- "str s17, [x15, #0x100]\n"
- "str s16, [x15, #0x180]\n"
- "add x15, x15, #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s22, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s21, [x21, #0x80]\n"
+ "str s18, [x21, #0x100]\n"
+ "str s16, [x21, #0x180]\n"
+ "add x21, x21, #0x4\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x10\n"
"add %x[out], %x[out], #0x200\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x17, %x[in]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
"mov x20, %x[width]\n"
+ "add x14, x15, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x15, %x[out]\n"
- "add x14, x17, %x[in_stride]\n"
- "add x13, x14, %x[in_stride]\n"
- "add x12, x13, %x[in_stride]\n"
- "csel x13, x13, %x[pad_row], GE\n"
- "add %x[in], x12, %x[in_stride]\n"
- "csel x12, x12, %x[pad_row], GT\n"
- "cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add %x[in], x14, %x[in_stride]\n"
"csel x14, x14, %x[pad_row], GT\n"
+ "csel x15, x15, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x16, x16, %x[pad_row], GT\n"
"cmp x20, #0x20\n"
- "blt 14f\n"
- "13:" // Tail row loop: Column loop
- "ldr q20, [x17], #0x10\n"
- "ldr q19, [x14], #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q19, [x17], #0x10\n"
+ "ldr q18, [x16], #0x10\n"
"sub x20, x20, #0x20\n"
- "ldr q18, [x13], #0x10\n"
- "ldr q17, [x12], #0x10\n"
"cmp x20, #0x20\n"
- "ldr q24, [x17], #0x10\n"
- "ldr q25, [x14], #0x10\n"
- "ldr q23, [x13], #0x10\n"
- "ldr q16, [x12], #0x10\n"
- "zip1 v22.16b, v20.16b, v18.16b\n"
- "zip1 v21.16b, v19.16b, v17.16b\n"
- "zip2 v20.16b, v20.16b, v18.16b\n"
- "zip2 v19.16b, v19.16b, v17.16b\n"
- "zip1 v18.16b, v24.16b, v23.16b\n"
- "zip1 v17.16b, v25.16b, v16.16b\n"
- "zip2 v24.16b, v24.16b, v23.16b\n"
- "zip2 v16.16b, v25.16b, v16.16b\n"
- "zip1 v23.16b, v22.16b, v21.16b\n"
- "zip2 v22.16b, v22.16b, v21.16b\n"
- "zip1 v21.16b, v20.16b, v19.16b\n"
+ "ldr q17, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v25.16b, v19.16b, v17.16b\n"
+ "zip1 v24.16b, v18.16b, v16.16b\n"
+ "ldr q22, [x17], #0x10\n"
+ "ldr q21, [x16], #0x10\n"
+ "zip2 v20.16b, v19.16b, v17.16b\n"
+ "zip2 v19.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v23.16b, v22.16b, v17.16b\n"
+ "zip1 v18.16b, v21.16b, v16.16b\n"
+ "zip2 v22.16b, v22.16b, v17.16b\n"
+ "zip2 v21.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v25.16b, v24.16b\n"
+ "zip2 v17.16b, v25.16b, v24.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.16b, v20.16b, v19.16b\n"
"zip2 v20.16b, v20.16b, v19.16b\n"
- "zip1 v19.16b, v18.16b, v17.16b\n"
- "zip2 v18.16b, v18.16b, v17.16b\n"
- "zip1 v17.16b, v24.16b, v16.16b\n"
- "zip2 v16.16b, v24.16b, v16.16b\n"
- "str q23, [x15, #0x0]\n"
- "str q22, [x15, #0x10]\n"
- "str q21, [x15, #0x20]\n"
- "str q20, [x15, #0x30]\n"
- "str q19, [x15, #0x40]\n"
- "str q18, [x15, #0x50]\n"
- "str q17, [x15, #0x60]\n"
- "str q16, [x15, #0x70]\n"
- "add x15, x15, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q17, [x21, #0x10]\n"
+ "zip1 v19.16b, v23.16b, v18.16b\n"
+ "zip2 v18.16b, v23.16b, v18.16b\n"
+ "str q16, [x21, #0x20]\n"
+ "zip1 v17.16b, v22.16b, v21.16b\n"
+ "zip2 v16.16b, v22.16b, v21.16b\n"
+ "str q20, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
"cmp x20, #0x10\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x15, #0x0]\n"
- "str q16, [x15, #0x10]\n"
- "str q16, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
- "str q16, [x15, #0x40]\n"
- "str q16, [x15, #0x50]\n"
- "str q16, [x15, #0x60]\n"
- "str q16, [x15, #0x70]\n"
- "blt 16f\n"
- "15:" // Tail row loop: width 16 loop: loop
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
"ldr q20, [x17], #0x10\n"
- "ldr q21, [x14], #0x10\n"
+ "ldr q21, [x16], #0x10\n"
"sub x20, x20, #0x10\n"
- "ldr q19, [x13], #0x10\n"
- "ldr q16, [x12], #0x10\n"
"cmp x20, #0x10\n"
+ "ldr q19, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
"zip1 v18.16b, v20.16b, v19.16b\n"
"zip1 v17.16b, v21.16b, v16.16b\n"
"zip2 v20.16b, v20.16b, v19.16b\n"
- "zip2 v16.16b, v21.16b, v16.16b\n"
- "zip1 v19.16b, v18.16b, v17.16b\n"
+ "zip2 v19.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
"zip2 v18.16b, v18.16b, v17.16b\n"
- "zip1 v17.16b, v20.16b, v16.16b\n"
- "zip2 v16.16b, v20.16b, v16.16b\n"
- "str q19, [x15, #0x0]\n"
- "str q18, [x15, #0x10]\n"
- "str q17, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "bge 15b\n"
- "16:" // Tail row loop: width 16 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip2 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
"cmp x20, #0x4\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr s19, [x17], #0x4\n"
- "ldr s18, [x14], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
"sub x20, x20, #0x4\n"
- "ldr s17, [x13], #0x4\n"
- "ldr s16, [x12], #0x4\n"
"cmp x20, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str q16, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr b19, [x17], #0x1\n"
- "ldr b18, [x14], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
"sub x20, x20, #0x1\n"
- "ldr b17, [x13], #0x1\n"
- "ldr b16, [x12], #0x1\n"
"cmp x20, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str s16, [x15, #0x0]\n"
- "add x15, x15, #0x4\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x80\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
index f2c6b692a7..747d4538bd 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
@@ -40,422 +40,392 @@ void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t w
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "blt 13f\n"
+ "blt 12f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
"mov x24, %x[width]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x25, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
"cmp x24, #0x40\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q8, [x25], #0x10\n"
- "ldr q6, [x22], #0x10\n"
+ "ldr q14, [x25], #0x10\n"
+ "ldr q10, [x23], #0x10\n"
"sub x24, x24, #0x40\n"
- "ldr q2, [x21], #0x10\n"
+ "zip1 v12.8h, v14.8h, v10.8h\n"
+ "ldr q5, [x22], #0x10\n"
+ "ldr q3, [x20], #0x10\n"
+ "zip2 v31.8h, v14.8h, v10.8h\n"
+ "zip1 v19.8h, v5.8h, v3.8h\n"
+ "ldr q27, [x25], #0x10\n"
+ "ldr q25, [x23], #0x10\n"
+ "zip1 v11.8h, v27.8h, v25.8h\n"
+ "zip2 v24.8h, v27.8h, v25.8h\n"
+ "ldr q6, [x22], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip2 v15.8h, v5.8h, v3.8h\n"
+ "zip1 v18.8h, v6.8h, v29.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q9, [x23], #0x10\n"
+ "zip1 v0.8h, v17.8h, v9.8h\n"
+ "zip2 v9.8h, v17.8h, v9.8h\n"
+ "ldr q21, [x22], #0x10\n"
"ldr q20, [x20], #0x10\n"
+ "zip2 v8.8h, v6.8h, v29.8h\n"
+ "zip1 v30.8h, v21.8h, v20.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q5, [x23], #0x10\n"
+ "zip1 v13.8h, v17.8h, v5.8h\n"
+ "zip2 v25.8h, v17.8h, v5.8h\n"
+ "ldr q7, [x22], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip2 v27.8h, v21.8h, v20.8h\n"
+ "zip1 v14.8h, v7.8h, v29.8h\n"
+ "ldr q28, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip2 v1.8h, v7.8h, v29.8h\n"
"cmp x24, #0x40\n"
+ "ldr q10, [x22], #0x10\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v16.8h, v28.8h, v17.8h\n"
+ "zip2 v17.8h, v28.8h, v17.8h\n"
+ "ldr q5, [x25], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip1 v3.8h, v5.8h, v20.8h\n"
+ "zip2 v7.8h, v5.8h, v20.8h\n"
+ "ldr q22, [x22], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip1 v2.8h, v10.8h, v21.8h\n"
+ "zip2 v5.8h, v10.8h, v21.8h\n"
"ldr q21, [x25], #0x10\n"
- "ldr q28, [x22], #0x10\n"
- "ldr q23, [x21], #0x10\n"
- "zip1 v11.8h, v8.8h, v6.8h\n"
- "zip2 v8.8h, v8.8h, v6.8h\n"
- "ldr q1, [x20], #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "zip1 v12.8h, v2.8h, v20.8h\n"
- "zip2 v14.8h, v2.8h, v20.8h\n"
- "ldr q24, [x22], #0x10\n"
- "ldr q20, [x21], #0x10\n"
- "zip1 v13.8h, v21.8h, v28.8h\n"
- "zip2 v15.8h, v21.8h, v28.8h\n"
- "ldr q17, [x20], #0x10\n"
- "ldr q19, [x25], #0x10\n"
- "zip1 v6.8h, v23.8h, v1.8h\n"
- "zip2 v28.8h, v23.8h, v1.8h\n"
- "ldr q16, [x22], #0x10\n"
- "ldr q30, [x21], #0x10\n"
- "zip1 v7.8h, v18.8h, v24.8h\n"
- "zip2 v9.8h, v18.8h, v24.8h\n"
- "ldr q18, [x20], #0x10\n"
- "ldr q21, [x25], #0x10\n"
- "zip1 v5.8h, v20.8h, v17.8h\n"
- "zip2 v3.8h, v20.8h, v17.8h\n"
- "ldr q23, [x22], #0x10\n"
- "ldr q26, [x21], #0x10\n"
- "zip1 v25.8h, v19.8h, v16.8h\n"
- "zip2 v10.8h, v19.8h, v16.8h\n"
- "ldr q16, [x20], #0x10\n"
- "ldr q19, [x25], #0x10\n"
- "zip1 v2.8h, v30.8h, v18.8h\n"
- "zip2 v4.8h, v30.8h, v18.8h\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip1 v4.8h, v21.8h, v20.8h\n"
+ "zip2 v28.8h, v21.8h, v20.8h\n"
+ "ldr q6, [x22], #0x10\n"
+ "ldr q10, [x20], #0x10\n"
+ "zip1 v26.8h, v22.8h, v29.8h\n"
+ "zip2 v20.8h, v22.8h, v29.8h\n"
+ "ldr q29, [x25], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip1 v21.8h, v29.8h, v23.8h\n"
+ "zip2 v23.8h, v29.8h, v23.8h\n"
"ldr q22, [x22], #0x10\n"
- "ldr q17, [x21], #0x10\n"
- "zip1 v27.8h, v21.8h, v23.8h\n"
- "zip2 v20.8h, v21.8h, v23.8h\n"
- "ldr q1, [x20], #0x10\n"
- "ldr q23, [x25], #0x10\n"
- "zip1 v0.8h, v26.8h, v16.8h\n"
- "zip2 v31.8h, v26.8h, v16.8h\n"
- "ldr q16, [x22], #0x10\n"
- "ldr q21, [x21], #0x10\n"
- "zip1 v30.8h, v19.8h, v22.8h\n"
- "zip2 v29.8h, v19.8h, v22.8h\n"
- "ldr q19, [x20], #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "zip1 v24.8h, v17.8h, v1.8h\n"
- "zip2 v26.8h, v17.8h, v1.8h\n"
- "ldr q17, [x22], #0x10\n"
- "ldr q1, [x21], #0x10\n"
- "zip1 v22.8h, v23.8h, v16.8h\n"
- "zip2 v23.8h, v23.8h, v16.8h\n"
- "ldr q16, [x20], #0x10\n"
- "str q11, [x23, #0x0]\n"
- "zip1 v11.8h, v21.8h, v19.8h\n"
- "zip2 v21.8h, v21.8h, v19.8h\n"
- "str q8, [x23, #0x10]\n"
- "zip1 v19.8h, v18.8h, v17.8h\n"
- "zip2 v18.8h, v18.8h, v17.8h\n"
- "str q13, [x23, #0x20]\n"
- "zip1 v17.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v1.8h, v16.8h\n"
- "str q15, [x23, #0x30]\n"
- "str q7, [x23, #0x40]\n"
- "str q9, [x23, #0x50]\n"
- "str q25, [x23, #0x60]\n"
- "str q10, [x23, #0x70]\n"
- "str q12, [x23, #0x80]\n"
- "str q14, [x23, #0x90]\n"
- "str q6, [x23, #0xa0]\n"
- "str q28, [x23, #0xb0]\n"
- "str q5, [x23, #0xc0]\n"
- "str q3, [x23, #0xd0]\n"
- "str q2, [x23, #0xe0]\n"
- "str q4, [x23, #0xf0]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q27, [x23, #0x0]\n"
- "str q20, [x23, #0x10]\n"
- "str q30, [x23, #0x20]\n"
- "str q29, [x23, #0x30]\n"
- "str q22, [x23, #0x40]\n"
- "str q23, [x23, #0x50]\n"
- "str q19, [x23, #0x60]\n"
- "str q18, [x23, #0x70]\n"
- "str q0, [x23, #0x80]\n"
- "str q31, [x23, #0x90]\n"
- "str q24, [x23, #0xa0]\n"
- "str q26, [x23, #0xb0]\n"
- "str q11, [x23, #0xc0]\n"
- "str q21, [x23, #0xd0]\n"
- "str q17, [x23, #0xe0]\n"
- "str q16, [x23, #0xf0]\n"
- "add x23, x23, %x[out_stride]\n"
+ "ldr q29, [x20], #0x10\n"
+ "str q12, [x21, #0x0]\n"
+ "zip1 v12.8h, v6.8h, v10.8h\n"
+ "str q31, [x21, #0x10]\n"
+ "zip2 v6.8h, v6.8h, v10.8h\n"
+ "zip1 v31.8h, v22.8h, v29.8h\n"
+ "str q11, [x21, #0x20]\n"
+ "zip2 v11.8h, v22.8h, v29.8h\n"
+ "str q24, [x21, #0x30]\n"
+ "str q0, [x21, #0x40]\n"
+ "str q9, [x21, #0x50]\n"
+ "str q13, [x21, #0x60]\n"
+ "str q25, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q15, [x21, #0x90]\n"
+ "str q18, [x21, #0xa0]\n"
+ "str q8, [x21, #0xb0]\n"
+ "str q30, [x21, #0xc0]\n"
+ "str q27, [x21, #0xd0]\n"
+ "str q14, [x21, #0xe0]\n"
+ "str q1, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q16, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q3, [x21, #0x20]\n"
+ "str q7, [x21, #0x30]\n"
+ "str q4, [x21, #0x40]\n"
+ "str q28, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q23, [x21, #0x70]\n"
+ "str q2, [x21, #0x80]\n"
+ "str q5, [x21, #0x90]\n"
+ "str q26, [x21, #0xa0]\n"
+ "str q20, [x21, #0xb0]\n"
+ "str q12, [x21, #0xc0]\n"
+ "str q6, [x21, #0xd0]\n"
+ "str q31, [x21, #0xe0]\n"
+ "str q11, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
"cmp x24, #0x20\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr q21, [x25], #0x10\n"
- "ldr q17, [x22], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
"sub x24, x24, #0x20\n"
- "ldr q20, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
"cmp x24, #0x20\n"
- "ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q22, [x21], #0x10\n"
- "zip1 v0.8h, v21.8h, v17.8h\n"
- "zip2 v31.8h, v21.8h, v17.8h\n"
- "ldr q17, [x20], #0x10\n"
- "ldr q21, [x25], #0x10\n"
- "zip1 v30.8h, v20.8h, v16.8h\n"
- "zip2 v29.8h, v20.8h, v16.8h\n"
- "ldr q16, [x22], #0x10\n"
- "ldr q20, [x21], #0x10\n"
- "zip1 v28.8h, v19.8h, v18.8h\n"
- "zip2 v27.8h, v19.8h, v18.8h\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v1.8h, v17.8h, v16.8h\n"
+ "zip2 v0.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v31.8h, v17.8h, v16.8h\n"
+ "zip2 v30.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x22], #0x10\n"
"ldr q19, [x20], #0x10\n"
- "ldr q18, [x25], #0x10\n"
- "zip1 v26.8h, v22.8h, v17.8h\n"
- "zip2 v25.8h, v22.8h, v17.8h\n"
- "ldr q17, [x22], #0x10\n"
- "ldr q24, [x21], #0x10\n"
- "zip1 v23.8h, v21.8h, v16.8h\n"
- "zip2 v22.8h, v21.8h, v16.8h\n"
+ "zip1 v29.8h, v21.8h, v18.8h\n"
+ "zip2 v28.8h, v21.8h, v18.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v27.8h, v17.8h, v16.8h\n"
+ "zip2 v26.8h, v17.8h, v16.8h\n"
+ "ldr q25, [x22], #0x10\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v24.8h, v20.8h, v19.8h\n"
+ "zip2 v23.8h, v20.8h, v19.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "zip2 v21.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "str q0, [x23, #0x0]\n"
- "zip1 v19.8h, v18.8h, v17.8h\n"
- "zip2 v18.8h, v18.8h, v17.8h\n"
- "str q31, [x23, #0x10]\n"
- "zip1 v17.8h, v24.8h, v16.8h\n"
- "zip2 v16.8h, v24.8h, v16.8h\n"
- "str q28, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q23, [x23, #0x40]\n"
- "str q22, [x23, #0x50]\n"
- "str q19, [x23, #0x60]\n"
- "str q18, [x23, #0x70]\n"
- "str q30, [x23, #0x80]\n"
- "str q29, [x23, #0x90]\n"
- "str q26, [x23, #0xa0]\n"
- "str q25, [x23, #0xb0]\n"
- "str q21, [x23, #0xc0]\n"
- "str q20, [x23, #0xd0]\n"
- "str q17, [x23, #0xe0]\n"
- "str q16, [x23, #0xf0]\n"
- "add x23, x23, %x[out_stride]\n"
+ "zip1 v19.8h, v25.8h, v18.8h\n"
+ "zip2 v18.8h, v25.8h, v18.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q1, [x21, #0x0]\n"
+ "str q0, [x21, #0x10]\n"
+ "str q31, [x21, #0x20]\n"
+ "str q30, [x21, #0x30]\n"
+ "str q27, [x21, #0x40]\n"
+ "str q26, [x21, #0x50]\n"
+ "str q22, [x21, #0x60]\n"
+ "str q21, [x21, #0x70]\n"
+ "str q29, [x21, #0x80]\n"
+ "str q28, [x21, #0x90]\n"
+ "str q24, [x21, #0xa0]\n"
+ "str q23, [x21, #0xb0]\n"
+ "str q19, [x21, #0xc0]\n"
+ "str q18, [x21, #0xd0]\n"
+ "str q17, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x24, 12f\n"
"cmp x24, #0x10\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "str q16, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "str q16, [x23, #0x80]\n"
- "str q16, [x23, #0x90]\n"
- "str q16, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
- "str q16, [x23, #0xc0]\n"
- "str q16, [x23, #0xd0]\n"
- "str q16, [x23, #0xe0]\n"
- "str q16, [x23, #0xf0]\n"
"blt 7f\n"
"6:" // Main row loop: width 16 loop: loop
"ldr q17, [x25], #0x10\n"
- "ldr q16, [x22], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
"sub x24, x24, #0x10\n"
- "ldr q20, [x21], #0x10\n"
- "ldr q19, [x20], #0x10\n"
"cmp x24, #0x10\n"
- "ldr q24, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q23, [x21], #0x10\n"
+ "ldr q24, [x22], #0x10\n"
+ "ldr q23, [x20], #0x10\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "zip2 v18.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
"zip1 v22.8h, v17.8h, v16.8h\n"
- "zip2 v17.8h, v17.8h, v16.8h\n"
+ "zip2 v21.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "zip1 v19.8h, v24.8h, v18.8h\n"
- "zip2 v18.8h, v24.8h, v18.8h\n"
- "str q22, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "zip1 v17.8h, v23.8h, v16.8h\n"
- "zip2 v16.8h, v23.8h, v16.8h\n"
- "str q19, [x23, #0x20]\n"
- "str q18, [x23, #0x30]\n"
- "str q21, [x23, #0x80]\n"
- "str q20, [x23, #0x90]\n"
- "str q17, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
- "add x23, x23, #0x40\n"
+ "str q19, [x21, #0x0]\n"
+ "zip1 v19.8h, v24.8h, v23.8h\n"
+ "str q18, [x21, #0x10]\n"
+ "zip2 v18.8h, v24.8h, v23.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q22, [x21, #0x20]\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q21, [x21, #0x30]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, #0x40\n"
"bge 6b\n"
"7:" // Main row loop: width 16 loop: skip
"cmp x24, #0x4\n"
"blt 9f\n"
"8:" // Main row loop: width 4 loop: loop
"ldr d19, [x25], #0x8\n"
- "ldr d16, [x22], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"sub x24, x24, #0x4\n"
- "ldr d18, [x21], #0x8\n"
- "ldr d17, [x20], #0x8\n"
"cmp x24, #0x4\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d17, [x20], #0x8\n"
"zip1 v16.8h, v19.8h, v16.8h\n"
- "str q16, [x23, #0x0]\n"
+ "str q16, [x21, #0x0]\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [x23, #0x80]\n"
- "add x23, x23, #0x10\n"
+ "str q16, [x21, #0x80]\n"
+ "add x21, x21, #0x10\n"
"bge 8b\n"
"9:" // Main row loop: width 4 loop: skip
"cmp x24, #0x1\n"
"blt 11f\n"
"10:" // Main row loop: width 1 loop: loop
"ldr h19, [x25], #0x2\n"
- "ldr h16, [x22], #0x2\n"
+ "ldr h16, [x23], #0x2\n"
"sub x24, x24, #0x1\n"
- "ldr h18, [x21], #0x2\n"
- "ldr h17, [x20], #0x2\n"
"cmp x24, #0x1\n"
+ "ldr h18, [x22], #0x2\n"
+ "ldr h17, [x20], #0x2\n"
"zip1 v16.8h, v19.8h, v16.8h\n"
- "str s16, [x23, #0x0]\n"
+ "str s16, [x21, #0x0]\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
- "str s16, [x23, #0x80]\n"
- "add x23, x23, #0x4\n"
+ "str s16, [x21, #0x80]\n"
+ "add x21, x21, #0x4\n"
"bge 10b\n"
"11:" // Main row loop: width 1 loop: skip
- "12:" // Main row loop: odd col skip
"cmp %x[height], #0x4\n"
"add %x[out], %x[out], #0x100\n"
"bge 1b\n"
- "cbz %x[height], 26f\n"
- "13:" // Main loop skip
- "14:" // Tail row loop: Head
+ "cbz %x[height], 24f\n"
+ "12:" // Main loop skip
+ "13:" // Tail row loop: Head
"mov x25, %x[in]\n"
"mov x20, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x2\n"
- "add x22, x25, %x[in_stride]\n"
- "add %x[in], x22, %x[in_stride]\n"
- "csel x22, x22, %x[pad_row], GT\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
"cmp x20, #0x40\n"
- "blt 16f\n"
- "15:" // Tail row loop: Unroll column loop
- "ldr q21, [x25], #0x10\n"
- "ldr q17, [x22], #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Unroll column loop
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
"sub x20, x20, #0x40\n"
- "ldr q20, [x25], #0x10\n"
+ "zip1 v0.8h, v18.8h, v17.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip2 v31.8h, v18.8h, v17.8h\n"
+ "zip1 v30.8h, v19.8h, v16.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip2 v29.8h, v19.8h, v16.8h\n"
+ "zip1 v28.8h, v18.8h, v17.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip2 v27.8h, v18.8h, v17.8h\n"
+ "zip1 v26.8h, v19.8h, v16.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip2 v25.8h, v19.8h, v16.8h\n"
"cmp x20, #0x40\n"
- "ldr q16, [x22], #0x10\n"
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q22, [x25], #0x10\n"
- "zip1 v0.8h, v21.8h, v17.8h\n"
- "zip2 v31.8h, v21.8h, v17.8h\n"
- "ldr q17, [x22], #0x10\n"
- "ldr q21, [x25], #0x10\n"
- "zip1 v30.8h, v20.8h, v16.8h\n"
- "zip2 v29.8h, v20.8h, v16.8h\n"
- "ldr q16, [x22], #0x10\n"
- "ldr q20, [x25], #0x10\n"
- "zip1 v28.8h, v19.8h, v18.8h\n"
- "zip2 v27.8h, v19.8h, v18.8h\n"
- "ldr q19, [x22], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v24.8h, v18.8h, v17.8h\n"
+ "zip2 v23.8h, v18.8h, v17.8h\n"
"ldr q18, [x25], #0x10\n"
- "zip1 v26.8h, v22.8h, v17.8h\n"
- "zip2 v25.8h, v22.8h, v17.8h\n"
- "ldr q17, [x22], #0x10\n"
- "ldr q24, [x25], #0x10\n"
- "zip1 v23.8h, v21.8h, v16.8h\n"
- "zip2 v22.8h, v21.8h, v16.8h\n"
- "ldr q16, [x22], #0x10\n"
- "str q0, [x23, #0x0]\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
- "str q31, [x23, #0x10]\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip1 v22.8h, v19.8h, v16.8h\n"
+ "zip2 v21.8h, v19.8h, v16.8h\n"
+ "ldr q20, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "str q0, [x21, #0x0]\n"
"zip1 v19.8h, v18.8h, v17.8h\n"
+ "str q31, [x21, #0x10]\n"
"zip2 v18.8h, v18.8h, v17.8h\n"
- "str q30, [x23, #0x20]\n"
- "zip1 v17.8h, v24.8h, v16.8h\n"
- "zip2 v16.8h, v24.8h, v16.8h\n"
- "str q29, [x23, #0x30]\n"
- "str q28, [x23, #0x40]\n"
- "str q27, [x23, #0x50]\n"
- "str q26, [x23, #0x60]\n"
- "str q25, [x23, #0x70]\n"
- "add x23, x23, %x[out_stride]\n"
- "str q23, [x23, #0x0]\n"
- "str q22, [x23, #0x10]\n"
- "str q21, [x23, #0x20]\n"
- "str q20, [x23, #0x30]\n"
- "str q19, [x23, #0x40]\n"
- "str q18, [x23, #0x50]\n"
- "str q17, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 15b\n"
- "16:" // Tail row loop: Unroll column loop skip
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q30, [x21, #0x20]\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q29, [x21, #0x30]\n"
+ "str q28, [x21, #0x40]\n"
+ "str q27, [x21, #0x50]\n"
+ "str q26, [x21, #0x60]\n"
+ "str q25, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q23, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q21, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Unroll column loop skip
"cmp x20, #0x20\n"
- "blt 18f\n"
- "17:" // Tail row loop: Column loop
- "ldr q21, [x25], #0x10\n"
- "ldr q16, [x22], #0x10\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: Column loop
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
"sub x20, x20, #0x20\n"
- "ldr q20, [x25], #0x10\n"
"cmp x20, #0x20\n"
- "ldr q19, [x22], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v24.8h, v18.8h, v17.8h\n"
+ "zip2 v23.8h, v18.8h, v17.8h\n"
"ldr q18, [x25], #0x10\n"
- "ldr q17, [x22], #0x10\n"
- "ldr q24, [x25], #0x10\n"
- "zip1 v23.8h, v21.8h, v16.8h\n"
- "zip2 v22.8h, v21.8h, v16.8h\n"
- "ldr q16, [x22], #0x10\n"
- "zip1 v21.8h, v20.8h, v19.8h\n"
- "zip2 v20.8h, v20.8h, v19.8h\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip1 v22.8h, v19.8h, v16.8h\n"
+ "zip2 v21.8h, v19.8h, v16.8h\n"
+ "ldr q20, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
"zip1 v19.8h, v18.8h, v17.8h\n"
"zip2 v18.8h, v18.8h, v17.8h\n"
- "zip1 v17.8h, v24.8h, v16.8h\n"
- "zip2 v16.8h, v24.8h, v16.8h\n"
- "str q23, [x23, #0x0]\n"
- "str q22, [x23, #0x10]\n"
- "str q21, [x23, #0x20]\n"
- "str q20, [x23, #0x30]\n"
- "str q19, [x23, #0x40]\n"
- "str q18, [x23, #0x50]\n"
- "str q17, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 17b\n"
- "18:" // Tail row loop: Column loop skip
- "cbz x20, 25f\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q24, [x21, #0x0]\n"
+ "str q23, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q21, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: Column loop skip
"cmp x20, #0x10\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "str q16, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 16 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 16 loop: loop
"ldr q18, [x25], #0x10\n"
- "ldr q17, [x22], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
"sub x20, x20, #0x10\n"
- "ldr q20, [x25], #0x10\n"
"cmp x20, #0x10\n"
- "ldr q16, [x22], #0x10\n"
+ "ldr q20, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
"zip1 v19.8h, v18.8h, v17.8h\n"
"zip2 v18.8h, v18.8h, v17.8h\n"
"zip1 v17.8h, v20.8h, v16.8h\n"
"zip2 v16.8h, v20.8h, v16.8h\n"
- "str q19, [x23, #0x0]\n"
- "str q18, [x23, #0x10]\n"
- "str q17, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "add x23, x23, #0x40\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 16 loop: skip
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 16 loop: skip
"cmp x20, #0x4\n"
- "blt 22f\n"
- "21:" // Tail row loop: width 4 loop: loop
+ "blt 21f\n"
+ "20:" // Tail row loop: width 4 loop: loop
"ldr d17, [x25], #0x8\n"
- "ldr d16, [x22], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
"zip1 v16.8h, v17.8h, v16.8h\n"
- "str q16, [x23, #0x0]\n"
- "add x23, x23, #0x10\n"
- "bge 21b\n"
- "22:" // Tail row loop: width 4 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 20b\n"
+ "21:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 24f\n"
- "23:" // Tail row loop: width 1 loop: loop
+ "blt 23f\n"
+ "22:" // Tail row loop: width 1 loop: loop
"ldr h17, [x25], #0x2\n"
- "ldr h16, [x22], #0x2\n"
+ "ldr h16, [x23], #0x2\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
"zip1 v16.8h, v17.8h, v16.8h\n"
- "str s16, [x23, #0x0]\n"
- "add x23, x23, #0x4\n"
- "bge 23b\n"
- "24:" // Tail row loop: width 1 loop: skip
- "25:" // Tail row loop: odd col skip
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 22b\n"
+ "23:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x80\n"
- "bge 14b\n"
- "26:" // Done
+ "bge 13b\n"
+ "24:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
index 44b6e7c9d4..222551909b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
@@ -34,186 +34,165 @@ void a64_transpose_interleave_48(uint16_t *out, const uint16_t *in, size_t width
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
"mov x24, %x[width]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x25, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
"cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Column loop
"ldr q27, [x25], #0x10\n"
- "ldr q26, [x22], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
"sub x24, x24, #0x18\n"
- "ldr q25, [x21], #0x10\n"
- "ldr q24, [x20], #0x10\n"
"cmp x24, #0x18\n"
+ "ldr q25, [x22], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
"ldr q23, [x25], #0x10\n"
- "ldr q22, [x22], #0x10\n"
- "ldr q21, [x21], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
"ldr q20, [x20], #0x10\n"
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "str q27, [x23, #0x0]\n"
- "str q23, [x23, #0x10]\n"
- "str q19, [x23, #0x20]\n"
- "str q26, [x23, #0x30]\n"
- "str q22, [x23, #0x40]\n"
- "str q18, [x23, #0x50]\n"
- "str q25, [x23, #0x60]\n"
- "str q21, [x23, #0x70]\n"
- "str q17, [x23, #0x80]\n"
- "str q24, [x23, #0x90]\n"
- "str q20, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
- "add x23, x23, %x[out_stride]\n"
+ "str q27, [x21, #0x0]\n"
+ "str q23, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q26, [x21, #0x30]\n"
+ "str q22, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q25, [x21, #0x60]\n"
+ "str q21, [x21, #0x70]\n"
+ "str q17, [x21, #0x80]\n"
+ "str q24, [x21, #0x90]\n"
+ "str q20, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Column loop skip
- "cbz x24, 10f\n"
"cmp x24, #0x10\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "str q16, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "str q16, [x23, #0x80]\n"
- "str q16, [x23, #0x90]\n"
- "str q16, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
"blt 5f\n"
"4:" // Main row loop: width 16 loop: loop
"ldr q23, [x25], #0x10\n"
- "ldr q22, [x22], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
"sub x24, x24, #0x10\n"
- "ldr q21, [x21], #0x10\n"
- "ldr q20, [x20], #0x10\n"
"cmp x24, #0x10\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "str q23, [x23, #0x0]\n"
- "str q19, [x23, #0x10]\n"
- "str q22, [x23, #0x30]\n"
- "str q18, [x23, #0x40]\n"
- "str q21, [x23, #0x60]\n"
- "str q17, [x23, #0x70]\n"
- "str q20, [x23, #0x90]\n"
- "str q16, [x23, #0xa0]\n"
- "add x23, x23, #0x20\n"
+ "str q23, [x21, #0x0]\n"
+ "str q19, [x21, #0x10]\n"
+ "str q22, [x21, #0x30]\n"
+ "str q18, [x21, #0x40]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q17, [x21, #0x70]\n"
+ "str q20, [x21, #0x90]\n"
+ "str q16, [x21, #0xa0]\n"
+ "add x21, x21, #0x20\n"
"bge 4b\n"
"5:" // Main row loop: width 16 loop: skip
"cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
"ldr d19, [x25], #0x8\n"
- "ldr d18, [x22], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"sub x24, x24, #0x4\n"
- "ldr d17, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
"cmp x24, #0x4\n"
- "str d19, [x23, #0x0]\n"
- "str d18, [x23, #0x30]\n"
- "str d17, [x23, #0x60]\n"
- "str d16, [x23, #0x90]\n"
- "add x23, x23, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str d19, [x21, #0x0]\n"
+ "str d18, [x21, #0x30]\n"
+ "str d17, [x21, #0x60]\n"
+ "str d16, [x21, #0x90]\n"
+ "add x21, x21, #0x8\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
"cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
"ldr h19, [x25], #0x2\n"
- "ldr h18, [x22], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
"sub x24, x24, #0x1\n"
- "ldr h17, [x21], #0x2\n"
- "ldr h16, [x20], #0x2\n"
"cmp x24, #0x1\n"
- "str h19, [x23, #0x0]\n"
- "str h18, [x23, #0x30]\n"
- "str h17, [x23, #0x60]\n"
- "str h16, [x23, #0x90]\n"
- "add x23, x23, #0x2\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str h19, [x21, #0x0]\n"
+ "str h18, [x21, #0x30]\n"
+ "str h17, [x21, #0x60]\n"
+ "str h16, [x21, #0x90]\n"
+ "add x21, x21, #0x2\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x4\n"
"add %x[out], %x[out], #0xc0\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x20, #0x18\n"
"add %x[in], x25, %x[in_stride]\n"
- "blt 14f\n"
- "13:" // Tail row loop: Column loop
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
"ldr q18, [x25], #0x10\n"
- "sub x20, x20, #0x18\n"
"ldr q17, [x25], #0x10\n"
- "ldr q16, [x25], #0x10\n"
+ "sub x20, x20, #0x18\n"
"cmp x20, #0x18\n"
- "str q18, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "ldr q16, [x25], #0x10\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
"cmp x20, #0x10\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "blt 16f\n"
- "15:" // Tail row loop: width 16 loop: loop
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
"ldr q17, [x25], #0x10\n"
- "sub x20, x20, #0x10\n"
"ldr q16, [x25], #0x10\n"
+ "sub x20, x20, #0x10\n"
"cmp x20, #0x10\n"
- "str q17, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "add x23, x23, #0x20\n"
- "bge 15b\n"
- "16:" // Tail row loop: width 16 loop: skip
+ "str q17, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
"cmp x20, #0x4\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr d16, [x25], #0x8\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
- "str d16, [x23, #0x0]\n"
- "add x23, x23, #0x8\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr h16, [x25], #0x2\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
- "str h16, [x23, #0x0]\n"
- "add x23, x23, #0x2\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x30\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x20", "x21", "x22", "x23", "x24", "x25"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
index ed12f5dfa9..7b9c7ecb30 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
@@ -41,10 +41,9 @@ void a64_transpose_interleave_4_1x16(uint8_t *out, const uint8_t *in, size_t wid
__asm__ __volatile__(
"1:" // Main row loop: Head
"mov x17, %x[in]\n"
- "cmp %x[height], #0xf\n"
- "mov x16, %x[width]\n"
- "mov x15, %x[out]\n"
- "add x14, x17, %x[in_stride]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "add x14, x15, %x[in_stride]\n"
"add x13, x14, %x[in_stride]\n"
"add x12, x13, %x[in_stride]\n"
"add x11, x12, %x[in_stride]\n"
@@ -57,231 +56,228 @@ void a64_transpose_interleave_4_1x16(uint8_t *out, const uint8_t *in, size_t wid
"add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
- "csel x21, x21, %x[pad_row], GE\n"
- "add %x[in], x20, %x[in_stride]\n"
- "csel x20, x20, %x[pad_row], GT\n"
- "cmp %x[height], #0xd\n"
+ "cmp %x[height], #0xf\n"
+ "add %x[in], x22, %x[in_stride]\n"
"csel x22, x22, %x[pad_row], GT\n"
"csel x23, x23, %x[pad_row], GE\n"
- "cmp %x[height], #0xb\n"
+ "cmp %x[height], #0xd\n"
"csel x24, x24, %x[pad_row], GT\n"
"csel x25, x25, %x[pad_row], GE\n"
- "cmp %x[height], #0x9\n"
+ "cmp %x[height], #0xb\n"
"csel x26, x26, %x[pad_row], GT\n"
"csel x27, x27, %x[pad_row], GE\n"
- "cmp %x[height], #0x7\n"
+ "cmp %x[height], #0x9\n"
"csel x28, x28, %x[pad_row], GT\n"
"csel x9, x9, %x[pad_row], GE\n"
- "cmp %x[height], #0x5\n"
+ "cmp %x[height], #0x7\n"
"csel x10, x10, %x[pad_row], GT\n"
"csel x11, x11, %x[pad_row], GE\n"
- "cmp %x[height], #0x3\n"
+ "cmp %x[height], #0x5\n"
+ "mov x21, %x[width]\n"
"csel x12, x12, %x[pad_row], GT\n"
"csel x13, x13, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x14, x14, %x[pad_row], GT\n"
+ "csel x15, x15, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
+ "csel x16, x16, %x[pad_row], GT\n"
+ "cmp x21, #0x10\n"
+ "mov x20, %x[out]\n"
"sub %x[height], %x[height], #0x10\n"
- "csel x14, x14, %x[pad_row], GT\n"
- "cmp x16, #0x10\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
"ldr q3, [x17], #0x10\n"
- "ldr q9, [x14], #0x10\n"
- "sub x16, x16, #0x10\n"
- "ldr q2, [x13], #0x10\n"
- "ldr q8, [x12], #0x10\n"
- "cmp x16, #0x10\n"
- "ldr q1, [x11], #0x10\n"
+ "ldr q9, [x16], #0x10\n"
+ "sub x21, x21, #0x10\n"
+ "cmp x21, #0x10\n"
+ "ldr q2, [x15], #0x10\n"
+ "ldr q8, [x14], #0x10\n"
+ "ldr q0, [x13], #0x10\n"
+ "ldr q31, [x12], #0x10\n"
+ "ldr q30, [x11], #0x10\n"
"ldr q7, [x10], #0x10\n"
- "ldr q0, [x9], #0x10\n"
- "ldr q6, [x28], #0x10\n"
- "ldr q31, [x27], #0x10\n"
- "ldr q30, [x26], #0x10\n"
- "ldr q29, [x25], #0x10\n"
- "ldr q28, [x24], #0x10\n"
- "ldr q26, [x23], #0x10\n"
- "ldr q25, [x22], #0x10\n"
- "ldr q24, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v23.16b, v3.16b, v31.16b\n"
- "zip1 v27.16b, v9.16b, v30.16b\n"
- "zip1 v22.16b, v2.16b, v29.16b\n"
- "zip1 v21.16b, v8.16b, v28.16b\n"
- "zip1 v20.16b, v1.16b, v26.16b\n"
- "zip1 v19.16b, v7.16b, v25.16b\n"
- "zip1 v18.16b, v0.16b, v24.16b\n"
- "zip1 v17.16b, v6.16b, v16.16b\n"
- "zip2 v5.16b, v3.16b, v31.16b\n"
- "zip2 v1.16b, v1.16b, v26.16b\n"
- "zip2 v4.16b, v2.16b, v29.16b\n"
- "zip2 v3.16b, v0.16b, v24.16b\n"
- "zip2 v2.16b, v9.16b, v30.16b\n"
- "zip2 v0.16b, v7.16b, v25.16b\n"
- "zip2 v31.16b, v8.16b, v28.16b\n"
- "zip2 v30.16b, v6.16b, v16.16b\n"
- "zip1 v26.16b, v23.16b, v20.16b\n"
- "zip1 v25.16b, v22.16b, v18.16b\n"
- "zip1 v24.16b, v27.16b, v19.16b\n"
- "zip1 v16.16b, v21.16b, v17.16b\n"
- "zip2 v29.16b, v23.16b, v20.16b\n"
- "zip2 v23.16b, v22.16b, v18.16b\n"
- "zip2 v22.16b, v27.16b, v19.16b\n"
- "zip2 v21.16b, v21.16b, v17.16b\n"
- "zip1 v28.16b, v5.16b, v1.16b\n"
- "zip1 v27.16b, v4.16b, v3.16b\n"
- "zip1 v20.16b, v2.16b, v0.16b\n"
- "zip1 v19.16b, v31.16b, v30.16b\n"
- "zip1 v18.16b, v26.16b, v25.16b\n"
- "zip1 v17.16b, v24.16b, v16.16b\n"
- "zip2 v26.16b, v26.16b, v25.16b\n"
- "zip2 v16.16b, v24.16b, v16.16b\n"
- "zip2 v1.16b, v5.16b, v1.16b\n"
- "zip2 v25.16b, v4.16b, v3.16b\n"
- "zip2 v0.16b, v2.16b, v0.16b\n"
- "zip2 v24.16b, v31.16b, v30.16b\n"
- "zip1 v31.16b, v29.16b, v23.16b\n"
- "zip1 v30.16b, v22.16b, v21.16b\n"
- "zip2 v29.16b, v29.16b, v23.16b\n"
- "zip2 v23.16b, v22.16b, v21.16b\n"
- "zip1 v22.16b, v28.16b, v27.16b\n"
- "zip1 v21.16b, v20.16b, v19.16b\n"
+ "ldr q29, [x9], #0x10\n"
+ "ldr q28, [x28], #0x10\n"
+ "zip1 v27.16b, v3.16b, v29.16b\n"
+ "zip1 v6.16b, v9.16b, v28.16b\n"
+ "ldr q25, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v26.16b, v2.16b, v25.16b\n"
+ "zip1 v1.16b, v8.16b, v24.16b\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x24], #0x10\n"
+ "zip1 v21.16b, v0.16b, v23.16b\n"
+ "zip1 v20.16b, v31.16b, v22.16b\n"
+ "ldr q19, [x23], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v17.16b, v30.16b, v19.16b\n"
+ "zip1 v16.16b, v7.16b, v18.16b\n"
+ "zip2 v5.16b, v3.16b, v29.16b\n"
+ "zip2 v0.16b, v0.16b, v23.16b\n"
+ "zip2 v4.16b, v2.16b, v25.16b\n"
+ "zip2 v3.16b, v30.16b, v19.16b\n"
+ "zip2 v2.16b, v9.16b, v28.16b\n"
+ "zip2 v31.16b, v31.16b, v22.16b\n"
+ "zip2 v30.16b, v8.16b, v24.16b\n"
+ "zip2 v29.16b, v7.16b, v18.16b\n"
+ "zip1 v25.16b, v27.16b, v21.16b\n"
+ "zip1 v24.16b, v26.16b, v17.16b\n"
+ "zip1 v23.16b, v6.16b, v20.16b\n"
+ "zip1 v22.16b, v1.16b, v16.16b\n"
+ "zip2 v28.16b, v27.16b, v21.16b\n"
+ "zip2 v27.16b, v26.16b, v17.16b\n"
+ "zip2 v26.16b, v6.16b, v20.16b\n"
+ "zip2 v21.16b, v1.16b, v16.16b\n"
+ "zip1 v1.16b, v5.16b, v0.16b\n"
+ "zip1 v20.16b, v4.16b, v3.16b\n"
+ "zip1 v19.16b, v2.16b, v31.16b\n"
+ "zip1 v16.16b, v30.16b, v29.16b\n"
+ "zip1 v18.16b, v25.16b, v24.16b\n"
+ "zip1 v17.16b, v23.16b, v22.16b\n"
+ "zip2 v25.16b, v25.16b, v24.16b\n"
+ "zip2 v24.16b, v23.16b, v22.16b\n"
+ "zip2 v0.16b, v5.16b, v0.16b\n"
+ "zip2 v23.16b, v4.16b, v3.16b\n"
+ "zip2 v31.16b, v2.16b, v31.16b\n"
+ "zip2 v22.16b, v30.16b, v29.16b\n"
+ "zip1 v30.16b, v28.16b, v27.16b\n"
+ "zip1 v29.16b, v26.16b, v21.16b\n"
"zip2 v28.16b, v28.16b, v27.16b\n"
- "zip2 v20.16b, v20.16b, v19.16b\n"
- "zip1 v19.16b, v18.16b, v17.16b\n"
+ "zip2 v27.16b, v26.16b, v21.16b\n"
+ "zip1 v26.16b, v1.16b, v20.16b\n"
+ "zip1 v21.16b, v19.16b, v16.16b\n"
+ "zip2 v20.16b, v1.16b, v20.16b\n"
+ "zip2 v19.16b, v19.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
"zip2 v18.16b, v18.16b, v17.16b\n"
- "zip1 v17.16b, v26.16b, v16.16b\n"
- "zip2 v16.16b, v26.16b, v16.16b\n"
- "zip1 v27.16b, v1.16b, v25.16b\n"
- "zip1 v26.16b, v0.16b, v24.16b\n"
- "str q19, [x15, #0x0]\n"
- "str q18, [x15, #0x10]\n"
- "zip2 v25.16b, v1.16b, v25.16b\n"
- "zip2 v24.16b, v0.16b, v24.16b\n"
- "str q17, [x15, #0x20]\n"
- "zip1 v19.16b, v31.16b, v30.16b\n"
- "zip2 v18.16b, v31.16b, v30.16b\n"
- "str q16, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
- "zip1 v17.16b, v29.16b, v23.16b\n"
- "zip2 v16.16b, v29.16b, v23.16b\n"
- "zip1 v23.16b, v22.16b, v21.16b\n"
- "zip2 v22.16b, v22.16b, v21.16b\n"
- "str q19, [x15, #0x0]\n"
- "str q18, [x15, #0x10]\n"
- "zip1 v21.16b, v28.16b, v20.16b\n"
- "zip2 v20.16b, v28.16b, v20.16b\n"
- "str q17, [x15, #0x20]\n"
- "zip1 v19.16b, v27.16b, v26.16b\n"
- "zip2 v18.16b, v27.16b, v26.16b\n"
- "str q16, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
+ "str q16, [x20, #0x0]\n"
"zip1 v17.16b, v25.16b, v24.16b\n"
"zip2 v16.16b, v25.16b, v24.16b\n"
- "str q23, [x15, #0x0]\n"
- "str q22, [x15, #0x10]\n"
- "str q21, [x15, #0x20]\n"
- "str q20, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
- "str q19, [x15, #0x0]\n"
- "str q18, [x15, #0x10]\n"
- "str q17, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
+ "str q18, [x20, #0x10]\n"
+ "str q17, [x20, #0x20]\n"
+ "zip1 v25.16b, v0.16b, v23.16b\n"
+ "zip1 v24.16b, v31.16b, v22.16b\n"
+ "str q16, [x20, #0x30]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip2 v23.16b, v0.16b, v23.16b\n"
+ "zip2 v22.16b, v31.16b, v22.16b\n"
+ "zip1 v16.16b, v30.16b, v29.16b\n"
+ "zip2 v17.16b, v30.16b, v29.16b\n"
+ "str q16, [x20, #0x0]\n"
+ "zip1 v16.16b, v28.16b, v27.16b\n"
+ "zip2 v18.16b, v28.16b, v27.16b\n"
+ "str q17, [x20, #0x10]\n"
+ "str q16, [x20, #0x20]\n"
+ "zip1 v17.16b, v26.16b, v21.16b\n"
+ "zip2 v16.16b, v26.16b, v21.16b\n"
+ "str q18, [x20, #0x30]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 v21.16b, v20.16b, v19.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "str q17, [x20, #0x0]\n"
+ "zip1 v19.16b, v25.16b, v24.16b\n"
+ "zip2 v18.16b, v25.16b, v24.16b\n"
+ "str q16, [x20, #0x10]\n"
+ "zip1 v17.16b, v23.16b, v22.16b\n"
+ "zip2 v16.16b, v23.16b, v22.16b\n"
+ "str q21, [x20, #0x20]\n"
+ "str q20, [x20, #0x30]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "str q19, [x20, #0x0]\n"
+ "str q18, [x20, #0x10]\n"
+ "str q17, [x20, #0x20]\n"
+ "str q16, [x20, #0x30]\n"
+ "add x20, x20, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cmp x16, #0x4\n"
+ "cmp x21, #0x4\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr s31, [x17], #0x4\n"
- "ldr s30, [x14], #0x4\n"
- "sub x16, x16, #0x4\n"
- "ldr s29, [x13], #0x4\n"
- "ldr s28, [x12], #0x4\n"
- "cmp x16, #0x4\n"
- "ldr s27, [x11], #0x4\n"
- "ldr s26, [x10], #0x4\n"
- "ldr s25, [x9], #0x4\n"
- "ldr s24, [x28], #0x4\n"
- "ldr s20, [x27], #0x4\n"
- "ldr s21, [x26], #0x4\n"
- "ldr s23, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr s19, [x23], #0x4\n"
- "ldr s18, [x22], #0x4\n"
- "ldr s17, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
- "zip1 v20.16b, v31.16b, v20.16b\n"
- "zip1 v21.16b, v30.16b, v21.16b\n"
- "zip1 v23.16b, v29.16b, v23.16b\n"
- "zip1 v22.16b, v28.16b, v22.16b\n"
- "zip1 v19.16b, v27.16b, v19.16b\n"
- "zip1 v18.16b, v26.16b, v18.16b\n"
+ "ldr s21, [x17], #0x4\n"
+ "ldr s23, [x16], #0x4\n"
+ "sub x21, x21, #0x4\n"
+ "cmp x21, #0x4\n"
+ "ldr s20, [x15], #0x4\n"
+ "ldr s22, [x14], #0x4\n"
+ "ldr s19, [x13], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "ldr s25, [x11], #0x4\n"
+ "ldr s24, [x10], #0x4\n"
+ "ldr s17, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "zip1 v21.16b, v21.16b, v17.16b\n"
+ "zip1 v23.16b, v23.16b, v16.16b\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v20.16b, v20.16b, v17.16b\n"
+ "zip1 v22.16b, v22.16b, v16.16b\n"
+ "ldr s17, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v19.16b, v19.16b, v17.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s17, [x23], #0x4\n"
+ "ldr s16, [x22], #0x4\n"
"zip1 v17.16b, v25.16b, v17.16b\n"
"zip1 v16.16b, v24.16b, v16.16b\n"
- "zip1 v20.16b, v20.16b, v19.16b\n"
- "zip1 v21.16b, v21.16b, v18.16b\n"
- "zip1 v19.16b, v23.16b, v17.16b\n"
+ "zip1 v21.16b, v21.16b, v19.16b\n"
+ "zip1 v20.16b, v20.16b, v17.16b\n"
+ "zip1 v19.16b, v23.16b, v18.16b\n"
"zip1 v16.16b, v22.16b, v16.16b\n"
- "zip1 v18.16b, v20.16b, v19.16b\n"
- "zip1 v17.16b, v21.16b, v16.16b\n"
- "zip2 v20.16b, v20.16b, v19.16b\n"
- "zip2 v16.16b, v21.16b, v16.16b\n"
- "zip1 v19.16b, v18.16b, v17.16b\n"
+ "zip1 v18.16b, v21.16b, v20.16b\n"
+ "zip1 v17.16b, v19.16b, v16.16b\n"
+ "zip2 v20.16b, v21.16b, v20.16b\n"
+ "zip2 v19.16b, v19.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
"zip2 v18.16b, v18.16b, v17.16b\n"
- "zip1 v17.16b, v20.16b, v16.16b\n"
- "zip2 v16.16b, v20.16b, v16.16b\n"
- "str q19, [x15, #0x0]\n"
- "str q18, [x15, #0x10]\n"
- "str q17, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
+ "str q16, [x20, #0x0]\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip2 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x20, #0x10]\n"
+ "str q17, [x20, #0x20]\n"
+ "str q16, [x20, #0x30]\n"
+ "add x20, x20, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x16, 7f\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x15, #0x0]\n"
- "str q16, [x15, #0x10]\n"
- "str q16, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
+ "cmp x21, #0x1\n"
+ "blt 7f\n"
"6:" // Main row loop: width 1 loop: loop
- "ldr b31, [x17], #0x1\n"
- "ldr b30, [x14], #0x1\n"
- "sub x16, x16, #0x1\n"
- "ldr b29, [x13], #0x1\n"
- "ldr b28, [x12], #0x1\n"
- "cmp x16, #0x1\n"
- "ldr b27, [x11], #0x1\n"
- "ldr b26, [x10], #0x1\n"
- "ldr b25, [x9], #0x1\n"
- "ldr b24, [x28], #0x1\n"
- "ldr b23, [x27], #0x1\n"
- "ldr b22, [x26], #0x1\n"
- "ldr b21, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b19, [x23], #0x1\n"
- "ldr b18, [x22], #0x1\n"
- "ldr b17, [x21], #0x1\n"
- "ldr b16, [x20], #0x1\n"
- "zip1 v23.16b, v31.16b, v23.16b\n"
- "zip1 v22.16b, v30.16b, v22.16b\n"
- "zip1 v21.16b, v29.16b, v21.16b\n"
- "zip1 v20.16b, v28.16b, v20.16b\n"
- "zip1 v19.16b, v27.16b, v19.16b\n"
- "zip1 v18.16b, v26.16b, v18.16b\n"
+ "ldr b23, [x17], #0x1\n"
+ "ldr b22, [x16], #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "cmp x21, #0x1\n"
+ "ldr b21, [x15], #0x1\n"
+ "ldr b20, [x14], #0x1\n"
+ "ldr b19, [x13], #0x1\n"
+ "ldr b18, [x12], #0x1\n"
+ "ldr b25, [x11], #0x1\n"
+ "ldr b24, [x10], #0x1\n"
+ "ldr b17, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "zip1 v23.16b, v23.16b, v17.16b\n"
+ "zip1 v22.16b, v22.16b, v16.16b\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
+ "zip1 v21.16b, v21.16b, v17.16b\n"
+ "zip1 v20.16b, v20.16b, v16.16b\n"
+ "ldr b17, [x25], #0x1\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v19.16b, v19.16b, v17.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b17, [x23], #0x1\n"
+ "ldr b16, [x22], #0x1\n"
"zip1 v17.16b, v25.16b, v17.16b\n"
"zip1 v16.16b, v24.16b, v16.16b\n"
"zip1 v19.16b, v23.16b, v19.16b\n"
- "zip1 v18.16b, v22.16b, v18.16b\n"
"zip1 v17.16b, v21.16b, v17.16b\n"
+ "zip1 v18.16b, v22.16b, v18.16b\n"
"zip1 v16.16b, v20.16b, v16.16b\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str q16, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "str q16, [x20, #0x0]\n"
+ "add x20, x20, #0x10\n"
"bge 6b\n"
- "7:" // Main row loop: odd col skip
+ "7:" // Main row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x40\n"
"bge 1b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
index 282375b3a3..94a4b5d07f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
@@ -43,13 +43,11 @@ void a64_transpose_interleave_4_1x4(uint8_t *out, const uint8_t *in, size_t widt
"blt 8f\n"
"1:" // Main row loop: Head
"mov x17, %x[in]\n"
- "mov x16, %x[width]\n"
- "mov x15, %x[out]\n"
- "sub %x[height], %x[height], #0x10\n"
- "add x14, x17, %x[in_stride]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "add x14, x15, %x[in_stride]\n"
"add x13, x14, %x[in_stride]\n"
"add x12, x13, %x[in_stride]\n"
- "cmp x16, #0x10\n"
"add x11, x12, %x[in_stride]\n"
"add x10, x11, %x[in_stride]\n"
"add x9, x10, %x[in_stride]\n"
@@ -57,170 +55,168 @@ void a64_transpose_interleave_4_1x4(uint8_t *out, const uint8_t *in, size_t widt
"add x27, x28, %x[in_stride]\n"
"add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "add x23, x24, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
"add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x10\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x10\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ldr q19, [x17], #0x10\n"
- "ldr q18, [x14], #0x10\n"
- "sub x16, x16, #0x10\n"
- "ldr q17, [x13], #0x10\n"
- "ldr q16, [x12], #0x10\n"
- "cmp x16, #0x10\n"
- "ldr q27, [x11], #0x10\n"
- "ldr q26, [x10], #0x10\n"
- "ldr q25, [x9], #0x10\n"
- "ldr q24, [x28], #0x10\n"
- "ldr q23, [x27], #0x10\n"
- "ldr q22, [x26], #0x10\n"
- "zip1 v5.16b, v19.16b, v17.16b\n"
- "zip1 v4.16b, v18.16b, v16.16b\n"
- "ldr q21, [x25], #0x10\n"
- "ldr q20, [x24], #0x10\n"
- "zip2 v3.16b, v19.16b, v17.16b\n"
- "zip2 v2.16b, v18.16b, v16.16b\n"
- "ldr q19, [x23], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v1.16b, v27.16b, v25.16b\n"
- "zip1 v0.16b, v26.16b, v24.16b\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q21, [x17], #0x10\n"
+ "ldr q20, [x16], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q17, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v3.16b, v21.16b, v17.16b\n"
+ "zip1 v2.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x13], #0x10\n"
+ "ldr q18, [x12], #0x10\n"
+ "zip2 v1.16b, v21.16b, v17.16b\n"
+ "zip2 v0.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x11], #0x10\n"
+ "ldr q16, [x10], #0x10\n"
+ "zip1 v31.16b, v19.16b, v17.16b\n"
+ "zip1 v30.16b, v18.16b, v16.16b\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v29.16b, v19.16b, v17.16b\n"
+ "zip2 v28.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v23.16b, v21.16b, v17.16b\n"
+ "zip1 v22.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v27.16b, v21.16b, v17.16b\n"
+ "zip2 v26.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "zip2 v31.16b, v27.16b, v25.16b\n"
- "zip2 v30.16b, v26.16b, v24.16b\n"
- "zip1 v25.16b, v23.16b, v21.16b\n"
- "zip1 v24.16b, v22.16b, v20.16b\n"
- "zip2 v29.16b, v23.16b, v21.16b\n"
- "zip2 v28.16b, v22.16b, v20.16b\n"
- "zip1 v23.16b, v19.16b, v17.16b\n"
- "zip1 v22.16b, v18.16b, v16.16b\n"
- "zip2 v27.16b, v19.16b, v17.16b\n"
- "zip2 v26.16b, v18.16b, v16.16b\n"
- "zip1 v21.16b, v5.16b, v4.16b\n"
- "zip1 v20.16b, v1.16b, v0.16b\n"
- "zip1 v19.16b, v25.16b, v24.16b\n"
- "zip1 v18.16b, v23.16b, v22.16b\n"
- "zip2 v17.16b, v5.16b, v4.16b\n"
- "zip2 v16.16b, v1.16b, v0.16b\n"
- "str q21, [x15, #0x0]\n"
- "str q20, [x15, #0x10]\n"
- "zip2 v25.16b, v25.16b, v24.16b\n"
- "zip2 v24.16b, v23.16b, v22.16b\n"
- "str q19, [x15, #0x20]\n"
- "zip1 v23.16b, v3.16b, v2.16b\n"
- "zip1 v22.16b, v31.16b, v30.16b\n"
- "str q18, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
- "zip1 v21.16b, v29.16b, v28.16b\n"
- "zip1 v20.16b, v27.16b, v26.16b\n"
- "str q17, [x15, #0x0]\n"
+ "zip1 v21.16b, v19.16b, v17.16b\n"
+ "zip1 v20.16b, v18.16b, v16.16b\n"
+ "zip2 v25.16b, v19.16b, v17.16b\n"
+ "zip2 v24.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v3.16b, v2.16b\n"
+ "zip1 v18.16b, v31.16b, v30.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v23.16b, v22.16b\n"
+ "zip1 v16.16b, v21.16b, v20.16b\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
"zip2 v19.16b, v3.16b, v2.16b\n"
"zip2 v18.16b, v31.16b, v30.16b\n"
- "str q16, [x15, #0x10]\n"
- "zip2 v17.16b, v29.16b, v28.16b\n"
- "zip2 v16.16b, v27.16b, v26.16b\n"
- "str q25, [x15, #0x20]\n"
- "str q24, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
- "str q23, [x15, #0x0]\n"
- "str q22, [x15, #0x10]\n"
- "str q21, [x15, #0x20]\n"
- "str q20, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
- "str q19, [x15, #0x0]\n"
- "str q18, [x15, #0x10]\n"
- "str q17, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v17.16b, v23.16b, v22.16b\n"
+ "zip2 v16.16b, v21.16b, v20.16b\n"
+ "str q19, [x21, #0x0]\n"
+ "zip1 v23.16b, v1.16b, v0.16b\n"
+ "zip1 v22.16b, v29.16b, v28.16b\n"
+ "str q18, [x21, #0x10]\n"
+ "zip1 v21.16b, v27.16b, v26.16b\n"
+ "zip1 v20.16b, v25.16b, v24.16b\n"
+ "str q17, [x21, #0x20]\n"
+ "zip2 v19.16b, v1.16b, v0.16b\n"
+ "zip2 v18.16b, v29.16b, v28.16b\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v17.16b, v27.16b, v26.16b\n"
+ "zip2 v16.16b, v25.16b, v24.16b\n"
+ "str q23, [x21, #0x0]\n"
+ "str q22, [x21, #0x10]\n"
+ "str q21, [x21, #0x20]\n"
+ "str q20, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cmp x16, #0x4\n"
+ "cmp x24, #0x4\n"
"blt 5f\n"
"4:" // Main row loop: Column loop
- "ldr s23, [x17], #0x4\n"
- "ldr s21, [x14], #0x4\n"
- "sub x16, x16, #0x4\n"
- "ldr s20, [x13], #0x4\n"
- "ldr s17, [x12], #0x4\n"
- "cmp x16, #0x4\n"
- "ldr s22, [x11], #0x4\n"
- "ldr s19, [x10], #0x4\n"
- "ldr s18, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s27, [x27], #0x4\n"
- "ldr s26, [x26], #0x4\n"
- "zip1 v25.16b, v23.16b, v20.16b\n"
- "zip1 v21.16b, v21.16b, v17.16b\n"
+ "ldr s19, [x17], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x13], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x11], #0x4\n"
+ "ldr s16, [x10], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
"ldr s20, [x25], #0x4\n"
- "ldr s17, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "zip1 v22.16b, v22.16b, v18.16b\n"
- "zip1 v19.16b, v19.16b, v16.16b\n"
- "ldr s18, [x21], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
"ldr s16, [x20], #0x4\n"
- "zip1 v21.16b, v25.16b, v21.16b\n"
- "zip1 v20.16b, v27.16b, v20.16b\n"
- "zip1 v17.16b, v26.16b, v17.16b\n"
- "zip1 v19.16b, v22.16b, v19.16b\n"
- "zip1 v18.16b, v24.16b, v18.16b\n"
- "zip1 v16.16b, v23.16b, v16.16b\n"
- "str q21, [x15, #0x0]\n"
"zip1 v17.16b, v20.16b, v17.16b\n"
- "str q19, [x15, #0x10]\n"
- "zip1 v16.16b, v18.16b, v16.16b\n"
- "str q17, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
- "add x15, x15, %x[out_stride]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q22, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q18, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 4b\n"
"5:" // Main row loop: Column loop skip
- "cbz x16, 7f\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x15, #0x0]\n"
- "str q16, [x15, #0x10]\n"
- "str q16, [x15, #0x20]\n"
- "str q16, [x15, #0x30]\n"
+ "cmp x24, #0x1\n"
+ "blt 7f\n"
"6:" // Main row loop: width 1 loop: loop
- "ldr b23, [x17], #0x1\n"
- "ldr b21, [x14], #0x1\n"
- "sub x16, x16, #0x1\n"
- "ldr b20, [x13], #0x1\n"
- "ldr b19, [x12], #0x1\n"
- "cmp x16, #0x1\n"
- "ldr b22, [x11], #0x1\n"
- "ldr b18, [x10], #0x1\n"
- "ldr b17, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b27, [x27], #0x1\n"
- "ldr b26, [x26], #0x1\n"
- "zip1 v25.16b, v23.16b, v20.16b\n"
- "zip1 v21.16b, v21.16b, v19.16b\n"
+ "ldr b19, [x17], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x13], #0x1\n"
+ "ldr b18, [x12], #0x1\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x11], #0x1\n"
+ "ldr b16, [x10], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
"ldr b20, [x25], #0x1\n"
- "ldr b19, [x24], #0x1\n"
- "ldr b24, [x23], #0x1\n"
- "ldr b23, [x22], #0x1\n"
- "zip1 v22.16b, v22.16b, v17.16b\n"
- "zip1 v17.16b, v18.16b, v16.16b\n"
- "ldr b18, [x21], #0x1\n"
+ "ldr b19, [x23], #0x1\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x22], #0x1\n"
"ldr b16, [x20], #0x1\n"
- "zip1 v21.16b, v25.16b, v21.16b\n"
- "zip1 v20.16b, v27.16b, v20.16b\n"
- "zip1 v19.16b, v26.16b, v19.16b\n"
- "zip1 v17.16b, v22.16b, v17.16b\n"
- "zip1 v18.16b, v24.16b, v18.16b\n"
- "zip1 v16.16b, v23.16b, v16.16b\n"
- "str s21, [x15, #0x0]\n"
- "str s17, [x15, #0x10]\n"
- "zip1 v17.16b, v20.16b, v19.16b\n"
- "zip1 v16.16b, v18.16b, v16.16b\n"
- "str s17, [x15, #0x20]\n"
- "str s16, [x15, #0x30]\n"
- "add x15, x15, #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s22, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s21, [x21, #0x10]\n"
+ "str s18, [x21, #0x20]\n"
+ "str s16, [x21, #0x30]\n"
+ "add x21, x21, #0x4\n"
"bge 6b\n"
- "7:" // Main row loop: odd col skip
+ "7:" // Main row loop: width 1 loop: skip
"cmp %x[height], #0x10\n"
"add %x[out], %x[out], #0x40\n"
"bge 1b\n"
@@ -228,85 +224,84 @@ void a64_transpose_interleave_4_1x4(uint8_t *out, const uint8_t *in, size_t widt
"8:" // Main loop skip
"9:" // Tail row loop: Head
"mov x17, %x[in]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
"mov x20, %x[width]\n"
+ "add x14, x15, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x15, %x[out]\n"
- "add x14, x17, %x[in_stride]\n"
- "add x13, x14, %x[in_stride]\n"
- "add x12, x13, %x[in_stride]\n"
- "csel x13, x13, %x[pad_row], GE\n"
- "add %x[in], x12, %x[in_stride]\n"
- "csel x12, x12, %x[pad_row], GT\n"
- "cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add %x[in], x14, %x[in_stride]\n"
"csel x14, x14, %x[pad_row], GT\n"
+ "csel x15, x15, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x16, x16, %x[pad_row], GT\n"
"cmp x20, #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 11f\n"
"10:" // Tail row loop: Unroll column loop
- "ldr q20, [x17], #0x10\n"
- "ldr q21, [x14], #0x10\n"
+ "ldr q19, [x17], #0x10\n"
+ "ldr q21, [x16], #0x10\n"
"sub x20, x20, #0x10\n"
- "ldr q19, [x13], #0x10\n"
- "ldr q16, [x12], #0x10\n"
"cmp x20, #0x10\n"
- "zip1 v18.16b, v20.16b, v19.16b\n"
+ "ldr q18, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v20.16b, v19.16b, v18.16b\n"
"zip1 v17.16b, v21.16b, v16.16b\n"
- "zip2 v20.16b, v20.16b, v19.16b\n"
- "zip2 v19.16b, v21.16b, v16.16b\n"
- "zip1 v16.16b, v18.16b, v17.16b\n"
- "zip2 v18.16b, v18.16b, v17.16b\n"
- "str q16, [x15, #0x0]\n"
- "add x15, x15, %x[out_stride]\n"
- "zip1 v17.16b, v20.16b, v19.16b\n"
- "zip2 v16.16b, v20.16b, v19.16b\n"
- "str q18, [x15, #0x0]\n"
- "add x15, x15, %x[out_stride]\n"
- "str q17, [x15, #0x0]\n"
- "add x15, x15, %x[out_stride]\n"
- "str q16, [x15, #0x0]\n"
- "add x15, x15, %x[out_stride]\n"
+ "zip2 v19.16b, v19.16b, v18.16b\n"
+ "zip2 v18.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v17.16b, v19.16b, v18.16b\n"
+ "zip2 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [x21, #0x0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 10b\n"
"11:" // Tail row loop: Unroll column loop skip
"cmp x20, #0x4\n"
"blt 13f\n"
"12:" // Tail row loop: Column loop
"ldr s19, [x17], #0x4\n"
- "ldr s18, [x14], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
"sub x20, x20, #0x4\n"
- "ldr s17, [x13], #0x4\n"
- "ldr s16, [x12], #0x4\n"
"cmp x20, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str q16, [x15, #0x0]\n"
- "add x15, x15, %x[out_stride]\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 12b\n"
"13:" // Tail row loop: Column loop skip
- "cbz x20, 15f\n"
- "movi v16.16b, #0x0\n"
- "str q16, [x15, #0x0]\n"
+ "cmp x20, #0x1\n"
+ "blt 15f\n"
"14:" // Tail row loop: width 1 loop: loop
"ldr b19, [x17], #0x1\n"
- "ldr b18, [x14], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
"sub x20, x20, #0x1\n"
- "ldr b17, [x13], #0x1\n"
- "ldr b16, [x12], #0x1\n"
"cmp x20, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
"zip1 v17.16b, v19.16b, v17.16b\n"
"zip1 v16.16b, v18.16b, v16.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- "str s16, [x15, #0x0]\n"
- "add x15, x15, #0x4\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
"bge 14b\n"
- "15:" // Tail row loop: odd col skip
+ "15:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x10\n"
"bge 9b\n"
"16:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
index 387c6adabd..03b134e422 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
@@ -34,201 +34,175 @@ void a64_transpose_interleave_64(uint16_t *out, const uint16_t *in, size_t width
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
"mov x24, %x[width]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x25, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
"cmp x24, #0x20\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Column loop
"ldr q31, [x25], #0x10\n"
- "ldr q30, [x22], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
"sub x24, x24, #0x20\n"
- "ldr q29, [x21], #0x10\n"
- "ldr q28, [x20], #0x10\n"
"cmp x24, #0x20\n"
+ "ldr q29, [x22], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
"ldr q27, [x25], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "ldr q25, [x21], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
"ldr q24, [x20], #0x10\n"
"ldr q23, [x25], #0x10\n"
- "ldr q22, [x22], #0x10\n"
- "ldr q21, [x21], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
"ldr q20, [x20], #0x10\n"
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "str q31, [x23, #0x0]\n"
- "str q27, [x23, #0x10]\n"
- "str q23, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q30, [x23, #0x40]\n"
- "str q26, [x23, #0x50]\n"
- "str q22, [x23, #0x60]\n"
- "str q18, [x23, #0x70]\n"
- "str q29, [x23, #0x80]\n"
- "str q25, [x23, #0x90]\n"
- "str q21, [x23, #0xa0]\n"
- "str q17, [x23, #0xb0]\n"
- "str q28, [x23, #0xc0]\n"
- "str q24, [x23, #0xd0]\n"
- "str q20, [x23, #0xe0]\n"
- "str q16, [x23, #0xf0]\n"
- "add x23, x23, %x[out_stride]\n"
+ "str q31, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q23, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q30, [x21, #0x40]\n"
+ "str q26, [x21, #0x50]\n"
+ "str q22, [x21, #0x60]\n"
+ "str q18, [x21, #0x70]\n"
+ "str q29, [x21, #0x80]\n"
+ "str q25, [x21, #0x90]\n"
+ "str q21, [x21, #0xa0]\n"
+ "str q17, [x21, #0xb0]\n"
+ "str q28, [x21, #0xc0]\n"
+ "str q24, [x21, #0xd0]\n"
+ "str q20, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Column loop skip
- "cbz x24, 10f\n"
"cmp x24, #0x10\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "str q16, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "str q16, [x23, #0x80]\n"
- "str q16, [x23, #0x90]\n"
- "str q16, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
- "str q16, [x23, #0xc0]\n"
- "str q16, [x23, #0xd0]\n"
- "str q16, [x23, #0xe0]\n"
- "str q16, [x23, #0xf0]\n"
"blt 5f\n"
"4:" // Main row loop: width 16 loop: loop
"ldr q23, [x25], #0x10\n"
- "ldr q22, [x22], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
"sub x24, x24, #0x10\n"
- "ldr q21, [x21], #0x10\n"
- "ldr q20, [x20], #0x10\n"
"cmp x24, #0x10\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "str q23, [x23, #0x0]\n"
- "str q19, [x23, #0x10]\n"
- "str q22, [x23, #0x40]\n"
- "str q18, [x23, #0x50]\n"
- "str q21, [x23, #0x80]\n"
- "str q17, [x23, #0x90]\n"
- "str q20, [x23, #0xc0]\n"
- "str q16, [x23, #0xd0]\n"
- "add x23, x23, #0x20\n"
+ "str q23, [x21, #0x0]\n"
+ "str q19, [x21, #0x10]\n"
+ "str q22, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q21, [x21, #0x80]\n"
+ "str q17, [x21, #0x90]\n"
+ "str q20, [x21, #0xc0]\n"
+ "str q16, [x21, #0xd0]\n"
+ "add x21, x21, #0x20\n"
"bge 4b\n"
"5:" // Main row loop: width 16 loop: skip
"cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
"ldr d19, [x25], #0x8\n"
- "ldr d18, [x22], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
"sub x24, x24, #0x4\n"
- "ldr d17, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
"cmp x24, #0x4\n"
- "str d19, [x23, #0x0]\n"
- "str d18, [x23, #0x40]\n"
- "str d17, [x23, #0x80]\n"
- "str d16, [x23, #0xc0]\n"
- "add x23, x23, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str d19, [x21, #0x0]\n"
+ "str d18, [x21, #0x40]\n"
+ "str d17, [x21, #0x80]\n"
+ "str d16, [x21, #0xc0]\n"
+ "add x21, x21, #0x8\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
"cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
"ldr h19, [x25], #0x2\n"
- "ldr h18, [x22], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
"sub x24, x24, #0x1\n"
- "ldr h17, [x21], #0x2\n"
- "ldr h16, [x20], #0x2\n"
"cmp x24, #0x1\n"
- "str h19, [x23, #0x0]\n"
- "str h18, [x23, #0x40]\n"
- "str h17, [x23, #0x80]\n"
- "str h16, [x23, #0xc0]\n"
- "add x23, x23, #0x2\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str h19, [x21, #0x0]\n"
+ "str h18, [x21, #0x40]\n"
+ "str h17, [x21, #0x80]\n"
+ "str h16, [x21, #0xc0]\n"
+ "add x21, x21, #0x2\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x4\n"
"add %x[out], %x[out], #0x100\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x20, #0x20\n"
"add %x[in], x25, %x[in_stride]\n"
- "blt 14f\n"
- "13:" // Tail row loop: Column loop
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
"ldr q19, [x25], #0x10\n"
- "sub x20, x20, #0x20\n"
"ldr q18, [x25], #0x10\n"
- "ldr q17, [x25], #0x10\n"
+ "sub x20, x20, #0x20\n"
"cmp x20, #0x20\n"
+ "ldr q17, [x25], #0x10\n"
"ldr q16, [x25], #0x10\n"
- "str q19, [x23, #0x0]\n"
- "str q18, [x23, #0x10]\n"
- "str q17, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
"cmp x20, #0x10\n"
- "movi v16.8h, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "blt 16f\n"
- "15:" // Tail row loop: width 16 loop: loop
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
"ldr q17, [x25], #0x10\n"
- "sub x20, x20, #0x10\n"
"ldr q16, [x25], #0x10\n"
+ "sub x20, x20, #0x10\n"
"cmp x20, #0x10\n"
- "str q17, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "add x23, x23, #0x20\n"
- "bge 15b\n"
- "16:" // Tail row loop: width 16 loop: skip
+ "str q17, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
"cmp x20, #0x4\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr d16, [x25], #0x8\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
- "str d16, [x23, #0x0]\n"
- "add x23, x23, #0x8\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr h16, [x25], #0x2\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
- "str h16, [x23, #0x0]\n"
- "add x23, x23, #0x2\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x40\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
index e5778860a4..2719d24750 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
@@ -34,251 +34,215 @@ void a64_transpose_interleave_96(uint32_t *out, const uint32_t *in, size_t width
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "blt 11f\n"
+ "blt 10f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
"mov x24, %x[width]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x25, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
"cmp x24, #0x18\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Column loop
"ldr q7, [x25], #0x10\n"
- "ldr q6, [x22], #0x10\n"
+ "ldr q6, [x23], #0x10\n"
"sub x24, x24, #0x18\n"
- "ldr q5, [x21], #0x10\n"
- "ldr q4, [x20], #0x10\n"
"cmp x24, #0x18\n"
+ "ldr q5, [x22], #0x10\n"
+ "ldr q4, [x20], #0x10\n"
"ldr q3, [x25], #0x10\n"
- "ldr q2, [x22], #0x10\n"
- "ldr q1, [x21], #0x10\n"
+ "ldr q2, [x23], #0x10\n"
+ "ldr q1, [x22], #0x10\n"
"ldr q0, [x20], #0x10\n"
"ldr q31, [x25], #0x10\n"
- "ldr q30, [x22], #0x10\n"
- "ldr q29, [x21], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
+ "ldr q29, [x22], #0x10\n"
"ldr q28, [x20], #0x10\n"
"ldr q27, [x25], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "ldr q25, [x21], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
"ldr q24, [x20], #0x10\n"
"ldr q23, [x25], #0x10\n"
- "ldr q22, [x22], #0x10\n"
- "ldr q21, [x21], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
"ldr q20, [x20], #0x10\n"
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "str q7, [x23, #0x0]\n"
- "str q3, [x23, #0x10]\n"
- "str q31, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q23, [x23, #0x40]\n"
- "str q19, [x23, #0x50]\n"
- "str q6, [x23, #0x60]\n"
- "str q2, [x23, #0x70]\n"
- "str q30, [x23, #0x80]\n"
- "str q26, [x23, #0x90]\n"
- "str q22, [x23, #0xa0]\n"
- "str q18, [x23, #0xb0]\n"
- "str q5, [x23, #0xc0]\n"
- "str q1, [x23, #0xd0]\n"
- "str q29, [x23, #0xe0]\n"
- "str q25, [x23, #0xf0]\n"
- "str q21, [x23, #0x100]\n"
- "str q17, [x23, #0x110]\n"
- "str q4, [x23, #0x120]\n"
- "str q0, [x23, #0x130]\n"
- "str q28, [x23, #0x140]\n"
- "str q24, [x23, #0x150]\n"
- "str q20, [x23, #0x160]\n"
- "str q16, [x23, #0x170]\n"
- "add x23, x23, %x[out_stride]\n"
+ "str q7, [x21, #0x0]\n"
+ "str q3, [x21, #0x10]\n"
+ "str q31, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q19, [x21, #0x50]\n"
+ "str q6, [x21, #0x60]\n"
+ "str q2, [x21, #0x70]\n"
+ "str q30, [x21, #0x80]\n"
+ "str q26, [x21, #0x90]\n"
+ "str q22, [x21, #0xa0]\n"
+ "str q18, [x21, #0xb0]\n"
+ "str q5, [x21, #0xc0]\n"
+ "str q1, [x21, #0xd0]\n"
+ "str q29, [x21, #0xe0]\n"
+ "str q25, [x21, #0xf0]\n"
+ "str q21, [x21, #0x100]\n"
+ "str q17, [x21, #0x110]\n"
+ "str q4, [x21, #0x120]\n"
+ "str q0, [x21, #0x130]\n"
+ "str q28, [x21, #0x140]\n"
+ "str q24, [x21, #0x150]\n"
+ "str q20, [x21, #0x160]\n"
+ "str q16, [x21, #0x170]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Column loop skip
- "cbz x24, 10f\n"
"cmp x24, #0x10\n"
- "movi v16.4s, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "str q16, [x23, #0x60]\n"
- "str q16, [x23, #0x70]\n"
- "str q16, [x23, #0x80]\n"
- "str q16, [x23, #0x90]\n"
- "str q16, [x23, #0xa0]\n"
- "str q16, [x23, #0xb0]\n"
- "str q16, [x23, #0xc0]\n"
- "str q16, [x23, #0xd0]\n"
- "str q16, [x23, #0xe0]\n"
- "str q16, [x23, #0xf0]\n"
- "str q16, [x23, #0x100]\n"
- "str q16, [x23, #0x110]\n"
- "str q16, [x23, #0x120]\n"
- "str q16, [x23, #0x130]\n"
- "str q16, [x23, #0x140]\n"
- "str q16, [x23, #0x150]\n"
- "str q16, [x23, #0x160]\n"
- "str q16, [x23, #0x170]\n"
"blt 5f\n"
"4:" // Main row loop: width 16 loop: loop
"ldr q31, [x25], #0x10\n"
- "ldr q30, [x22], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
"sub x24, x24, #0x10\n"
- "ldr q29, [x21], #0x10\n"
- "ldr q28, [x20], #0x10\n"
"cmp x24, #0x10\n"
+ "ldr q29, [x22], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
"ldr q27, [x25], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "ldr q25, [x21], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
"ldr q24, [x20], #0x10\n"
"ldr q23, [x25], #0x10\n"
- "ldr q22, [x22], #0x10\n"
- "ldr q21, [x21], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
"ldr q20, [x20], #0x10\n"
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
- "ldr q17, [x21], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
"ldr q16, [x20], #0x10\n"
- "str q31, [x23, #0x0]\n"
- "str q27, [x23, #0x10]\n"
- "str q23, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q30, [x23, #0x60]\n"
- "str q26, [x23, #0x70]\n"
- "str q22, [x23, #0x80]\n"
- "str q18, [x23, #0x90]\n"
- "str q29, [x23, #0xc0]\n"
- "str q25, [x23, #0xd0]\n"
- "str q21, [x23, #0xe0]\n"
- "str q17, [x23, #0xf0]\n"
- "str q28, [x23, #0x120]\n"
- "str q24, [x23, #0x130]\n"
- "str q20, [x23, #0x140]\n"
- "str q16, [x23, #0x150]\n"
- "add x23, x23, #0x40\n"
+ "str q31, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q23, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q30, [x21, #0x60]\n"
+ "str q26, [x21, #0x70]\n"
+ "str q22, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q29, [x21, #0xc0]\n"
+ "str q25, [x21, #0xd0]\n"
+ "str q21, [x21, #0xe0]\n"
+ "str q17, [x21, #0xf0]\n"
+ "str q28, [x21, #0x120]\n"
+ "str q24, [x21, #0x130]\n"
+ "str q20, [x21, #0x140]\n"
+ "str q16, [x21, #0x150]\n"
+ "add x21, x21, #0x40\n"
"bge 4b\n"
"5:" // Main row loop: width 16 loop: skip
"cmp x24, #0x4\n"
"blt 7f\n"
"6:" // Main row loop: width 4 loop: loop
"ldr q19, [x25], #0x10\n"
- "ldr q18, [x22], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
"sub x24, x24, #0x4\n"
- "ldr q17, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
"cmp x24, #0x4\n"
- "str q19, [x23, #0x0]\n"
- "str q18, [x23, #0x60]\n"
- "str q17, [x23, #0xc0]\n"
- "str q16, [x23, #0x120]\n"
- "add x23, x23, #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x60]\n"
+ "str q17, [x21, #0xc0]\n"
+ "str q16, [x21, #0x120]\n"
+ "add x21, x21, #0x10\n"
"bge 6b\n"
"7:" // Main row loop: width 4 loop: skip
"cmp x24, #0x1\n"
"blt 9f\n"
"8:" // Main row loop: width 1 loop: loop
"ldr s19, [x25], #0x4\n"
- "ldr s18, [x22], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
"sub x24, x24, #0x1\n"
- "ldr s17, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
"cmp x24, #0x1\n"
- "str s19, [x23, #0x0]\n"
- "str s18, [x23, #0x60]\n"
- "str s17, [x23, #0xc0]\n"
- "str s16, [x23, #0x120]\n"
- "add x23, x23, #0x4\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "str s19, [x21, #0x0]\n"
+ "str s18, [x21, #0x60]\n"
+ "str s17, [x21, #0xc0]\n"
+ "str s16, [x21, #0x120]\n"
+ "add x21, x21, #0x4\n"
"bge 8b\n"
"9:" // Main row loop: width 1 loop: skip
- "10:" // Main row loop: odd col skip
"cmp %x[height], #0x4\n"
"add %x[out], %x[out], #0x180\n"
"bge 1b\n"
- "cbz %x[height], 22f\n"
- "11:" // Main loop skip
- "12:" // Tail row loop: Head
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x20, #0x18\n"
"add %x[in], x25, %x[in_stride]\n"
- "blt 14f\n"
- "13:" // Tail row loop: Column loop
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
"ldr q21, [x25], #0x10\n"
- "sub x20, x20, #0x18\n"
"ldr q20, [x25], #0x10\n"
- "ldr q19, [x25], #0x10\n"
+ "sub x20, x20, #0x18\n"
"cmp x20, #0x18\n"
+ "ldr q19, [x25], #0x10\n"
"ldr q18, [x25], #0x10\n"
"ldr q17, [x25], #0x10\n"
"ldr q16, [x25], #0x10\n"
- "str q21, [x23, #0x0]\n"
- "str q20, [x23, #0x10]\n"
- "str q19, [x23, #0x20]\n"
- "str q18, [x23, #0x30]\n"
- "str q17, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "add x23, x23, %x[out_stride]\n"
- "bge 13b\n"
- "14:" // Tail row loop: Column loop skip
- "cbz x20, 21f\n"
+ "str q21, [x21, #0x0]\n"
+ "str q20, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
"cmp x20, #0x10\n"
- "movi v16.4s, #0x0\n"
- "str q16, [x23, #0x0]\n"
- "str q16, [x23, #0x10]\n"
- "str q16, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "str q16, [x23, #0x40]\n"
- "str q16, [x23, #0x50]\n"
- "blt 16f\n"
- "15:" // Tail row loop: width 16 loop: loop
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
"ldr q19, [x25], #0x10\n"
- "sub x20, x20, #0x10\n"
"ldr q18, [x25], #0x10\n"
- "ldr q17, [x25], #0x10\n"
+ "sub x20, x20, #0x10\n"
"cmp x20, #0x10\n"
+ "ldr q17, [x25], #0x10\n"
"ldr q16, [x25], #0x10\n"
- "str q19, [x23, #0x0]\n"
- "str q18, [x23, #0x10]\n"
- "str q17, [x23, #0x20]\n"
- "str q16, [x23, #0x30]\n"
- "add x23, x23, #0x40\n"
- "bge 15b\n"
- "16:" // Tail row loop: width 16 loop: skip
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
"cmp x20, #0x4\n"
- "blt 18f\n"
- "17:" // Tail row loop: width 4 loop: loop
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
"ldr q16, [x25], #0x10\n"
"sub x20, x20, #0x4\n"
"cmp x20, #0x4\n"
- "str q16, [x23, #0x0]\n"
- "add x23, x23, #0x10\n"
- "bge 17b\n"
- "18:" // Tail row loop: width 4 loop: skip
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
"cmp x20, #0x1\n"
- "blt 20f\n"
- "19:" // Tail row loop: width 1 loop: loop
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
"ldr s16, [x25], #0x4\n"
"sub x20, x20, #0x1\n"
"cmp x20, #0x1\n"
- "str s16, [x23, #0x0]\n"
- "add x23, x23, #0x4\n"
- "bge 19b\n"
- "20:" // Tail row loop: width 1 loop: skip
- "21:" // Tail row loop: odd col skip
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
"cmp %x[height], #0x1\n"
"add %x[out], %x[out], #0x60\n"
- "bge 12b\n"
- "22:" // Done
+ "bge 11b\n"
+ "20:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
index 4c7b367ed9..768719b0de 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
@@ -28,7 +28,7 @@
namespace {
-void sme_transpose_interleave_16VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+void sme_transpose_interleave_16VL(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
{
size_t out_stride = 16 * height * sme::get_vector_length<uint8_t>();
@@ -36,82 +36,82 @@ void sme_transpose_interleave_16VL(uint16_t *out, const uint16_t *in, size_t wid
".inst 0xd503477f // SMSTART ZA\n"
"ptrue p7.b\n"
"1:" // Main row loop: Head
- "mov x24, %x[in]\n"
- "mov x23, %x[out]\n"
- "add %x[in], x24, %x[in_stride]\n"
+ "mov x23, %x[in]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x1\n"
- "mov x22, %x[width]\n"
+ "mov x21, %x[width]\n"
"2:" // Main row loop: Column loop
- "mov x21, x22\n"
- "mov x20, x23\n"
- "whilelt p0.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z31.h }, p0/Z, [x24]\n"
- "whilelt p0.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z30.h }, p1/Z, [x24, #1, MUL VL]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z29.h }, p0/Z, [x24, #2, MUL VL]\n"
- "whilelt p0.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z28.h }, p1/Z, [x24, #3, MUL VL]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z27.h }, p0/Z, [x24, #4, MUL VL]\n"
- "whilelt p0.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z26.h }, p1/Z, [x24, #5, MUL VL]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z25.h }, p0/Z, [x24, #6, MUL VL]\n"
- "whilelt p0.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z24.h }, p1/Z, [x24, #7, MUL VL]\n"
- "whilelt p6.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p5.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p4.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p3.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p2.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "addvl x24, x24, #16\n"
- "dech x22, ALL, MUL #16\n"
- "ld1h { z23.h }, p0/Z, [x24, #-8, MUL VL]\n"
- "whilelt p0.h, XZR, x21\n"
- "cmp x22, #0x0\n"
- "ld1h { z22.h }, p6/Z, [x24, #-7, MUL VL]\n"
- "add x23, x23, %x[out_stride]\n"
- "ld1h { z21.h }, p5/Z, [x24, #-6, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x24, #-5, MUL VL]\n"
- "ld1h { z19.h }, p3/Z, [x24, #-4, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x24, #-3, MUL VL]\n"
- "ld1h { z17.h }, p1/Z, [x24, #-2, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x24, #-1, MUL VL]\n"
- "st1h { z31.h }, p7, [x20]\n"
- "st1h { z30.h }, p7, [x20, #1, MUL VL]\n"
- "st1h { z29.h }, p7, [x20, #2, MUL VL]\n"
- "st1h { z28.h }, p7, [x20, #3, MUL VL]\n"
- "st1h { z27.h }, p7, [x20, #4, MUL VL]\n"
- "st1h { z26.h }, p7, [x20, #5, MUL VL]\n"
- "st1h { z25.h }, p7, [x20, #6, MUL VL]\n"
- "st1h { z24.h }, p7, [x20, #7, MUL VL]\n"
+ "mov x20, x21\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z31.s }, p0/Z, [x23]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z30.s }, p0/Z, [x23, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z29.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z28.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z27.s }, p0/Z, [x23, #4, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z26.s }, p0/Z, [x23, #5, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z25.s }, p0/Z, [x23, #6, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z24.s }, p0/Z, [x23, #7, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p6.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p5.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p4.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p3.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p2.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p1.s, XZR, x20\n"
+ "decw x20\n"
+ "addvl x23, x23, #16\n"
+ "ld1w { z23.s }, p0/Z, [x23, #-8, MUL VL]\n"
+ "whilelt p0.s, XZR, x20\n"
+ "mov x20, x22\n"
+ "ld1w { z22.s }, p6/Z, [x23, #-7, MUL VL]\n"
+ "decw x21, ALL, MUL #16\n"
+ "ld1w { z21.s }, p5/Z, [x23, #-6, MUL VL]\n"
+ "cmp x21, #0x0\n"
+ "ld1w { z20.s }, p4/Z, [x23, #-5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z19.s }, p3/Z, [x23, #-4, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #-3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x23, #-2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x23, #-1, MUL VL]\n"
+ "st1w { z31.s }, p7, [x20]\n"
+ "st1w { z30.s }, p7, [x20, #1, MUL VL]\n"
+ "st1w { z29.s }, p7, [x20, #2, MUL VL]\n"
+ "st1w { z28.s }, p7, [x20, #3, MUL VL]\n"
+ "st1w { z27.s }, p7, [x20, #4, MUL VL]\n"
+ "st1w { z26.s }, p7, [x20, #5, MUL VL]\n"
+ "st1w { z25.s }, p7, [x20, #6, MUL VL]\n"
+ "st1w { z24.s }, p7, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
- "st1h { z23.h }, p7, [x20, #-8, MUL VL]\n"
- "st1h { z22.h }, p7, [x20, #-7, MUL VL]\n"
- "st1h { z21.h }, p7, [x20, #-6, MUL VL]\n"
- "st1h { z20.h }, p7, [x20, #-5, MUL VL]\n"
- "st1h { z19.h }, p7, [x20, #-4, MUL VL]\n"
- "st1h { z18.h }, p7, [x20, #-3, MUL VL]\n"
- "st1h { z17.h }, p7, [x20, #-2, MUL VL]\n"
- "st1h { z16.h }, p7, [x20, #-1, MUL VL]\n"
+ "st1w { z23.s }, p7, [x20, #-8, MUL VL]\n"
+ "st1w { z22.s }, p7, [x20, #-7, MUL VL]\n"
+ "st1w { z21.s }, p7, [x20, #-6, MUL VL]\n"
+ "st1w { z20.s }, p7, [x20, #-5, MUL VL]\n"
+ "st1w { z19.s }, p7, [x20, #-4, MUL VL]\n"
+ "st1w { z18.s }, p7, [x20, #-3, MUL VL]\n"
+ "st1w { z17.s }, p7, [x20, #-2, MUL VL]\n"
+ "st1w { z16.s }, p7, [x20, #-1, MUL VL]\n"
"bgt 2b\n"
"3:" // Main row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -120,7 +120,7 @@ void sme_transpose_interleave_16VL(uint16_t *out, const uint16_t *in, size_t wid
".inst 0xd503467f // SMSTOP\n"
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
@@ -131,26 +131,13 @@ void Transform<16, 1, true, VLType::SME>(
float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
{
sme_transpose_interleave_16VL(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
- (xmax-x0) * sizeof(float) / 2,
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
stride * sizeof(float),
(kmax-k0)
);
}
-template<>
-void Transform<16, 1, true, VLType::SME>(
- __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
-{
- sme_transpose_interleave_16VL(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
- (xmax-x0) * sizeof(__fp16) / 2,
- stride * sizeof(__fp16),
- (kmax-k0)
- );
-}
-
#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
index dca0031b55..bb866b2983 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
@@ -42,92 +42,92 @@ void sme_transpose_interleave_16VL_1x4(uint8_t *out, const uint8_t *in, size_t w
".inst 0xd503477f // SMSTART ZA\n"
"ptrue p4.b\n"
"1:" // Main row loop: Head
- "mov x27, %x[in]\n"
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "add x26, x27, %x[in_stride]\n"
- "mov x25, %x[out]\n"
- "add x24, x26, %x[in_stride]\n"
- "mov x23, %x[width]\n"
- "add x22, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
"csel x24, x24, %x[pad_row], GE\n"
- "add %x[in], x22, %x[in_stride]\n"
- "csel x22, x22, %x[pad_row], GT\n"
"cmp %x[height], #0x1\n"
+ "mov x22, %x[out]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
"sub %x[height], %x[height], #0x4\n"
- "csel x26, x26, %x[pad_row], GT\n"
+ "mov x21, %x[width]\n"
"2:" // Main row loop: Column loop
- "mov x21, x23\n"
- "mov x20, x25\n"
- "whilelt p3.b, XZR, x21\n"
- "decb x21\n"
- "whilelt p2.b, XZR, x21\n"
- "decb x21\n"
- "ld1b { z21.b }, p3/Z, [x27]\n"
- "whilelt p1.b, XZR, x21\n"
- "decb x21\n"
- "ld1b { z24.b }, p2/Z, [x27, #1, MUL VL]\n"
- "whilelt p0.b, XZR, x21\n"
- "ld1b { z23.b }, p3/Z, [x26]\n"
- "decw x23, ALL, MUL #16\n"
- "ld1b { z20.b }, p2/Z, [x26, #1, MUL VL]\n"
- "cmp x23, #0x0\n"
- "add x25, x25, %x[out_stride]\n"
- "ld1b { z19.b }, p3/Z, [x24]\n"
- "ld1b { z17.b }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1b { z16.b }, p3/Z, [x22]\n"
- "ld1b { z18.b }, p2/Z, [x22, #1, MUL VL]\n"
- "zip1 z22.b, z21.b, z19.b\n"
- "zip2 z21.b, z21.b, z19.b\n"
- "ld1b { z28.b }, p1/Z, [x27, #2, MUL VL]\n"
- "zip1 z1.b, z24.b, z17.b\n"
- "zip2 z0.b, z24.b, z17.b\n"
- "ld1b { z27.b }, p0/Z, [x27, #3, MUL VL]\n"
- "zip1 z17.b, z23.b, z16.b\n"
- "zip2 z16.b, z23.b, z16.b\n"
- "addvl x27, x27, #4\n"
- "ld1b { z26.b }, p1/Z, [x26, #2, MUL VL]\n"
- "zip1 z31.b, z20.b, z18.b\n"
- "zip2 z30.b, z20.b, z18.b\n"
- "ld1b { z25.b }, p0/Z, [x26, #3, MUL VL]\n"
+ "mov x20, x21\n"
+ "whilelt p3.b, XZR, x20\n"
+ "ld1b { z20.b }, p3/Z, [x26]\n"
+ "decb x20\n"
+ "whilelt p2.b, XZR, x20\n"
+ "ld1b { z18.b }, p2/Z, [x26, #1, MUL VL]\n"
+ "decb x20\n"
+ "whilelt p1.b, XZR, x20\n"
+ "ld1b { z17.b }, p3/Z, [x25]\n"
+ "decb x20\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z19.b }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1b { z16.b }, p3/Z, [x24]\n"
+ "zip1 z25.b, z20.b, z16.b\n"
+ "zip2 z24.b, z20.b, z16.b\n"
+ "mov x20, x22\n"
+ "ld1b { z16.b }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip1 z22.b, z18.b, z16.b\n"
+ "zip2 z21.b, z18.b, z16.b\n"
+ "decw x21, ALL, MUL #16\n"
+ "ld1b { z16.b }, p3/Z, [x23]\n"
+ "zip1 z18.b, z17.b, z16.b\n"
+ "zip2 z17.b, z17.b, z16.b\n"
+ "cmp x21, #0x0\n"
+ "ld1b { z16.b }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z20.b, z19.b, z16.b\n"
+ "zip2 z16.b, z19.b, z16.b\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1b { z19.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "zip1 z23.b, z25.b, z18.b\n"
+ "zip2 z0.b, z25.b, z18.b\n"
+ "ld1b { z18.b }, p0/Z, [x26, #3, MUL VL]\n"
+ "zip1 z31.b, z24.b, z17.b\n"
+ "zip2 z30.b, z24.b, z17.b\n"
"addvl x26, x26, #4\n"
- "ld1b { z20.b }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1b { z19.b }, p0/Z, [x24, #3, MUL VL]\n"
- "zip1 z18.b, z22.b, z17.b\n"
- "zip2 z24.b, z22.b, z17.b\n"
+ "ld1b { z17.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "zip1 z29.b, z22.b, z20.b\n"
+ "zip2 z28.b, z22.b, z20.b\n"
+ "ld1b { z22.b }, p0/Z, [x25, #3, MUL VL]\n"
+ "zip1 z27.b, z21.b, z16.b\n"
+ "zip2 z26.b, z21.b, z16.b\n"
+ "addvl x25, x25, #4\n"
+ "ld1b { z16.b }, p1/Z, [x24, #2, MUL VL]\n"
+ "zip1 z21.b, z19.b, z16.b\n"
+ "zip2 z20.b, z19.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x24, #3, MUL VL]\n"
+ "zip1 z25.b, z18.b, z16.b\n"
+ "zip2 z24.b, z18.b, z16.b\n"
"addvl x24, x24, #4\n"
- "ld1b { z17.b }, p1/Z, [x22, #2, MUL VL]\n"
- "zip1 z23.b, z21.b, z16.b\n"
- "zip2 z22.b, z21.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x22, #3, MUL VL]\n"
- "zip1 z21.b, z28.b, z20.b\n"
- "zip2 z29.b, z28.b, z20.b\n"
- "addvl x22, x22, #4\n"
- "zip1 z28.b, z27.b, z19.b\n"
- "zip2 z27.b, z27.b, z19.b\n"
- "zip1 z20.b, z26.b, z17.b\n"
- "zip2 z19.b, z26.b, z17.b\n"
- "st1b { z18.b }, p4, [x20]\n"
- "zip1 z18.b, z25.b, z16.b\n"
- "zip2 z26.b, z25.b, z16.b\n"
- "st1b { z24.b }, p4, [x20, #1, MUL VL]\n"
- "zip1 z17.b, z1.b, z31.b\n"
- "zip2 z16.b, z1.b, z31.b\n"
- "st1b { z23.b }, p4, [x20, #2, MUL VL]\n"
- "zip1 z25.b, z0.b, z30.b\n"
- "zip2 z24.b, z0.b, z30.b\n"
- "st1b { z22.b }, p4, [x20, #3, MUL VL]\n"
- "zip1 z23.b, z21.b, z20.b\n"
- "zip2 z22.b, z21.b, z20.b\n"
- "zip1 z21.b, z29.b, z19.b\n"
- "zip2 z20.b, z29.b, z19.b\n"
- "st1b { z17.b }, p4, [x20, #4, MUL VL]\n"
- "zip1 z19.b, z28.b, z18.b\n"
- "zip2 z18.b, z28.b, z18.b\n"
- "st1b { z16.b }, p4, [x20, #5, MUL VL]\n"
- "zip1 z17.b, z27.b, z26.b\n"
- "zip2 z16.b, z27.b, z26.b\n"
- "st1b { z25.b }, p4, [x20, #6, MUL VL]\n"
- "st1b { z24.b }, p4, [x20, #7, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "zip1 z19.b, z17.b, z16.b\n"
+ "zip2 z18.b, z17.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, #3, MUL VL]\n"
+ "zip1 z17.b, z22.b, z16.b\n"
+ "zip2 z16.b, z22.b, z16.b\n"
+ "addvl x23, x23, #4\n"
+ "st1b { z23.b }, p4, [x20]\n"
+ "zip1 z23.b, z21.b, z19.b\n"
+ "zip2 z22.b, z21.b, z19.b\n"
+ "st1b { z0.b }, p4, [x20, #1, MUL VL]\n"
+ "zip1 z21.b, z20.b, z18.b\n"
+ "zip2 z20.b, z20.b, z18.b\n"
+ "st1b { z31.b }, p4, [x20, #2, MUL VL]\n"
+ "zip1 z19.b, z25.b, z17.b\n"
+ "zip2 z18.b, z25.b, z17.b\n"
+ "st1b { z30.b }, p4, [x20, #3, MUL VL]\n"
+ "zip1 z17.b, z24.b, z16.b\n"
+ "zip2 z16.b, z24.b, z16.b\n"
+ "st1b { z29.b }, p4, [x20, #4, MUL VL]\n"
+ "st1b { z28.b }, p4, [x20, #5, MUL VL]\n"
+ "st1b { z27.b }, p4, [x20, #6, MUL VL]\n"
+ "st1b { z26.b }, p4, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
"st1b { z23.b }, p4, [x20, #-8, MUL VL]\n"
"st1b { z22.b }, p4, [x20, #-7, MUL VL]\n"
@@ -145,7 +145,7 @@ void sme_transpose_interleave_16VL_1x4(uint8_t *out, const uint8_t *in, size_t w
".inst 0xd503467f // SMSTOP\n"
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
index 896288cdda..0e34bf143b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
@@ -43,133 +43,133 @@ void sme_transpose_interleave_16VL_2x2_fp32bf16(bfloat16 *out, const float *in,
"ptrue p7.b\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
- "cmp %x[height], #0x1\n"
"add x24, x25, %x[in_stride]\n"
- "mov x23, %x[out]\n"
+ "cmp %x[height], #0x1\n"
"add %x[in], x24, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
"csel x24, x24, %x[pad_row], GT\n"
"sub %x[height], %x[height], #0x2\n"
"mov x22, %x[width]\n"
"2:" // Main row loop: Column loop
"mov x21, x22\n"
- "mov x20, x23\n"
"whilelt p1.s, XZR, x21\n"
+ "ld1w { z16.s }, p1/Z, [x25]\n"
+ ".inst 0x658abe00 // bfcvt z0.h, p7/M, z16.s\n"
"decw x21\n"
"whilelt p0.s, XZR, x21\n"
+ "ld1w { z16.s }, p0/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x658abe1f // bfcvt z31.h, p7/M, z16.s\n"
"decw x21\n"
- "ld1w { z16.s }, p1/Z, [x25]\n"
"whilelt p6.s, XZR, x21\n"
+ "ld1w { z16.s }, p6/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x658abe1e // bfcvt z30.h, p7/M, z16.s\n"
"decw x21\n"
- "ld1w { z18.s }, p0/Z, [x25, #1, MUL VL]\n"
"whilelt p5.s, XZR, x21\n"
+ "ld1w { z16.s }, p5/Z, [x25, #3, MUL VL]\n"
+ ".inst 0x658abe1d // bfcvt z29.h, p7/M, z16.s\n"
"decw x21\n"
- "ld1w { z17.s }, p6/Z, [x25, #2, MUL VL]\n"
"whilelt p4.s, XZR, x21\n"
+ "ld1w { z16.s }, p4/Z, [x25, #4, MUL VL]\n"
+ ".inst 0x658abe1c // bfcvt z28.h, p7/M, z16.s\n"
"decw x21\n"
- "ld1w { z19.s }, p5/Z, [x25, #3, MUL VL]\n"
- ".inst 0x658abe03 // bfcvt z3.h, p7/M, z16.s\n"
"whilelt p3.s, XZR, x21\n"
+ "ld1w { z16.s }, p3/Z, [x25, #5, MUL VL]\n"
+ ".inst 0x658abe1b // bfcvt z27.h, p7/M, z16.s\n"
"decw x21\n"
- "ld1w { z16.s }, p4/Z, [x25, #4, MUL VL]\n"
- ".inst 0x658abe42 // bfcvt z2.h, p7/M, z18.s\n"
"whilelt p2.s, XZR, x21\n"
+ "ld1w { z16.s }, p2/Z, [x25, #6, MUL VL]\n"
+ ".inst 0x658abe1a // bfcvt z26.h, p7/M, z16.s\n"
"decw x21\n"
- "ld1w { z18.s }, p3/Z, [x25, #5, MUL VL]\n"
- ".inst 0x658abe21 // bfcvt z1.h, p7/M, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x25, #6, MUL VL]\n"
- ".inst 0x658abe60 // bfcvt z0.h, p7/M, z19.s\n"
- "decw x22, ALL, MUL #16\n"
- "add x23, x23, %x[out_stride]\n"
- ".inst 0x658abe1f // bfcvt z31.h, p7/M, z16.s\n"
- "ld1w { z19.s }, p1/Z, [x24]\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
"whilelt p1.s, XZR, x21\n"
+ ".inst 0x648abe00 // bfcvtnt z0.h, p7/M, z16.s\n"
"decw x21\n"
"ld1w { z16.s }, p1/Z, [x25, #7, MUL VL]\n"
"addvl x25, x25, #16\n"
- ".inst 0x658abe5e // bfcvt z30.h, p7/M, z18.s\n"
- ".inst 0x658abe3d // bfcvt z29.h, p7/M, z17.s\n"
- "ld1w { z18.s }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x658abe19 // bfcvt z25.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x24, #1, MUL VL]\n"
"whilelt p0.s, XZR, x21\n"
"decw x21\n"
- "ld1w { z17.s }, p0/Z, [x25, #-8, MUL VL]\n"
- ".inst 0x648abe63 // bfcvtnt z3.h, p7/M, z19.s\n"
- ".inst 0x658abe1c // bfcvt z28.h, p7/M, z16.s\n"
- "ld1w { z19.s }, p6/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x648abe1f // bfcvtnt z31.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #-8, MUL VL]\n"
+ ".inst 0x658abe18 // bfcvt z24.h, p7/M, z16.s\n"
+ "mov x20, x23\n"
+ "decw x22, ALL, MUL #16\n"
+ "ld1w { z16.s }, p6/Z, [x24, #2, MUL VL]\n"
"whilelt p6.s, XZR, x21\n"
"decw x21\n"
+ ".inst 0x648abe1e // bfcvtnt z30.h, p7/M, z16.s\n"
"ld1w { z16.s }, p6/Z, [x25, #-7, MUL VL]\n"
- ".inst 0x648abe42 // bfcvtnt z2.h, p7/M, z18.s\n"
- "ld1w { z18.s }, p5/Z, [x24, #3, MUL VL]\n"
+ ".inst 0x658abe17 // bfcvt z23.h, p7/M, z16.s\n"
+ "add x23, x23, %x[out_stride]\n"
+ "ld1w { z16.s }, p5/Z, [x24, #3, MUL VL]\n"
"whilelt p5.s, XZR, x21\n"
"decw x21\n"
- ".inst 0x658abe3b // bfcvt z27.h, p7/M, z17.s\n"
- "ld1w { z17.s }, p5/Z, [x25, #-6, MUL VL]\n"
- ".inst 0x648abe61 // bfcvtnt z1.h, p7/M, z19.s\n"
- "ld1w { z19.s }, p4/Z, [x24, #4, MUL VL]\n"
+ ".inst 0x648abe1d // bfcvtnt z29.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p5/Z, [x25, #-6, MUL VL]\n"
+ ".inst 0x658abe16 // bfcvt z22.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x24, #4, MUL VL]\n"
"whilelt p4.s, XZR, x21\n"
"decw x21\n"
- ".inst 0x658abe1a // bfcvt z26.h, p7/M, z16.s\n"
+ ".inst 0x648abe1c // bfcvtnt z28.h, p7/M, z16.s\n"
"ld1w { z16.s }, p4/Z, [x25, #-5, MUL VL]\n"
- ".inst 0x648abe40 // bfcvtnt z0.h, p7/M, z18.s\n"
- "ld1w { z18.s }, p3/Z, [x24, #5, MUL VL]\n"
+ ".inst 0x658abe15 // bfcvt z21.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x24, #5, MUL VL]\n"
"whilelt p3.s, XZR, x21\n"
"decw x21\n"
- ".inst 0x658abe39 // bfcvt z25.h, p7/M, z17.s\n"
- "ld1w { z17.s }, p3/Z, [x25, #-4, MUL VL]\n"
- ".inst 0x648abe7f // bfcvtnt z31.h, p7/M, z19.s\n"
- "ld1w { z19.s }, p2/Z, [x24, #6, MUL VL]\n"
+ ".inst 0x648abe1b // bfcvtnt z27.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x25, #-4, MUL VL]\n"
+ ".inst 0x658abe14 // bfcvt z20.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, #6, MUL VL]\n"
"whilelt p2.s, XZR, x21\n"
"decw x21\n"
- ".inst 0x658abe18 // bfcvt z24.h, p7/M, z16.s\n"
+ ".inst 0x648abe1a // bfcvtnt z26.h, p7/M, z16.s\n"
"ld1w { z16.s }, p2/Z, [x25, #-3, MUL VL]\n"
- ".inst 0x648abe5e // bfcvtnt z30.h, p7/M, z18.s\n"
- "ld1w { z18.s }, p1/Z, [x24, #7, MUL VL]\n"
+ ".inst 0x658abe13 // bfcvt z19.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x24, #7, MUL VL]\n"
"whilelt p1.s, XZR, x21\n"
"decw x21\n"
- ".inst 0x658abe37 // bfcvt z23.h, p7/M, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x25, #-2, MUL VL]\n"
+ ".inst 0x648abe19 // bfcvtnt z25.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #-2, MUL VL]\n"
"addvl x24, x24, #16\n"
- ".inst 0x648abe7d // bfcvtnt z29.h, p7/M, z19.s\n"
- ".inst 0x658abe16 // bfcvt z22.h, p7/M, z16.s\n"
- "ld1w { z19.s }, p0/Z, [x24, #-8, MUL VL]\n"
+ ".inst 0x658abe12 // bfcvt z18.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x24, #-8, MUL VL]\n"
"whilelt p0.s, XZR, x21\n"
"cmp x22, #0x0\n"
+ ".inst 0x648abe18 // bfcvtnt z24.h, p7/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x25, #-1, MUL VL]\n"
- ".inst 0x648abe5c // bfcvtnt z28.h, p7/M, z18.s\n"
- ".inst 0x658abe35 // bfcvt z21.h, p7/M, z17.s\n"
- "ld1w { z18.s }, p6/Z, [x24, #-7, MUL VL]\n"
- "ld1w { z17.s }, p5/Z, [x24, #-6, MUL VL]\n"
- ".inst 0x648abe7b // bfcvtnt z27.h, p7/M, z19.s\n"
- ".inst 0x658abe14 // bfcvt z20.h, p7/M, z16.s\n"
+ ".inst 0x658abe11 // bfcvt z17.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p6/Z, [x24, #-7, MUL VL]\n"
+ ".inst 0x648abe17 // bfcvtnt z23.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p5/Z, [x24, #-6, MUL VL]\n"
+ ".inst 0x648abe16 // bfcvtnt z22.h, p7/M, z16.s\n"
"ld1w { z16.s }, p4/Z, [x24, #-5, MUL VL]\n"
- "ld1w { z19.s }, p3/Z, [x24, #-4, MUL VL]\n"
- ".inst 0x648abe5a // bfcvtnt z26.h, p7/M, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x24, #-3, MUL VL]\n"
- ".inst 0x648abe39 // bfcvtnt z25.h, p7/M, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x24, #-2, MUL VL]\n"
- ".inst 0x648abe18 // bfcvtnt z24.h, p7/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x24, #-1, MUL VL]\n"
- "st1h { z3.h }, p7, [x20]\n"
- ".inst 0x648abe77 // bfcvtnt z23.h, p7/M, z19.s\n"
- "st1h { z2.h }, p7, [x20, #1, MUL VL]\n"
- ".inst 0x648abe56 // bfcvtnt z22.h, p7/M, z18.s\n"
- "st1h { z1.h }, p7, [x20, #2, MUL VL]\n"
- ".inst 0x648abe35 // bfcvtnt z21.h, p7/M, z17.s\n"
- "st1h { z0.h }, p7, [x20, #3, MUL VL]\n"
+ ".inst 0x648abe15 // bfcvtnt z21.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x24, #-4, MUL VL]\n"
".inst 0x648abe14 // bfcvtnt z20.h, p7/M, z16.s\n"
- "st1h { z31.h }, p7, [x20, #4, MUL VL]\n"
- "st1h { z30.h }, p7, [x20, #5, MUL VL]\n"
- "st1h { z29.h }, p7, [x20, #6, MUL VL]\n"
- "st1h { z28.h }, p7, [x20, #7, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x24, #-3, MUL VL]\n"
+ ".inst 0x648abe13 // bfcvtnt z19.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x24, #-2, MUL VL]\n"
+ ".inst 0x648abe12 // bfcvtnt z18.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x24, #-1, MUL VL]\n"
+ "st1h { z0.h }, p7, [x20]\n"
+ ".inst 0x648abe11 // bfcvtnt z17.h, p7/M, z16.s\n"
+ "st1h { z31.h }, p7, [x20, #1, MUL VL]\n"
+ "st1h { z30.h }, p7, [x20, #2, MUL VL]\n"
+ "st1h { z29.h }, p7, [x20, #3, MUL VL]\n"
+ "st1h { z28.h }, p7, [x20, #4, MUL VL]\n"
+ "st1h { z27.h }, p7, [x20, #5, MUL VL]\n"
+ "st1h { z26.h }, p7, [x20, #6, MUL VL]\n"
+ "st1h { z25.h }, p7, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
- "st1h { z27.h }, p7, [x20, #-8, MUL VL]\n"
- "st1h { z26.h }, p7, [x20, #-7, MUL VL]\n"
- "st1h { z25.h }, p7, [x20, #-6, MUL VL]\n"
- "st1h { z24.h }, p7, [x20, #-5, MUL VL]\n"
- "st1h { z23.h }, p7, [x20, #-4, MUL VL]\n"
- "st1h { z22.h }, p7, [x20, #-3, MUL VL]\n"
- "st1h { z21.h }, p7, [x20, #-2, MUL VL]\n"
- "st1h { z20.h }, p7, [x20, #-1, MUL VL]\n"
+ "st1h { z24.h }, p7, [x20, #-8, MUL VL]\n"
+ "st1h { z23.h }, p7, [x20, #-7, MUL VL]\n"
+ "st1h { z22.h }, p7, [x20, #-6, MUL VL]\n"
+ "st1h { z21.h }, p7, [x20, #-5, MUL VL]\n"
+ "st1h { z20.h }, p7, [x20, #-4, MUL VL]\n"
+ "st1h { z19.h }, p7, [x20, #-3, MUL VL]\n"
+ "st1h { z18.h }, p7, [x20, #-2, MUL VL]\n"
+ "st1h { z17.h }, p7, [x20, #-1, MUL VL]\n"
"bgt 2b\n"
"3:" // Main row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
index 1ece4005d6..36b364a57b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
@@ -39,34 +39,34 @@ void sme_transpose_interleave_1VL(uint16_t *out, const uint16_t *in, size_t widt
"blt 6f\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[width]\n"
- "add x24, x26, %x[in_stride]\n"
- "cnth x23, ALL, MUL #4\n"
- "add x21, x24, %x[in_stride]\n"
- "cmp x25, x23\n"
- "add x20, x21, %x[in_stride]\n"
- "mov x22, %x[out]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x21, ALL, MUL #4\n"
+ "add x20, x24, %x[in_stride]\n"
+ "cmp x23, x21\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
"ld1h { z31.h }, p1/Z, [x26]\n"
- "sub x25, x25, x23\n"
+ "sub x23, x23, x21\n"
+ "cmp x23, x21\n"
"ld1h { z30.h }, p1/Z, [x26, #1, MUL VL]\n"
- "cmp x25, x23\n"
"ld1h { z29.h }, p1/Z, [x26, #2, MUL VL]\n"
"ld1h { z28.h }, p1/Z, [x26, #3, MUL VL]\n"
"addvl x26, x26, #4\n"
- "ld1h { z27.h }, p1/Z, [x24]\n"
- "ld1h { z26.h }, p1/Z, [x24, #1, MUL VL]\n"
- "ld1h { z25.h }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1h { z24.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x25]\n"
+ "ld1h { z26.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z25.h }, p1/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z24.h }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "ld1h { z23.h }, p1/Z, [x24]\n"
+ "ld1h { z22.h }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z21.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z20.h }, p1/Z, [x24, #3, MUL VL]\n"
"addvl x24, x24, #4\n"
- "ld1h { z23.h }, p1/Z, [x21]\n"
- "ld1h { z22.h }, p1/Z, [x21, #1, MUL VL]\n"
- "ld1h { z21.h }, p1/Z, [x21, #2, MUL VL]\n"
- "ld1h { z20.h }, p1/Z, [x21, #3, MUL VL]\n"
- "addvl x21, x21, #4\n"
"ld1h { z19.h }, p1/Z, [x20]\n"
"ld1h { z18.h }, p1/Z, [x20, #1, MUL VL]\n"
"ld1h { z17.h }, p1/Z, [x20, #2, MUL VL]\n"
@@ -94,17 +94,17 @@ void sme_transpose_interleave_1VL(uint16_t *out, const uint16_t *in, size_t widt
"add x22, x22, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x25, 5f\n"
+ "cbz x23, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.h, XZR, x25\n"
- "dech x25\n"
- "cmp x25, #0x0\n"
+ "whilelt p0.h, XZR, x23\n"
+ "dech x23\n"
"ld1h { z19.h }, p0/Z, [x26]\n"
+ "cmp x23, #0x0\n"
"addvl x26, x26, #1\n"
- "ld1h { z18.h }, p0/Z, [x24]\n"
+ "ld1h { z18.h }, p0/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "ld1h { z17.h }, p0/Z, [x24]\n"
"addvl x24, x24, #1\n"
- "ld1h { z17.h }, p0/Z, [x21]\n"
- "addvl x21, x21, #1\n"
"ld1h { z16.h }, p0/Z, [x20]\n"
"addvl x20, x20, #1\n"
"st1h { z19.h }, p1, [x22]\n"
@@ -131,8 +131,8 @@ void sme_transpose_interleave_1VL(uint16_t *out, const uint16_t *in, size_t widt
"8:" // Tail row loop: Unroll column loop
"ld1h { z19.h }, p1/Z, [x26]\n"
"sub x21, x21, x20\n"
- "ld1h { z18.h }, p1/Z, [x26, #1, MUL VL]\n"
"cmp x21, x20\n"
+ "ld1h { z18.h }, p1/Z, [x26, #1, MUL VL]\n"
"ld1h { z17.h }, p1/Z, [x26, #2, MUL VL]\n"
"ld1h { z16.h }, p1/Z, [x26, #3, MUL VL]\n"
"st1h { z19.h }, p1, [x22]\n"
@@ -150,10 +150,10 @@ void sme_transpose_interleave_1VL(uint16_t *out, const uint16_t *in, size_t widt
"10:" // Tail row loop: Column loop
"whilelt p0.h, XZR, x21\n"
"dech x21\n"
- "cmp x21, #0x0\n"
"ld1h { z16.h }, p0/Z, [x26]\n"
- "addvl x26, x26, #1\n"
"st1h { z16.h }, p1, [x22]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #1\n"
"add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
index ff8cfc7efe..d67e353f18 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
@@ -43,65 +43,65 @@ void sme_transpose_interleave_1VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"ptrue p1.b\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "cmp %x[height], #0x3\n"
"add x25, x26, %x[in_stride]\n"
- "mov x24, %x[width]\n"
- "add x23, x25, %x[in_stride]\n"
- "cntb x22\n"
- "add x21, x23, %x[in_stride]\n"
- "csel x23, x23, %x[pad_row], GE\n"
- "add %x[in], x21, %x[in_stride]\n"
- "csel x21, x21, %x[pad_row], GT\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "mov x20, %x[out]\n"
+ "mov x22, %x[width]\n"
+ "cntb x21\n"
"csel x25, x25, %x[pad_row], GT\n"
- "cmp x24, x22\n"
+ "cmp x22, x21\n"
+ "mov x20, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1b { z20.b }, p1/Z, [x26]\n"
- "sub x24, x24, x22\n"
+ "ld1b { z17.b }, p1/Z, [x26]\n"
+ "sub x22, x22, x21\n"
+ "cmp x22, x21\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
"addvl x26, x26, #1\n"
- "ld1b { z19.b }, p1/Z, [x25]\n"
- "cmp x24, x22\n"
"addvl x25, x25, #1\n"
- "ld1b { z17.b }, p1/Z, [x23]\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z20.b, z17.b, z16.b\n"
+ "zip2 z19.b, z17.b, z16.b\n"
+ "addvl x24, x24, #1\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip1 z17.b, z18.b, z16.b\n"
+ "zip2 z18.b, z18.b, z16.b\n"
"addvl x23, x23, #1\n"
- "ld1b { z16.b }, p1/Z, [x21]\n"
- "addvl x21, x21, #1\n"
- "zip1 z18.b, z20.b, z17.b\n"
- "zip2 z20.b, z20.b, z17.b\n"
- "zip1 z17.b, z19.b, z16.b\n"
- "zip2 z16.b, z19.b, z16.b\n"
- "zip1 z19.b, z18.b, z17.b\n"
- "zip2 z18.b, z18.b, z17.b\n"
- "zip1 z17.b, z20.b, z16.b\n"
- "zip2 z16.b, z20.b, z16.b\n"
- "st1b { z19.b }, p1, [x20]\n"
+ "zip1 z16.b, z20.b, z17.b\n"
+ "st1b { z16.b }, p1, [x20]\n"
"add x20, x20, %x[out_stride]\n"
- "st1b { z18.b }, p1, [x20]\n"
+ "zip2 z16.b, z20.b, z17.b\n"
+ "st1b { z16.b }, p1, [x20]\n"
"add x20, x20, %x[out_stride]\n"
+ "zip1 z17.b, z19.b, z18.b\n"
+ "zip2 z16.b, z19.b, z18.b\n"
"st1b { z17.b }, p1, [x20]\n"
"add x20, x20, %x[out_stride]\n"
"st1b { z16.b }, p1, [x20]\n"
"add x20, x20, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x24, 5f\n"
+ "cbz x22, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.b, XZR, x24\n"
- "decw x24\n"
- "ld1b { z19.b }, p0/Z, [x26]\n"
- "cmp x24, #0x0\n"
- "incd x26, ALL, MUL #2\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1b { z17.b }, p0/Z, [x26]\n"
+ "decw x22\n"
"ld1b { z18.b }, p0/Z, [x25]\n"
+ "cmp x22, #0x0\n"
+ "incd x26, ALL, MUL #2\n"
+ "ld1b { z16.b }, p0/Z, [x24]\n"
+ "zip1 z17.b, z17.b, z16.b\n"
"incd x25, ALL, MUL #2\n"
- "ld1b { z17.b }, p0/Z, [x23]\n"
- "incd x23, ALL, MUL #2\n"
- "ld1b { z16.b }, p0/Z, [x21]\n"
- "incd x21, ALL, MUL #2\n"
- "zip1 z17.b, z19.b, z17.b\n"
+ "incd x24, ALL, MUL #2\n"
+ "ld1b { z16.b }, p0/Z, [x23]\n"
"zip1 z16.b, z18.b, z16.b\n"
+ "incd x23, ALL, MUL #2\n"
"zip1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p1, [x20]\n"
"add x20, x20, %x[out_stride]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
index 54c2af1a84..f8980d25f6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
@@ -45,69 +45,69 @@ void sme_transpose_interleave_1VL_2x2(uint16_t *out, const uint16_t *in, size_t
"blt 6f\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[width]\n"
- "add x24, x26, %x[in_stride]\n"
- "cnth x23, ALL, MUL #2\n"
- "add x21, x24, %x[in_stride]\n"
- "cmp x25, x23\n"
- "add x20, x21, %x[in_stride]\n"
- "mov x22, %x[out]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x21, ALL, MUL #2\n"
+ "add x20, x24, %x[in_stride]\n"
+ "cmp x23, x21\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1h { z18.h }, p1/Z, [x26]\n"
- "sub x25, x25, x23\n"
+ "ld1h { z17.h }, p1/Z, [x26]\n"
+ "sub x23, x23, x21\n"
+ "cmp x23, x21\n"
+ "ld1h { z16.h }, p1/Z, [x25]\n"
+ "zip1 z24.h, z17.h, z16.h\n"
+ "zip2 z23.h, z17.h, z16.h\n"
"ld1h { z17.h }, p1/Z, [x24]\n"
- "cmp x25, x23\n"
- "ld1h { z20.h }, p1/Z, [x21]\n"
"ld1h { z16.h }, p1/Z, [x20]\n"
- "ld1h { z23.h }, p1/Z, [x26, #1, MUL VL]\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z22.h, z18.h, z17.h\n"
+ "zip1 z22.h, z17.h, z16.h\n"
+ "zip2 z21.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x26, #1, MUL VL]\n"
"addvl x26, x26, #2\n"
+ "ld1h { z16.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z20.h, z17.h, z16.h\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z19.h, z17.h, z16.h\n"
"ld1h { z18.h }, p1/Z, [x24, #1, MUL VL]\n"
"addvl x24, x24, #2\n"
- "ld1h { z21.h }, p1/Z, [x21, #1, MUL VL]\n"
- "zip1 z17.h, z20.h, z16.h\n"
- "zip2 z20.h, z20.h, z16.h\n"
- "addvl x21, x21, #2\n"
"ld1h { z16.h }, p1/Z, [x20, #1, MUL VL]\n"
+ "st1h { z24.h }, p1, [x22]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
"addvl x20, x20, #2\n"
- "st1h { z19.h }, p1, [x22]\n"
- "zip1 z19.h, z23.h, z18.h\n"
- "zip2 z18.h, z23.h, z18.h\n"
- "st1h { z17.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
- "zip1 z17.h, z21.h, z16.h\n"
- "zip2 z16.h, z21.h, z16.h\n"
- "st1h { z22.h }, p1, [x22]\n"
- "st1h { z20.h }, p1, [x22, #1, MUL VL]\n"
+ "zip2 z16.h, z18.h, z16.h\n"
+ "st1h { z23.h }, p1, [x22]\n"
+ "st1h { z21.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
- "st1h { z19.h }, p1, [x22]\n"
+ "st1h { z20.h }, p1, [x22]\n"
"st1h { z17.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
- "st1h { z18.h }, p1, [x22]\n"
+ "st1h { z19.h }, p1, [x22]\n"
"st1h { z16.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x25, 5f\n"
+ "cbz x23, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.h, XZR, x25\n"
- "decw x25\n"
- "ld1h { z19.h }, p0/Z, [x26]\n"
- "cmp x25, #0x0\n"
+ "whilelt p0.h, XZR, x23\n"
+ "ld1h { z17.h }, p0/Z, [x26]\n"
+ "decw x23\n"
+ "ld1h { z16.h }, p0/Z, [x25]\n"
+ "cmp x23, #0x0\n"
"incd x26, ALL, MUL #4\n"
+ "zip1 z18.h, z17.h, z16.h\n"
"ld1h { z17.h }, p0/Z, [x24]\n"
+ "incd x25, ALL, MUL #4\n"
"incd x24, ALL, MUL #4\n"
- "ld1h { z18.h }, p0/Z, [x21]\n"
- "incd x21, ALL, MUL #4\n"
"ld1h { z16.h }, p0/Z, [x20]\n"
"incd x20, ALL, MUL #4\n"
- "zip1 z17.h, z19.h, z17.h\n"
- "zip1 z16.h, z18.h, z16.h\n"
- "st1h { z17.h }, p1, [x22]\n"
+ "zip1 z16.h, z17.h, z16.h\n"
+ "st1h { z18.h }, p1, [x22]\n"
"st1h { z16.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
"bgt 4b\n"
@@ -119,12 +119,12 @@ void sme_transpose_interleave_1VL_2x2(uint16_t *out, const uint16_t *in, size_t
"6:" // Main loop skip
"7:" // Tail row loop: Head
"mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "add x24, x26, %x[in_stride]\n"
"mov x21, %x[width]\n"
"cnth x20, ALL, MUL #2\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
"cmp x21, x20\n"
"mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x2\n"
@@ -132,20 +132,20 @@ void sme_transpose_interleave_1VL_2x2(uint16_t *out, const uint16_t *in, size_t
"8:" // Tail row loop: Unroll column loop
"ld1h { z18.h }, p1/Z, [x26]\n"
"sub x21, x21, x20\n"
- "ld1h { z17.h }, p1/Z, [x24]\n"
"cmp x21, x20\n"
- "ld1h { z20.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x25]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "zip2 z19.h, z18.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x26, #1, MUL VL]\n"
"addvl x26, x26, #2\n"
- "ld1h { z16.h }, p1/Z, [x24, #1, MUL VL]\n"
- "addvl x24, x24, #2\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z18.h, z18.h, z17.h\n"
- "zip1 z17.h, z20.h, z16.h\n"
- "zip2 z16.h, z20.h, z16.h\n"
- "st1h { z19.h }, p1, [x22]\n"
+ "ld1h { z16.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "st1h { z17.h }, p1, [x22]\n"
"add x22, x22, %x[out_stride]\n"
- "st1h { z18.h }, p1, [x22]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "st1h { z19.h }, p1, [x22]\n"
"add x22, x22, %x[out_stride]\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z16.h, z18.h, z16.h\n"
"st1h { z17.h }, p1, [x22]\n"
"add x22, x22, %x[out_stride]\n"
"st1h { z16.h }, p1, [x22]\n"
@@ -155,13 +155,13 @@ void sme_transpose_interleave_1VL_2x2(uint16_t *out, const uint16_t *in, size_t
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"whilelt p0.h, XZR, x21\n"
- "decw x21\n"
"ld1h { z17.h }, p0/Z, [x26]\n"
+ "decw x21\n"
+ "ld1h { z16.h }, p0/Z, [x25]\n"
"cmp x21, #0x0\n"
"incd x26, ALL, MUL #4\n"
- "ld1h { z16.h }, p0/Z, [x24]\n"
- "incd x24, ALL, MUL #4\n"
"zip1 z16.h, z17.h, z16.h\n"
+ "incd x25, ALL, MUL #4\n"
"st1h { z16.h }, p1, [x22]\n"
"add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
@@ -205,5 +205,4 @@ void Transform<1, 2, true, VLType::SME>(
);
}
-
#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
index 2fafefbbc5..c740a9c64b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
@@ -45,88 +45,88 @@ void sme_transpose_interleave_1VL_2x2_fp32bf16(bfloat16 *out, const float *in, s
"blt 6f\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[width]\n"
- "add x24, x26, %x[in_stride]\n"
- "cnth x23, ALL, MUL #2\n"
- "add x21, x24, %x[in_stride]\n"
- "cmp x25, x23\n"
- "add x20, x21, %x[in_stride]\n"
- "mov x22, %x[out]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x21, ALL, MUL #2\n"
+ "add x20, x24, %x[in_stride]\n"
+ "cmp x23, x21\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1w { z19.s }, p1/Z, [x26]\n"
- "sub x25, x25, x23\n"
- "ld1w { z18.s }, p1/Z, [x21]\n"
- "cmp x25, x23\n"
- "ld1w { z17.s }, p1/Z, [x26, #1, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
- ".inst 0x658aa67b // bfcvt z27.h, p1/M, z19.s\n"
- "ld1w { z19.s }, p1/Z, [x26, #2, MUL VL]\n"
- ".inst 0x658aa65a // bfcvt z26.h, p1/M, z18.s\n"
- "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
- ".inst 0x658aa639 // bfcvt z25.h, p1/M, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x26]\n"
".inst 0x658aa618 // bfcvt z24.h, p1/M, z16.s\n"
- "addvl x26, x26, #4\n"
- "ld1w { z16.s }, p1/Z, [x21, #3, MUL VL]\n"
- ".inst 0x658aa677 // bfcvt z23.h, p1/M, z19.s\n"
- "addvl x21, x21, #4\n"
- "ld1w { z19.s }, p1/Z, [x24]\n"
- ".inst 0x658aa656 // bfcvt z22.h, p1/M, z18.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- ".inst 0x658aa635 // bfcvt z21.h, p1/M, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "sub x23, x23, x21\n"
+ "cmp x23, x21\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ ".inst 0x658aa617 // bfcvt z23.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aa616 // bfcvt z22.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x658aa615 // bfcvt z21.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x26, #2, MUL VL]\n"
".inst 0x658aa614 // bfcvt z20.h, p1/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
- ".inst 0x648aa67b // bfcvtnt z27.h, p1/M, z19.s\n"
- "ld1w { z19.s }, p1/Z, [x24, #2, MUL VL]\n"
- ".inst 0x648aa65a // bfcvtnt z26.h, p1/M, z18.s\n"
- "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
- ".inst 0x648aa639 // bfcvtnt z25.h, p1/M, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x24, #3, MUL VL]\n"
- ".inst 0x648aa618 // bfcvtnt z24.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x658aa613 // bfcvt z19.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x658aa612 // bfcvt z18.h, p1/M, z16.s\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z16.s }, p1/Z, [x24, #3, MUL VL]\n"
+ ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n"
"addvl x24, x24, #4\n"
+ "ld1w { z16.s }, p1/Z, [x25]\n"
+ ".inst 0x648aa618 // bfcvtnt z24.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa617 // bfcvtnt z23.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x648aa616 // bfcvtnt z22.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x648aa615 // bfcvtnt z21.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aa614 // bfcvtnt z20.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x648aa613 // bfcvtnt z19.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x648aa612 // bfcvtnt z18.h, p1/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x20, #3, MUL VL]\n"
- "st1h { z27.h }, p1, [x22]\n"
- ".inst 0x648aa677 // bfcvtnt z23.h, p1/M, z19.s\n"
+ "st1h { z24.h }, p1, [x22]\n"
"addvl x20, x20, #4\n"
- "st1h { z26.h }, p1, [x22, #1, MUL VL]\n"
+ ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
+ "st1h { z23.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
- ".inst 0x648aa656 // bfcvtnt z22.h, p1/M, z18.s\n"
- "st1h { z25.h }, p1, [x22]\n"
- ".inst 0x648aa635 // bfcvtnt z21.h, p1/M, z17.s\n"
- "st1h { z24.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x22]\n"
+ "st1h { z21.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
- ".inst 0x648aa614 // bfcvtnt z20.h, p1/M, z16.s\n"
- "st1h { z23.h }, p1, [x22]\n"
- "st1h { z22.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z20.h }, p1, [x22]\n"
+ "st1h { z19.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
- "st1h { z21.h }, p1, [x22]\n"
- "st1h { z20.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x22]\n"
+ "st1h { z17.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x25, 5f\n"
+ "cbz x23, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.s, XZR, x25\n"
- "decw x25\n"
- "ld1w { z17.s }, p0/Z, [x26]\n"
- "cmp x25, #0x0\n"
+ "whilelt p0.s, XZR, x23\n"
+ "ld1w { z16.s }, p0/Z, [x26]\n"
+ ".inst 0x658aa612 // bfcvt z18.h, p1/M, z16.s\n"
+ "decw x23\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n"
+ "cmp x23, #0x0\n"
"addvl x26, x26, #1\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "addvl x21, x21, #1\n"
- "ld1w { z19.s }, p0/Z, [x24]\n"
+ "ld1w { z16.s }, p0/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
"addvl x24, x24, #1\n"
- ".inst 0x658aa632 // bfcvt z18.h, p1/M, z17.s\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa612 // bfcvtnt z18.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
"addvl x20, x20, #1\n"
- ".inst 0x658aa610 // bfcvt z16.h, p1/M, z16.s\n"
- ".inst 0x648aa672 // bfcvtnt z18.h, p1/M, z19.s\n"
- ".inst 0x648aa630 // bfcvtnt z16.h, p1/M, z17.s\n"
+ ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
"st1h { z18.h }, p1, [x22]\n"
- "st1h { z16.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
@@ -137,43 +137,43 @@ void sme_transpose_interleave_1VL_2x2_fp32bf16(bfloat16 *out, const float *in, s
"6:" // Main loop skip
"7:" // Tail row loop: Head
"mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "add x24, x26, %x[in_stride]\n"
"mov x21, %x[width]\n"
"cnth x20, ALL, MUL #2\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
"cmp x21, x20\n"
"mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x2\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1w { z19.s }, p1/Z, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x26]\n"
+ ".inst 0x658aa614 // bfcvt z20.h, p1/M, z16.s\n"
"sub x21, x21, x20\n"
- "ld1w { z18.s }, p1/Z, [x26, #1, MUL VL]\n"
"cmp x21, x20\n"
- "ld1w { z17.s }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aa613 // bfcvt z19.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x26, #2, MUL VL]\n"
+ ".inst 0x658aa612 // bfcvt z18.h, p1/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x26, #3, MUL VL]\n"
- ".inst 0x658aa677 // bfcvt z23.h, p1/M, z19.s\n"
- "addvl x26, x26, #4\n"
- "ld1w { z22.s }, p1/Z, [x24]\n"
- ".inst 0x658aa655 // bfcvt z21.h, p1/M, z18.s\n"
- "ld1w { z20.s }, p1/Z, [x24, #1, MUL VL]\n"
- ".inst 0x658aa633 // bfcvt z19.h, p1/M, z17.s\n"
- "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x24, #3, MUL VL]\n"
- ".inst 0x648aa6d7 // bfcvtnt z23.h, p1/M, z22.s\n"
- "addvl x24, x24, #4\n"
- ".inst 0x648aa695 // bfcvtnt z21.h, p1/M, z20.s\n"
- ".inst 0x648aa653 // bfcvtnt z19.h, p1/M, z18.s\n"
- ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
- "st1h { z23.h }, p1, [x22]\n"
- "add x22, x22, %x[out_stride]\n"
- "st1h { z21.h }, p1, [x22]\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z16.s }, p1/Z, [x25]\n"
+ ".inst 0x648aa614 // bfcvtnt z20.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x648aa613 // bfcvtnt z19.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aa612 // bfcvtnt z18.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "st1h { z20.h }, p1, [x22]\n"
"add x22, x22, %x[out_stride]\n"
+ "addvl x25, x25, #4\n"
"st1h { z19.h }, p1, [x22]\n"
"add x22, x22, %x[out_stride]\n"
+ ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
+ "st1h { z18.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
"st1h { z17.h }, p1, [x22]\n"
"add x22, x22, %x[out_stride]\n"
"bge 8b\n"
@@ -181,15 +181,15 @@ void sme_transpose_interleave_1VL_2x2_fp32bf16(bfloat16 *out, const float *in, s
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"whilelt p0.s, XZR, x21\n"
- "decw x21\n"
"ld1w { z16.s }, p0/Z, [x26]\n"
+ ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n"
+ "decw x21\n"
+ "ld1w { z16.s }, p0/Z, [x25]\n"
"cmp x21, #0x0\n"
"addvl x26, x26, #1\n"
- "ld1w { z17.s }, p0/Z, [x24]\n"
- "addvl x24, x24, #1\n"
- ".inst 0x658aa610 // bfcvt z16.h, p1/M, z16.s\n"
- ".inst 0x648aa630 // bfcvtnt z16.h, p1/M, z17.s\n"
- "st1h { z16.h }, p1, [x22]\n"
+ ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
+ "addvl x25, x25, #1\n"
+ "st1h { z17.h }, p1, [x22]\n"
"add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
index f981624a1d..f7d29a9f01 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
@@ -39,34 +39,34 @@ void sme_transpose_interleave_2VL(uint16_t *out, const uint16_t *in, size_t widt
"blt 6f\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[width]\n"
- "add x24, x26, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
"cnth x20, ALL, MUL #4\n"
- "add x23, x24, %x[in_stride]\n"
- "cmp x25, x20\n"
- "add x21, x23, %x[in_stride]\n"
- "mov x22, %x[out]\n"
+ "add x21, x24, %x[in_stride]\n"
+ "cmp x23, x20\n"
"add %x[in], x21, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "sub x25, x25, x20\n"
+ "sub x23, x23, x20\n"
"ld1h { z31.h }, p2/Z, [x26]\n"
+ "cmp x23, x20\n"
"ld1h { z30.h }, p2/Z, [x26, #1, MUL VL]\n"
- "cmp x25, x20\n"
"ld1h { z29.h }, p2/Z, [x26, #2, MUL VL]\n"
"ld1h { z28.h }, p2/Z, [x26, #3, MUL VL]\n"
"addvl x26, x26, #4\n"
- "ld1h { z27.h }, p2/Z, [x24]\n"
- "ld1h { z26.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z25.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z24.h }, p2/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z27.h }, p2/Z, [x25]\n"
+ "ld1h { z26.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z25.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z24.h }, p2/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "ld1h { z23.h }, p2/Z, [x24]\n"
+ "ld1h { z22.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z21.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z20.h }, p2/Z, [x24, #3, MUL VL]\n"
"addvl x24, x24, #4\n"
- "ld1h { z23.h }, p2/Z, [x23]\n"
- "ld1h { z22.h }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1h { z21.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z20.h }, p2/Z, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
"ld1h { z19.h }, p2/Z, [x21]\n"
"ld1h { z18.h }, p2/Z, [x21, #1, MUL VL]\n"
"ld1h { z17.h }, p2/Z, [x21, #2, MUL VL]\n"
@@ -92,29 +92,29 @@ void sme_transpose_interleave_2VL(uint16_t *out, const uint16_t *in, size_t widt
"add x22, x22, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x25, 5f\n"
+ "cbz x23, 5f\n"
"4:" // Main row loop: Column loop
- "mov x20, x25\n"
- "dech x25, ALL, MUL #2\n"
+ "mov x20, x23\n"
"whilelt p1.h, XZR, x20\n"
+ "ld1h { z23.h }, p1/Z, [x26]\n"
"dech x20\n"
+ "dech x23, ALL, MUL #2\n"
+ "ld1h { z22.h }, p1/Z, [x25]\n"
"whilelt p0.h, XZR, x20\n"
- "cmp x25, #0x0\n"
- "ld1h { z23.h }, p1/Z, [x26]\n"
- "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "cmp x23, #0x0\n"
+ "ld1h { z21.h }, p0/Z, [x26, #1, MUL VL]\n"
"addvl x26, x26, #2\n"
- "ld1h { z21.h }, p1/Z, [x24]\n"
- "ld1h { z20.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z20.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "ld1h { z19.h }, p1/Z, [x24]\n"
+ "ld1h { z18.h }, p0/Z, [x24, #1, MUL VL]\n"
"addvl x24, x24, #2\n"
- "ld1h { z19.h }, p1/Z, [x23]\n"
- "ld1h { z18.h }, p0/Z, [x23, #1, MUL VL]\n"
- "addvl x23, x23, #2\n"
"ld1h { z17.h }, p1/Z, [x21]\n"
"ld1h { z16.h }, p0/Z, [x21, #1, MUL VL]\n"
"addvl x21, x21, #2\n"
"st1h { z23.h }, p2, [x22]\n"
- "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z21.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
"st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
"st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
"st1h { z18.h }, p2, [x22, #5, MUL VL]\n"
@@ -140,8 +140,8 @@ void sme_transpose_interleave_2VL(uint16_t *out, const uint16_t *in, size_t widt
"8:" // Tail row loop: Unroll column loop
"sub x21, x21, x20\n"
"ld1h { z19.h }, p2/Z, [x26]\n"
- "ld1h { z18.h }, p2/Z, [x26, #1, MUL VL]\n"
"cmp x21, x20\n"
+ "ld1h { z18.h }, p2/Z, [x26, #1, MUL VL]\n"
"ld1h { z17.h }, p2/Z, [x26, #2, MUL VL]\n"
"ld1h { z16.h }, p2/Z, [x26, #3, MUL VL]\n"
"st1h { z19.h }, p2, [x22]\n"
@@ -156,15 +156,15 @@ void sme_transpose_interleave_2VL(uint16_t *out, const uint16_t *in, size_t widt
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"mov x20, x21\n"
- "dech x21, ALL, MUL #2\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x26]\n"
"dech x20\n"
+ "dech x21, ALL, MUL #2\n"
"whilelt p0.h, XZR, x20\n"
"cmp x21, #0x0\n"
- "ld1h { z17.h }, p1/Z, [x26]\n"
"ld1h { z16.h }, p0/Z, [x26, #1, MUL VL]\n"
- "addvl x26, x26, #2\n"
"st1h { z17.h }, p2, [x22]\n"
+ "addvl x26, x26, #2\n"
"st1h { z16.h }, p2, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
index a7a384c85f..f07d34f46c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
@@ -43,63 +43,63 @@ void sme_transpose_interleave_2VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"ptrue p1.b\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "cmp %x[height], #0x3\n"
"add x25, x26, %x[in_stride]\n"
- "mov x24, %x[width]\n"
- "add x23, x25, %x[in_stride]\n"
- "cntb x22\n"
- "add x21, x23, %x[in_stride]\n"
- "csel x23, x23, %x[pad_row], GE\n"
- "add %x[in], x21, %x[in_stride]\n"
- "csel x21, x21, %x[pad_row], GT\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "mov x20, %x[out]\n"
+ "mov x22, %x[width]\n"
+ "cntb x21\n"
"csel x25, x25, %x[pad_row], GT\n"
- "cmp x24, x22\n"
+ "cmp x22, x21\n"
+ "mov x20, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1b { z20.b }, p1/Z, [x26]\n"
- "sub x24, x24, x22\n"
+ "ld1b { z17.b }, p1/Z, [x26]\n"
+ "sub x22, x22, x21\n"
+ "cmp x22, x21\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
"addvl x26, x26, #1\n"
- "ld1b { z19.b }, p1/Z, [x25]\n"
- "cmp x24, x22\n"
"addvl x25, x25, #1\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z20.b, z17.b, z16.b\n"
+ "zip2 z19.b, z17.b, z16.b\n"
+ "addvl x24, x24, #1\n"
"ld1b { z17.b }, p1/Z, [x23]\n"
- "addvl x23, x23, #1\n"
- "ld1b { z16.b }, p1/Z, [x21]\n"
- "addvl x21, x21, #1\n"
- "zip1 z18.b, z20.b, z17.b\n"
- "zip2 z20.b, z20.b, z17.b\n"
- "zip1 z17.b, z19.b, z16.b\n"
- "zip2 z16.b, z19.b, z16.b\n"
- "zip1 z19.b, z18.b, z17.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
"zip2 z18.b, z18.b, z17.b\n"
+ "addvl x23, x23, #1\n"
"zip1 z17.b, z20.b, z16.b\n"
"zip2 z16.b, z20.b, z16.b\n"
- "st1b { z19.b }, p1, [x20]\n"
- "st1b { z18.b }, p1, [x20, #1, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20]\n"
+ "st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
"add x20, x20, %x[out_stride]\n"
+ "zip1 z17.b, z19.b, z18.b\n"
+ "zip2 z16.b, z19.b, z18.b\n"
"st1b { z17.b }, p1, [x20]\n"
"st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
"add x20, x20, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x24, 5f\n"
+ "cbz x22, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.b, XZR, x24\n"
- "decw x24, ALL, MUL #2\n"
+ "whilelt p0.b, XZR, x22\n"
"ld1b { z18.b }, p0/Z, [x26]\n"
- "cmp x24, #0x0\n"
+ "decw x22, ALL, MUL #2\n"
+ "ld1b { z17.b }, p0/Z, [x25]\n"
+ "cmp x22, #0x0\n"
"incd x26, ALL, MUL #4\n"
- "ld1b { z19.b }, p0/Z, [x25]\n"
+ "ld1b { z16.b }, p0/Z, [x24]\n"
+ "zip1 z18.b, z18.b, z16.b\n"
"incd x25, ALL, MUL #4\n"
- "ld1b { z17.b }, p0/Z, [x23]\n"
+ "incd x24, ALL, MUL #4\n"
+ "ld1b { z16.b }, p0/Z, [x23]\n"
+ "zip1 z16.b, z17.b, z16.b\n"
"incd x23, ALL, MUL #4\n"
- "ld1b { z16.b }, p0/Z, [x21]\n"
- "incd x21, ALL, MUL #4\n"
- "zip1 z18.b, z18.b, z17.b\n"
- "zip1 z16.b, z19.b, z16.b\n"
"zip1 z17.b, z18.b, z16.b\n"
"zip2 z16.b, z18.b, z16.b\n"
"st1b { z17.b }, p1, [x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
index 651ae5f061..35d74e727b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
@@ -45,69 +45,69 @@ void sme_transpose_interleave_2VL_2x2(uint16_t *out, const uint16_t *in, size_t
"blt 6f\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[width]\n"
- "add x24, x26, %x[in_stride]\n"
- "cnth x23, ALL, MUL #2\n"
- "add x21, x24, %x[in_stride]\n"
- "cmp x25, x23\n"
- "add x20, x21, %x[in_stride]\n"
- "mov x22, %x[out]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x21, ALL, MUL #2\n"
+ "add x20, x24, %x[in_stride]\n"
+ "cmp x23, x21\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1h { z18.h }, p1/Z, [x26]\n"
- "sub x25, x25, x23\n"
+ "ld1h { z17.h }, p1/Z, [x26]\n"
+ "sub x23, x23, x21\n"
+ "cmp x23, x21\n"
+ "ld1h { z16.h }, p1/Z, [x25]\n"
+ "zip1 z24.h, z17.h, z16.h\n"
+ "zip2 z23.h, z17.h, z16.h\n"
"ld1h { z17.h }, p1/Z, [x24]\n"
- "cmp x25, x23\n"
- "ld1h { z20.h }, p1/Z, [x21]\n"
"ld1h { z16.h }, p1/Z, [x20]\n"
- "ld1h { z23.h }, p1/Z, [x26, #1, MUL VL]\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z22.h, z18.h, z17.h\n"
+ "zip1 z22.h, z17.h, z16.h\n"
+ "zip2 z21.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x26, #1, MUL VL]\n"
"addvl x26, x26, #2\n"
+ "ld1h { z16.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "zip1 z20.h, z17.h, z16.h\n"
+ "zip2 z19.h, z17.h, z16.h\n"
"ld1h { z18.h }, p1/Z, [x24, #1, MUL VL]\n"
"addvl x24, x24, #2\n"
- "ld1h { z21.h }, p1/Z, [x21, #1, MUL VL]\n"
- "zip1 z17.h, z20.h, z16.h\n"
- "zip2 z20.h, z20.h, z16.h\n"
- "addvl x21, x21, #2\n"
"ld1h { z16.h }, p1/Z, [x20, #1, MUL VL]\n"
+ "st1h { z24.h }, p1, [x22]\n"
"addvl x20, x20, #2\n"
- "st1h { z19.h }, p1, [x22]\n"
- "zip1 z19.h, z23.h, z18.h\n"
- "zip2 z18.h, z23.h, z18.h\n"
- "st1h { z22.h }, p1, [x22, #1, MUL VL]\n"
- "st1h { z17.h }, p1, [x22, #2, MUL VL]\n"
- "zip1 z17.h, z21.h, z16.h\n"
- "zip2 z16.h, z21.h, z16.h\n"
- "st1h { z20.h }, p1, [x22, #3, MUL VL]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "st1h { z23.h }, p1, [x22, #1, MUL VL]\n"
+ "zip2 z16.h, z18.h, z16.h\n"
+ "st1h { z22.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z21.h }, p1, [x22, #3, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
- "st1h { z19.h }, p1, [x22]\n"
- "st1h { z18.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z20.h }, p1, [x22]\n"
+ "st1h { z19.h }, p1, [x22, #1, MUL VL]\n"
"st1h { z17.h }, p1, [x22, #2, MUL VL]\n"
"st1h { z16.h }, p1, [x22, #3, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x25, 5f\n"
+ "cbz x23, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.h, XZR, x25\n"
- "decw x25, ALL, MUL #2\n"
- "ld1h { z18.h }, p0/Z, [x26]\n"
- "cmp x25, #0x0\n"
+ "whilelt p0.h, XZR, x23\n"
+ "ld1h { z17.h }, p0/Z, [x26]\n"
+ "decw x23, ALL, MUL #2\n"
+ "ld1h { z16.h }, p0/Z, [x25]\n"
+ "cmp x23, #0x0\n"
"addvl x26, x26, #1\n"
- "ld1h { z17.h }, p0/Z, [x24]\n"
+ "zip1 z20.h, z17.h, z16.h\n"
+ "ld1h { z19.h }, p0/Z, [x24]\n"
+ "addvl x25, x25, #1\n"
"addvl x24, x24, #1\n"
- "ld1h { z20.h }, p0/Z, [x21]\n"
- "addvl x21, x21, #1\n"
+ "zip2 z18.h, z17.h, z16.h\n"
"ld1h { z16.h }, p0/Z, [x20]\n"
"addvl x20, x20, #1\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z18.h, z18.h, z17.h\n"
- "zip1 z17.h, z20.h, z16.h\n"
- "zip2 z16.h, z20.h, z16.h\n"
- "st1h { z19.h }, p1, [x22]\n"
+ "zip1 z17.h, z19.h, z16.h\n"
+ "zip2 z16.h, z19.h, z16.h\n"
+ "st1h { z20.h }, p1, [x22]\n"
"st1h { z18.h }, p1, [x22, #1, MUL VL]\n"
"st1h { z17.h }, p1, [x22, #2, MUL VL]\n"
"st1h { z16.h }, p1, [x22, #3, MUL VL]\n"
@@ -121,12 +121,12 @@ void sme_transpose_interleave_2VL_2x2(uint16_t *out, const uint16_t *in, size_t
"6:" // Main loop skip
"7:" // Tail row loop: Head
"mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "add x24, x26, %x[in_stride]\n"
"mov x21, %x[width]\n"
"cnth x20, ALL, MUL #2\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
"cmp x21, x20\n"
"mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x2\n"
@@ -134,19 +134,19 @@ void sme_transpose_interleave_2VL_2x2(uint16_t *out, const uint16_t *in, size_t
"8:" // Tail row loop: Unroll column loop
"ld1h { z18.h }, p1/Z, [x26]\n"
"sub x21, x21, x20\n"
- "ld1h { z17.h }, p1/Z, [x24]\n"
"cmp x21, x20\n"
- "ld1h { z20.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x25]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "zip2 z19.h, z18.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x26, #1, MUL VL]\n"
"addvl x26, x26, #2\n"
- "ld1h { z16.h }, p1/Z, [x24, #1, MUL VL]\n"
- "addvl x24, x24, #2\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z18.h, z18.h, z17.h\n"
- "zip1 z17.h, z20.h, z16.h\n"
- "zip2 z16.h, z20.h, z16.h\n"
- "st1h { z19.h }, p1, [x22]\n"
- "st1h { z18.h }, p1, [x22, #1, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "st1h { z17.h }, p1, [x22]\n"
+ "addvl x25, x25, #2\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "st1h { z19.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
+ "zip2 z16.h, z18.h, z16.h\n"
"st1h { z17.h }, p1, [x22]\n"
"st1h { z16.h }, p1, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
@@ -155,13 +155,13 @@ void sme_transpose_interleave_2VL_2x2(uint16_t *out, const uint16_t *in, size_t
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"whilelt p0.h, XZR, x21\n"
- "decw x21, ALL, MUL #2\n"
"ld1h { z18.h }, p0/Z, [x26]\n"
+ "decw x21, ALL, MUL #2\n"
+ "ld1h { z16.h }, p0/Z, [x25]\n"
"cmp x21, #0x0\n"
"addvl x26, x26, #1\n"
- "ld1h { z16.h }, p0/Z, [x24]\n"
- "addvl x24, x24, #1\n"
"zip1 z17.h, z18.h, z16.h\n"
+ "addvl x25, x25, #1\n"
"zip2 z16.h, z18.h, z16.h\n"
"st1h { z17.h }, p1, [x22]\n"
"st1h { z16.h }, p1, [x22, #1, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
index 382d4af314..284216a337 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
@@ -45,99 +45,99 @@ void sme_transpose_interleave_2VL_2x2_fp32bf16(bfloat16 *out, const float *in, s
"blt 6f\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[width]\n"
- "add x24, x26, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
"cnth x20, ALL, MUL #2\n"
- "add x23, x24, %x[in_stride]\n"
- "cmp x25, x20\n"
- "add x21, x23, %x[in_stride]\n"
- "mov x22, %x[out]\n"
+ "add x21, x24, %x[in_stride]\n"
+ "cmp x23, x20\n"
"add %x[in], x21, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1w { z19.s }, p2/Z, [x26]\n"
- "sub x25, x25, x20\n"
- "ld1w { z18.s }, p2/Z, [x26, #1, MUL VL]\n"
- "cmp x25, x20\n"
- "ld1w { z17.s }, p2/Z, [x23]\n"
- "ld1w { z16.s }, p2/Z, [x23, #1, MUL VL]\n"
- ".inst 0x658aaa7b // bfcvt z27.h, p2/M, z19.s\n"
- "ld1w { z19.s }, p2/Z, [x26, #2, MUL VL]\n"
- ".inst 0x658aaa5a // bfcvt z26.h, p2/M, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x26, #3, MUL VL]\n"
- ".inst 0x658aaa39 // bfcvt z25.h, p2/M, z17.s\n"
- "addvl x26, x26, #4\n"
- "ld1w { z17.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x26]\n"
".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x23, #3, MUL VL]\n"
- ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
- "addvl x23, x23, #4\n"
- "ld1w { z19.s }, p2/Z, [x24]\n"
- ".inst 0x658aaa56 // bfcvt z22.h, p2/M, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x24, #1, MUL VL]\n"
- ".inst 0x658aaa35 // bfcvt z21.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
+ "sub x23, x23, x20\n"
+ "cmp x23, x20\n"
+ "ld1w { z16.s }, p2/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, #2, MUL VL]\n"
".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, #1, MUL VL]\n"
- ".inst 0x648aaa7b // bfcvtnt z27.h, p2/M, z19.s\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
- ".inst 0x648aaa5a // bfcvtnt z26.h, p2/M, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x24, #3, MUL VL]\n"
- ".inst 0x648aaa39 // bfcvtnt z25.h, p2/M, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z16.s }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, #3, MUL VL]\n"
+ ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
"addvl x24, x24, #4\n"
- "ld1w { z17.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
"ld1w { z16.s }, p2/Z, [x21, #3, MUL VL]\n"
- "st1h { z27.h }, p2, [x22]\n"
+ "st1h { z24.h }, p2, [x22]\n"
"addvl x21, x21, #4\n"
- ".inst 0x648aaa77 // bfcvtnt z23.h, p2/M, z19.s\n"
- "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
- ".inst 0x648aaa56 // bfcvtnt z22.h, p2/M, z18.s\n"
- "st1h { z25.h }, p2, [x22, #2, MUL VL]\n"
- ".inst 0x648aaa35 // bfcvtnt z21.h, p2/M, z17.s\n"
- "st1h { z24.h }, p2, [x22, #3, MUL VL]\n"
+ ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
+ "st1h { z23.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
- ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
- "st1h { z23.h }, p2, [x22]\n"
- "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z21.h }, p2, [x22, #2, MUL VL]\n"
- "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "st1h { z19.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #3, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x25, 5f\n"
+ "cbz x23, 5f\n"
"4:" // Main row loop: Column loop
- "mov x20, x25\n"
- "decw x25, ALL, MUL #2\n"
+ "mov x20, x23\n"
"whilelt p1.s, XZR, x20\n"
+ "ld1w { z16.s }, p1/Z, [x26]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
"decw x20\n"
"whilelt p0.s, XZR, x20\n"
- "ld1w { z19.s }, p1/Z, [x26]\n"
- "cmp x25, #0x0\n"
- "ld1w { z18.s }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
+ "decw x23, ALL, MUL #2\n"
+ "cmp x23, #0x0\n"
+ "ld1w { z16.s }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
"addvl x26, x26, #2\n"
- "ld1w { z17.s }, p1/Z, [x23]\n"
- "ld1w { z16.s }, p0/Z, [x23, #1, MUL VL]\n"
- ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
- "addvl x23, x23, #2\n"
- ".inst 0x658aaa56 // bfcvt z22.h, p2/M, z18.s\n"
- "ld1w { z21.s }, p1/Z, [x24]\n"
- ".inst 0x658aaa34 // bfcvt z20.h, p2/M, z17.s\n"
- "ld1w { z19.s }, p0/Z, [x24, #1, MUL VL]\n"
"addvl x24, x24, #2\n"
- ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
- "ld1w { z17.s }, p1/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x25]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x21, #1, MUL VL]\n"
"addvl x21, x21, #2\n"
- ".inst 0x648aaab7 // bfcvtnt z23.h, p2/M, z21.s\n"
- ".inst 0x648aaa76 // bfcvtnt z22.h, p2/M, z19.s\n"
- ".inst 0x648aaa34 // bfcvtnt z20.h, p2/M, z17.s\n"
- ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
- "st1h { z23.h }, p2, [x22]\n"
- "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z20.h }, p2, [x22, #2, MUL VL]\n"
- "st1h { z18.h }, p2, [x22, #3, MUL VL]\n"
+ ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "st1h { z19.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #3, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
@@ -148,65 +148,65 @@ void sme_transpose_interleave_2VL_2x2_fp32bf16(bfloat16 *out, const float *in, s
"6:" // Main loop skip
"7:" // Tail row loop: Head
"mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "add x24, x26, %x[in_stride]\n"
"mov x21, %x[width]\n"
"cnth x20, ALL, MUL #2\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
"cmp x21, x20\n"
"mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x2\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1w { z19.s }, p2/Z, [x26]\n"
+ "ld1w { z16.s }, p2/Z, [x26]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
"sub x21, x21, x20\n"
- "ld1w { z18.s }, p2/Z, [x26, #1, MUL VL]\n"
"cmp x21, x20\n"
- "ld1w { z17.s }, p2/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, #2, MUL VL]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
"ld1w { z16.s }, p2/Z, [x26, #3, MUL VL]\n"
- ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
+ ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
"addvl x26, x26, #4\n"
- ".inst 0x658aaa56 // bfcvt z22.h, p2/M, z18.s\n"
- "ld1w { z21.s }, p2/Z, [x24]\n"
- "ld1w { z20.s }, p2/Z, [x24, #1, MUL VL]\n"
- ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
- ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x24, #3, MUL VL]\n"
- ".inst 0x648aaab7 // bfcvtnt z23.h, p2/M, z21.s\n"
- "addvl x24, x24, #4\n"
- ".inst 0x648aaa96 // bfcvtnt z22.h, p2/M, z20.s\n"
- ".inst 0x648aaa33 // bfcvtnt z19.h, p2/M, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #2, MUL VL]\n"
".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
- "st1h { z23.h }, p2, [x22]\n"
- "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x25, #3, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
+ "st1h { z19.h }, p2, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
- "st1h { z19.h }, p2, [x22]\n"
- "st1h { z18.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22]\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"mov x20, x21\n"
- "decw x21, ALL, MUL #2\n"
"whilelt p1.s, XZR, x20\n"
+ "ld1w { z16.s }, p1/Z, [x26]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
"decw x20\n"
"whilelt p0.s, XZR, x20\n"
- "ld1w { z17.s }, p1/Z, [x26]\n"
- "cmp x21, #0x0\n"
"ld1w { z16.s }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25]\n"
+ "decw x21, ALL, MUL #2\n"
+ "cmp x21, #0x0\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #1, MUL VL]\n"
"addvl x26, x26, #2\n"
- "ld1w { z19.s }, p1/Z, [x24]\n"
- ".inst 0x658aaa32 // bfcvt z18.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p0/Z, [x24, #1, MUL VL]\n"
- "addvl x24, x24, #2\n"
- ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
- ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
- ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
"st1h { z18.h }, p2, [x22]\n"
- "st1h { z16.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
"add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
index 8d3aa59d13..9677ea2016 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
@@ -38,35 +38,35 @@ void sme_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
"ptrue p4.b\n"
"blt 4f\n"
"1:" // Main row loop: Head
- "mov x27, %x[in]\n"
- "mov x26, %x[out]\n"
- "add x25, x27, %x[in_stride]\n"
- "sub %x[height], %x[height], #0x4\n"
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
"add x24, x25, %x[in_stride]\n"
- "mov x23, %x[width]\n"
- "add x22, x24, %x[in_stride]\n"
- "add %x[in], x22, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
"2:" // Main row loop: Column loop
- "mov x21, x23\n"
- "mov x20, x26\n"
- "whilelt p3.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p2.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z31.h }, p3/Z, [x27]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z30.h }, p2/Z, [x27, #1, MUL VL]\n"
- "whilelt p0.h, XZR, x21\n"
- "dech x23, ALL, MUL #4\n"
- "ld1h { z29.h }, p1/Z, [x27, #2, MUL VL]\n"
- "ld1h { z28.h }, p0/Z, [x27, #3, MUL VL]\n"
- "cmp x23, #0x0\n"
- "addvl x27, x27, #4\n"
+ "mov x20, x21\n"
+ "whilelt p3.h, XZR, x20\n"
+ "ld1h { z31.h }, p3/Z, [x26]\n"
+ "dech x20\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z30.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z29.h }, p1/Z, [x26, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z28.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "dech x21, ALL, MUL #4\n"
"ld1h { z27.h }, p3/Z, [x25]\n"
- "add x26, x26, %x[out_stride]\n"
"ld1h { z26.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #4\n"
"ld1h { z25.h }, p1/Z, [x25, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"ld1h { z24.h }, p0/Z, [x25, #3, MUL VL]\n"
"addvl x25, x25, #4\n"
"ld1h { z23.h }, p3/Z, [x24]\n"
@@ -74,12 +74,12 @@ void sme_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
"ld1h { z21.h }, p1/Z, [x24, #2, MUL VL]\n"
"ld1h { z20.h }, p0/Z, [x24, #3, MUL VL]\n"
"addvl x24, x24, #4\n"
- "ld1h { z19.h }, p3/Z, [x22]\n"
- "ld1h { z18.h }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1h { z17.h }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z19.h }, p3/Z, [x23]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
"st1h { z31.h }, p4, [x20]\n"
- "addvl x22, x22, #4\n"
+ "addvl x23, x23, #4\n"
"st1h { z30.h }, p4, [x20, #1, MUL VL]\n"
"st1h { z29.h }, p4, [x20, #2, MUL VL]\n"
"st1h { z28.h }, p4, [x20, #3, MUL VL]\n"
@@ -104,32 +104,32 @@ void sme_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
"cbz %x[height], 8f\n"
"4:" // Main loop skip
"5:" // Tail row loop: Head
- "mov x27, %x[in]\n"
- "mov x26, %x[out]\n"
- "add %x[in], x27, %x[in_stride]\n"
+ "mov x26, %x[in]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x1\n"
"mov x21, %x[width]\n"
"6:" // Tail row loop: Column loop
"mov x20, x21\n"
- "dech x21, ALL, MUL #4\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z19.h }, p0/Z, [x26]\n"
"dech x20\n"
"whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x26, #1, MUL VL]\n"
"dech x20\n"
- "ld1h { z19.h }, p1/Z, [x27]\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x26, #2, MUL VL]\n"
"dech x20\n"
- "ld1h { z18.h }, p0/Z, [x27, #1, MUL VL]\n"
+ "dech x21, ALL, MUL #4\n"
"whilelt p0.h, XZR, x20\n"
"cmp x21, #0x0\n"
- "ld1h { z17.h }, p1/Z, [x27, #2, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1h { z19.h }, p4, [x26]\n"
- "st1h { z18.h }, p4, [x26, #1, MUL VL]\n"
- "st1h { z17.h }, p4, [x26, #2, MUL VL]\n"
- "st1h { z16.h }, p4, [x26, #3, MUL VL]\n"
- "add x26, x26, %x[out_stride]\n"
+ "ld1h { z16.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22]\n"
+ "addvl x26, x26, #4\n"
+ "st1h { z18.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 6b\n"
"7:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -139,7 +139,7 @@ void sme_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
".inst 0xd503467f // SMSTOP\n"
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
@@ -147,19 +147,6 @@ void sme_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
template<>
void Transform<4, 1, true, VLType::SME>(
- double *out, const double *in, int stride, int x0, int xmax, int k0, int kmax)
-{
- sme_transpose_interleave_4VL(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
- (xmax-x0) * sizeof(double) / 2,
- stride * sizeof(double),
- (kmax-k0)
- );
-}
-
-template<>
-void Transform<4, 1, true, VLType::SME>(
float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
{
sme_transpose_interleave_4VL(
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
index c7d4882b42..94d1c0840a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
@@ -43,43 +43,43 @@ void sme_transpose_interleave_4VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"ptrue p1.b\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
- "cmp %x[height], #0x3\n"
"add x24, x25, %x[in_stride]\n"
- "mov x23, %x[out]\n"
- "add x22, x24, %x[in_stride]\n"
- "mov x21, %x[width]\n"
- "add x20, x22, %x[in_stride]\n"
- "csel x22, x22, %x[pad_row], GE\n"
- "add %x[in], x20, %x[in_stride]\n"
- "csel x20, x20, %x[pad_row], GT\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[out]\n"
"csel x24, x24, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x20, %x[width]\n"
"2:" // Main row loop: Column loop
- "whilelt p0.b, XZR, x21\n"
- "decw x21, ALL, MUL #4\n"
- "ld1b { z20.b }, p0/Z, [x25]\n"
- "cmp x21, #0x0\n"
- "addvl x25, x25, #1\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z17.b }, p0/Z, [x25]\n"
+ "decw x20, ALL, MUL #4\n"
"ld1b { z19.b }, p0/Z, [x24]\n"
+ "cmp x20, #0x0\n"
+ "addvl x25, x25, #1\n"
+ "ld1b { z16.b }, p0/Z, [x23]\n"
+ "zip1 z18.b, z17.b, z16.b\n"
+ "zip2 z20.b, z17.b, z16.b\n"
"addvl x24, x24, #1\n"
- "ld1b { z17.b }, p0/Z, [x22]\n"
- "addvl x22, x22, #1\n"
- "ld1b { z16.b }, p0/Z, [x20]\n"
- "addvl x20, x20, #1\n"
- "zip1 z18.b, z20.b, z17.b\n"
- "zip2 z20.b, z20.b, z17.b\n"
+ "ld1b { z16.b }, p0/Z, [x22]\n"
"zip1 z17.b, z19.b, z16.b\n"
- "zip2 z16.b, z19.b, z16.b\n"
- "zip1 z19.b, z18.b, z17.b\n"
+ "zip2 z19.b, z19.b, z16.b\n"
+ "addvl x23, x23, #1\n"
+ "addvl x22, x22, #1\n"
+ "zip1 z16.b, z18.b, z17.b\n"
"zip2 z18.b, z18.b, z17.b\n"
- "zip1 z17.b, z20.b, z16.b\n"
- "zip2 z16.b, z20.b, z16.b\n"
- "st1b { z19.b }, p1, [x23]\n"
- "st1b { z18.b }, p1, [x23, #1, MUL VL]\n"
- "st1b { z17.b }, p1, [x23, #2, MUL VL]\n"
- "st1b { z16.b }, p1, [x23, #3, MUL VL]\n"
- "add x23, x23, %x[out_stride]\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "zip1 z17.b, z20.b, z19.b\n"
+ "zip2 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p1, [x21, #1, MUL VL]\n"
+ "st1b { z17.b }, p1, [x21, #2, MUL VL]\n"
+ "st1b { z16.b }, p1, [x21, #3, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
"bgt 2b\n"
"3:" // Main row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
index f070d7d322..4327466387 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
@@ -45,49 +45,49 @@ void sme_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t
"blt 4f\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[out]\n"
- "add x24, x26, %x[in_stride]\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "mov x22, %x[width]\n"
- "add x21, x23, %x[in_stride]\n"
- "add %x[in], x21, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
"2:" // Main row loop: Column loop
- "mov x20, x22\n"
- "decw x22, ALL, MUL #4\n"
+ "mov x20, x21\n"
"whilelt p1.h, XZR, x20\n"
+ "ld1h { z19.h }, p1/Z, [x26]\n"
"dech x20\n"
"whilelt p0.h, XZR, x20\n"
- "ld1h { z20.h }, p1/Z, [x26]\n"
- "cmp x22, #0x0\n"
- "ld1h { z19.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z18.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x25]\n"
+ "decw x21, ALL, MUL #4\n"
+ "cmp x21, #0x0\n"
+ "zip1 z24.h, z19.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x25, #1, MUL VL]\n"
"addvl x26, x26, #2\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z23.h, z19.h, z17.h\n"
"ld1h { z17.h }, p1/Z, [x24]\n"
- "ld1h { z16.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "zip1 z22.h, z18.h, z16.h\n"
+ "zip2 z21.h, z18.h, z16.h\n"
+ "ld1h { z20.h }, p0/Z, [x24, #1, MUL VL]\n"
"addvl x24, x24, #2\n"
- "ld1h { z18.h }, p1/Z, [x23]\n"
- "ld1h { z24.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x23]\n"
+ "zip1 z19.h, z17.h, z16.h\n"
+ "zip2 z18.h, z17.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x23, #1, MUL VL]\n"
"addvl x23, x23, #2\n"
- "zip1 z23.h, z20.h, z17.h\n"
- "zip2 z22.h, z20.h, z17.h\n"
- "ld1h { z17.h }, p1/Z, [x21]\n"
- "zip1 z21.h, z19.h, z16.h\n"
- "zip2 z20.h, z19.h, z16.h\n"
- "ld1h { z16.h }, p0/Z, [x21, #1, MUL VL]\n"
- "addvl x21, x21, #2\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z18.h, z18.h, z17.h\n"
- "st1h { z23.h }, p2, [x25]\n"
- "zip1 z17.h, z24.h, z16.h\n"
- "zip2 z16.h, z24.h, z16.h\n"
- "st1h { z22.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z21.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z20.h }, p2, [x25, #3, MUL VL]\n"
- "st1h { z19.h }, p2, [x25, #4, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #5, MUL VL]\n"
- "st1h { z17.h }, p2, [x25, #6, MUL VL]\n"
- "st1h { z16.h }, p2, [x25, #7, MUL VL]\n"
- "add x25, x25, %x[out_stride]\n"
+ "zip1 z17.h, z20.h, z16.h\n"
+ "zip2 z16.h, z20.h, z16.h\n"
+ "st1h { z24.h }, p2, [x22]\n"
+ "st1h { z23.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 2b\n"
"3:" // Main row loop: Column loop skip
"cmp %x[height], #0x4\n"
@@ -97,35 +97,35 @@ void sme_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t
"4:" // Main loop skip
"5:" // Tail row loop: Head
"mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "add x24, x26, %x[in_stride]\n"
- "mov x25, %x[out]\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
"sub %x[height], %x[height], #0x2\n"
"mov x21, %x[width]\n"
"6:" // Tail row loop: Column loop
"mov x20, x21\n"
- "decw x21, ALL, MUL #4\n"
"whilelt p1.h, XZR, x20\n"
+ "ld1h { z18.h }, p1/Z, [x26]\n"
"dech x20\n"
"whilelt p0.h, XZR, x20\n"
- "ld1h { z18.h }, p1/Z, [x26]\n"
- "cmp x21, #0x0\n"
"ld1h { z20.h }, p0/Z, [x26, #1, MUL VL]\n"
- "addvl x26, x26, #2\n"
- "ld1h { z17.h }, p1/Z, [x24]\n"
- "ld1h { z16.h }, p0/Z, [x24, #1, MUL VL]\n"
- "addvl x24, x24, #2\n"
+ "ld1h { z17.h }, p1/Z, [x25]\n"
+ "decw x21, ALL, MUL #4\n"
+ "cmp x21, #0x0\n"
"zip1 z19.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "addvl x25, x25, #2\n"
"zip2 z18.h, z18.h, z17.h\n"
"zip1 z17.h, z20.h, z16.h\n"
"zip2 z16.h, z20.h, z16.h\n"
- "st1h { z19.h }, p2, [x25]\n"
- "st1h { z18.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z17.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z16.h }, p2, [x25, #3, MUL VL]\n"
- "add x25, x25, %x[out_stride]\n"
+ "st1h { z19.h }, p2, [x22]\n"
+ "st1h { z18.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 6b\n"
"7:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
index 44305f0513..1c97bed317 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
@@ -45,69 +45,69 @@ void sme_transpose_interleave_4VL_2x2_fp32bf16(bfloat16 *out, const float *in, s
"blt 4f\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[out]\n"
- "add x24, x26, %x[in_stride]\n"
- "sub %x[height], %x[height], #0x4\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "mov x22, %x[width]\n"
- "add x21, x23, %x[in_stride]\n"
- "add %x[in], x21, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
"2:" // Main row loop: Column loop
- "mov x20, x22\n"
- "decw x22, ALL, MUL #4\n"
+ "mov x20, x21\n"
"whilelt p3.s, XZR, x20\n"
+ "ld1w { z16.s }, p3/Z, [x26]\n"
+ ".inst 0x658ab218 // bfcvt z24.h, p4/M, z16.s\n"
"decw x20\n"
"whilelt p2.s, XZR, x20\n"
+ "ld1w { z16.s }, p2/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658ab217 // bfcvt z23.h, p4/M, z16.s\n"
"decw x20\n"
- "ld1w { z19.s }, p3/Z, [x26]\n"
"whilelt p1.s, XZR, x20\n"
+ "ld1w { z16.s }, p1/Z, [x26, #2, MUL VL]\n"
+ ".inst 0x658ab216 // bfcvt z22.h, p4/M, z16.s\n"
"decw x20\n"
- "ld1w { z18.s }, p2/Z, [x26, #1, MUL VL]\n"
"whilelt p0.s, XZR, x20\n"
- "ld1w { z17.s }, p1/Z, [x26, #2, MUL VL]\n"
- "cmp x22, #0x0\n"
"ld1w { z16.s }, p0/Z, [x26, #3, MUL VL]\n"
- ".inst 0x658ab27b // bfcvt z27.h, p4/M, z19.s\n"
- "addvl x26, x26, #4\n"
- "ld1w { z19.s }, p3/Z, [x23]\n"
- ".inst 0x658ab25a // bfcvt z26.h, p4/M, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x23, #1, MUL VL]\n"
- ".inst 0x658ab239 // bfcvt z25.h, p4/M, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0x658ab218 // bfcvt z24.h, p4/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x23, #3, MUL VL]\n"
- ".inst 0x658ab277 // bfcvt z23.h, p4/M, z19.s\n"
- "addvl x23, x23, #4\n"
- ".inst 0x658ab256 // bfcvt z22.h, p4/M, z18.s\n"
- "ld1w { z19.s }, p3/Z, [x24]\n"
- ".inst 0x658ab235 // bfcvt z21.h, p4/M, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x658ab215 // bfcvt z21.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x24]\n"
".inst 0x658ab214 // bfcvt z20.h, p4/M, z16.s\n"
- "ld1w { z17.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "decw x21, ALL, MUL #4\n"
+ "cmp x21, #0x0\n"
+ "ld1w { z16.s }, p2/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x658ab213 // bfcvt z19.h, p4/M, z16.s\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z16.s }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x658ab212 // bfcvt z18.h, p4/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x24, #3, MUL VL]\n"
+ ".inst 0x658ab211 // bfcvt z17.h, p4/M, z16.s\n"
"addvl x24, x24, #4\n"
- ".inst 0x648ab27b // bfcvtnt z27.h, p4/M, z19.s\n"
- "ld1w { z19.s }, p3/Z, [x21]\n"
- ".inst 0x648ab25a // bfcvtnt z26.h, p4/M, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x21, #1, MUL VL]\n"
- ".inst 0x648ab239 // bfcvtnt z25.h, p4/M, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25]\n"
".inst 0x648ab218 // bfcvtnt z24.h, p4/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x21, #3, MUL VL]\n"
- "addvl x21, x21, #4\n"
- ".inst 0x648ab277 // bfcvtnt z23.h, p4/M, z19.s\n"
- "st1h { z27.h }, p4, [x25]\n"
- ".inst 0x648ab256 // bfcvtnt z22.h, p4/M, z18.s\n"
- "st1h { z26.h }, p4, [x25, #1, MUL VL]\n"
- ".inst 0x648ab235 // bfcvtnt z21.h, p4/M, z17.s\n"
- "st1h { z25.h }, p4, [x25, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x648ab217 // bfcvtnt z23.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648ab216 // bfcvtnt z22.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x648ab215 // bfcvtnt z21.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
".inst 0x648ab214 // bfcvtnt z20.h, p4/M, z16.s\n"
- "st1h { z24.h }, p4, [x25, #3, MUL VL]\n"
- "st1h { z23.h }, p4, [x25, #4, MUL VL]\n"
- "st1h { z22.h }, p4, [x25, #5, MUL VL]\n"
- "st1h { z21.h }, p4, [x25, #6, MUL VL]\n"
- "st1h { z20.h }, p4, [x25, #7, MUL VL]\n"
- "add x25, x25, %x[out_stride]\n"
+ "ld1w { z16.s }, p2/Z, [x23, #1, MUL VL]\n"
+ ".inst 0x648ab213 // bfcvtnt z19.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x648ab212 // bfcvtnt z18.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0x648ab211 // bfcvtnt z17.h, p4/M, z16.s\n"
+ "st1h { z24.h }, p4, [x22]\n"
+ "st1h { z23.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z21.h }, p4, [x22, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22, #4, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 2b\n"
"3:" // Main row loop: Column loop skip
"cmp %x[height], #0x4\n"
@@ -117,47 +117,47 @@ void sme_transpose_interleave_4VL_2x2_fp32bf16(bfloat16 *out, const float *in, s
"4:" // Main loop skip
"5:" // Tail row loop: Head
"mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "add x24, x26, %x[in_stride]\n"
- "mov x25, %x[out]\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
"sub %x[height], %x[height], #0x2\n"
"mov x21, %x[width]\n"
"6:" // Tail row loop: Column loop
"mov x20, x21\n"
- "decw x21, ALL, MUL #4\n"
"whilelt p3.s, XZR, x20\n"
+ "ld1w { z16.s }, p3/Z, [x26]\n"
+ ".inst 0x658ab214 // bfcvt z20.h, p4/M, z16.s\n"
"decw x20\n"
"whilelt p2.s, XZR, x20\n"
+ "ld1w { z16.s }, p2/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658ab213 // bfcvt z19.h, p4/M, z16.s\n"
"decw x20\n"
- "ld1w { z19.s }, p3/Z, [x26]\n"
"whilelt p1.s, XZR, x20\n"
+ "ld1w { z16.s }, p1/Z, [x26, #2, MUL VL]\n"
+ ".inst 0x658ab212 // bfcvt z18.h, p4/M, z16.s\n"
"decw x20\n"
- "ld1w { z18.s }, p2/Z, [x26, #1, MUL VL]\n"
"whilelt p0.s, XZR, x20\n"
- "ld1w { z17.s }, p1/Z, [x26, #2, MUL VL]\n"
- "cmp x21, #0x0\n"
"ld1w { z16.s }, p0/Z, [x26, #3, MUL VL]\n"
- ".inst 0x658ab277 // bfcvt z23.h, p4/M, z19.s\n"
+ ".inst 0x658ab211 // bfcvt z17.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x25]\n"
+ "decw x21, ALL, MUL #4\n"
+ "cmp x21, #0x0\n"
+ ".inst 0x648ab214 // bfcvtnt z20.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #1, MUL VL]\n"
"addvl x26, x26, #4\n"
- ".inst 0x658ab256 // bfcvt z22.h, p4/M, z18.s\n"
- "ld1w { z21.s }, p3/Z, [x24]\n"
- ".inst 0x658ab234 // bfcvt z20.h, p4/M, z17.s\n"
- "ld1w { z19.s }, p2/Z, [x24, #1, MUL VL]\n"
- ".inst 0x658ab212 // bfcvt z18.h, p4/M, z16.s\n"
- "ld1w { z17.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z16.s }, p0/Z, [x24, #3, MUL VL]\n"
- "addvl x24, x24, #4\n"
- ".inst 0x648ab2b7 // bfcvtnt z23.h, p4/M, z21.s\n"
- ".inst 0x648ab276 // bfcvtnt z22.h, p4/M, z19.s\n"
- ".inst 0x648ab234 // bfcvtnt z20.h, p4/M, z17.s\n"
+ ".inst 0x648ab213 // bfcvtnt z19.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #2, MUL VL]\n"
".inst 0x648ab212 // bfcvtnt z18.h, p4/M, z16.s\n"
- "st1h { z23.h }, p4, [x25]\n"
- "st1h { z22.h }, p4, [x25, #1, MUL VL]\n"
- "st1h { z20.h }, p4, [x25, #2, MUL VL]\n"
- "st1h { z18.h }, p4, [x25, #3, MUL VL]\n"
- "add x25, x25, %x[out_stride]\n"
+ "ld1w { z16.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x648ab211 // bfcvtnt z17.h, p4/M, z16.s\n"
+ "st1h { z20.h }, p4, [x22]\n"
+ "st1h { z19.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 6b\n"
"7:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
index c49c5ba433..98e8bb20a5 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
@@ -39,51 +39,51 @@ void sme_transpose_interleave_8VL(uint16_t *out, const uint16_t *in, size_t widt
"blt 4f\n"
"1:" // Main row loop: Head
"mov x25, %x[in]\n"
- "mov x24, %x[out]\n"
- "add x23, x25, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
"sub %x[height], %x[height], #0x2\n"
- "add %x[in], x23, %x[in_stride]\n"
"mov x22, %x[width]\n"
"2:" // Main row loop: Column loop
"mov x21, x22\n"
- "mov x20, x24\n"
"whilelt p0.h, XZR, x21\n"
+ "ld1h { z31.h }, p0/Z, [x25]\n"
"dech x21\n"
"whilelt p6.h, XZR, x21\n"
+ "ld1h { z30.h }, p6/Z, [x25, #1, MUL VL]\n"
"dech x21\n"
- "ld1h { z31.h }, p0/Z, [x25]\n"
"whilelt p5.h, XZR, x21\n"
+ "ld1h { z29.h }, p5/Z, [x25, #2, MUL VL]\n"
"dech x21\n"
- "ld1h { z30.h }, p6/Z, [x25, #1, MUL VL]\n"
"whilelt p4.h, XZR, x21\n"
+ "ld1h { z28.h }, p4/Z, [x25, #3, MUL VL]\n"
"dech x21\n"
- "ld1h { z29.h }, p5/Z, [x25, #2, MUL VL]\n"
"whilelt p3.h, XZR, x21\n"
+ "ld1h { z27.h }, p3/Z, [x25, #4, MUL VL]\n"
"dech x21\n"
- "ld1h { z28.h }, p4/Z, [x25, #3, MUL VL]\n"
"whilelt p2.h, XZR, x21\n"
+ "ld1h { z26.h }, p2/Z, [x25, #5, MUL VL]\n"
"dech x21\n"
- "ld1h { z27.h }, p3/Z, [x25, #4, MUL VL]\n"
"whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z26.h }, p2/Z, [x25, #5, MUL VL]\n"
- "dech x22, ALL, MUL #8\n"
"ld1h { z25.h }, p1/Z, [x25, #6, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
- "ld1h { z24.h }, p0/Z, [x23]\n"
+ "dech x21\n"
+ "mov x20, x23\n"
+ "ld1h { z24.h }, p0/Z, [x24]\n"
"whilelt p0.h, XZR, x21\n"
- "cmp x22, #0x0\n"
+ "dech x22, ALL, MUL #8\n"
"ld1h { z23.h }, p0/Z, [x25, #7, MUL VL]\n"
+ "ld1h { z22.h }, p6/Z, [x24, #1, MUL VL]\n"
+ "cmp x22, #0x0\n"
"addvl x25, x25, #8\n"
- "ld1h { z22.h }, p6/Z, [x23, #1, MUL VL]\n"
- "ld1h { z21.h }, p5/Z, [x23, #2, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x23, #3, MUL VL]\n"
- "ld1h { z19.h }, p3/Z, [x23, #4, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x23, #5, MUL VL]\n"
- "ld1h { z17.h }, p1/Z, [x23, #6, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x23, #7, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x24, #2, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "ld1h { z20.h }, p4/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z19.h }, p3/Z, [x24, #4, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x24, #5, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x24, #6, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x24, #7, MUL VL]\n"
"st1h { z31.h }, p7, [x20]\n"
- "addvl x23, x23, #8\n"
+ "addvl x24, x24, #8\n"
"st1h { z30.h }, p7, [x20, #1, MUL VL]\n"
"st1h { z29.h }, p7, [x20, #2, MUL VL]\n"
"st1h { z28.h }, p7, [x20, #3, MUL VL]\n"
@@ -109,47 +109,47 @@ void sme_transpose_interleave_8VL(uint16_t *out, const uint16_t *in, size_t widt
"4:" // Main loop skip
"5:" // Tail row loop: Head
"mov x25, %x[in]\n"
- "mov x24, %x[out]\n"
"add %x[in], x25, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
"sub %x[height], %x[height], #0x1\n"
"mov x21, %x[width]\n"
"6:" // Tail row loop: Column loop
"mov x20, x21\n"
- "dech x21, ALL, MUL #8\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z23.h }, p0/Z, [x25]\n"
"dech x20\n"
"whilelt p0.h, XZR, x20\n"
+ "ld1h { z22.h }, p0/Z, [x25, #1, MUL VL]\n"
"dech x20\n"
- "ld1h { z23.h }, p1/Z, [x25]\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z21.h }, p0/Z, [x25, #2, MUL VL]\n"
"dech x20\n"
- "ld1h { z22.h }, p0/Z, [x25, #1, MUL VL]\n"
"whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x25, #3, MUL VL]\n"
"dech x20\n"
- "ld1h { z21.h }, p1/Z, [x25, #2, MUL VL]\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z19.h }, p0/Z, [x25, #4, MUL VL]\n"
"dech x20\n"
- "ld1h { z20.h }, p0/Z, [x25, #3, MUL VL]\n"
"whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x25, #5, MUL VL]\n"
"dech x20\n"
- "ld1h { z19.h }, p1/Z, [x25, #4, MUL VL]\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x25, #6, MUL VL]\n"
"dech x20\n"
- "ld1h { z18.h }, p0/Z, [x25, #5, MUL VL]\n"
+ "dech x21, ALL, MUL #8\n"
"whilelt p0.h, XZR, x20\n"
"cmp x21, #0x0\n"
- "ld1h { z17.h }, p1/Z, [x25, #6, MUL VL]\n"
"ld1h { z16.h }, p0/Z, [x25, #7, MUL VL]\n"
+ "st1h { z23.h }, p7, [x23]\n"
"addvl x25, x25, #8\n"
- "st1h { z23.h }, p7, [x24]\n"
- "st1h { z22.h }, p7, [x24, #1, MUL VL]\n"
- "st1h { z21.h }, p7, [x24, #2, MUL VL]\n"
- "st1h { z20.h }, p7, [x24, #3, MUL VL]\n"
- "st1h { z19.h }, p7, [x24, #4, MUL VL]\n"
- "st1h { z18.h }, p7, [x24, #5, MUL VL]\n"
- "st1h { z17.h }, p7, [x24, #6, MUL VL]\n"
- "st1h { z16.h }, p7, [x24, #7, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
+ "st1h { z22.h }, p7, [x23, #1, MUL VL]\n"
+ "st1h { z21.h }, p7, [x23, #2, MUL VL]\n"
+ "st1h { z20.h }, p7, [x23, #3, MUL VL]\n"
+ "st1h { z19.h }, p7, [x23, #4, MUL VL]\n"
+ "st1h { z18.h }, p7, [x23, #5, MUL VL]\n"
+ "st1h { z17.h }, p7, [x23, #6, MUL VL]\n"
+ "st1h { z16.h }, p7, [x23, #7, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
"bgt 6b\n"
"7:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
index 30a9dc3e9c..5d9c123835 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
@@ -43,62 +43,62 @@ void sme_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"ptrue p2.b\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "cmp %x[height], #0x3\n"
"add x25, x26, %x[in_stride]\n"
- "mov x24, %x[out]\n"
- "add x23, x25, %x[in_stride]\n"
- "mov x22, %x[width]\n"
- "add x21, x23, %x[in_stride]\n"
- "csel x23, x23, %x[pad_row], GE\n"
- "add %x[in], x21, %x[in_stride]\n"
- "csel x21, x21, %x[pad_row], GT\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "sub %x[height], %x[height], #0x4\n"
+ "mov x22, %x[out]\n"
"csel x25, x25, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
"2:" // Main row loop: Column loop
- "mov x20, x22\n"
- "decw x22, ALL, MUL #8\n"
+ "mov x20, x21\n"
"whilelt p1.b, XZR, x20\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
"decb x20\n"
"whilelt p0.b, XZR, x20\n"
- "ld1b { z20.b }, p1/Z, [x26]\n"
- "cmp x22, #0x0\n"
- "ld1b { z24.b }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z17.b }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "decw x21, ALL, MUL #8\n"
+ "cmp x21, #0x0\n"
+ "ld1b { z21.b }, p0/Z, [x25, #1, MUL VL]\n"
"addvl x26, x26, #2\n"
- "ld1b { z23.b }, p1/Z, [x25]\n"
- "ld1b { z22.b }, p0/Z, [x25, #1, MUL VL]\n"
"addvl x25, x25, #2\n"
- "ld1b { z19.b }, p1/Z, [x23]\n"
- "ld1b { z18.b }, p0/Z, [x23, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z24.b, z19.b, z16.b\n"
+ "zip2 z20.b, z19.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x24, #1, MUL VL]\n"
+ "zip1 z23.b, z17.b, z16.b\n"
+ "zip2 z22.b, z17.b, z16.b\n"
+ "addvl x24, x24, #2\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip1 z17.b, z18.b, z16.b\n"
+ "zip2 z19.b, z18.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, #1, MUL VL]\n"
+ "zip1 z18.b, z21.b, z16.b\n"
+ "zip2 z21.b, z21.b, z16.b\n"
"addvl x23, x23, #2\n"
- "ld1b { z17.b }, p1/Z, [x21]\n"
- "ld1b { z16.b }, p0/Z, [x21, #1, MUL VL]\n"
- "zip1 z21.b, z20.b, z19.b\n"
+ "zip1 z16.b, z24.b, z17.b\n"
+ "zip2 z17.b, z24.b, z17.b\n"
+ "st1b { z16.b }, p2, [x22]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
"zip2 z20.b, z20.b, z19.b\n"
- "addvl x21, x21, #2\n"
- "zip1 z25.b, z24.b, z18.b\n"
- "zip2 z24.b, z24.b, z18.b\n"
- "zip1 z19.b, z23.b, z17.b\n"
- "zip2 z18.b, z23.b, z17.b\n"
- "zip1 z17.b, z22.b, z16.b\n"
- "zip2 z16.b, z22.b, z16.b\n"
- "zip1 z23.b, z21.b, z19.b\n"
- "zip2 z22.b, z21.b, z19.b\n"
- "zip1 z21.b, z20.b, z18.b\n"
- "zip2 z20.b, z20.b, z18.b\n"
- "zip1 z19.b, z25.b, z17.b\n"
- "zip2 z18.b, z25.b, z17.b\n"
- "zip1 z17.b, z24.b, z16.b\n"
- "zip2 z16.b, z24.b, z16.b\n"
- "st1b { z23.b }, p2, [x24]\n"
- "st1b { z22.b }, p2, [x24, #1, MUL VL]\n"
- "st1b { z21.b }, p2, [x24, #2, MUL VL]\n"
- "st1b { z20.b }, p2, [x24, #3, MUL VL]\n"
- "st1b { z19.b }, p2, [x24, #4, MUL VL]\n"
- "st1b { z18.b }, p2, [x24, #5, MUL VL]\n"
- "st1b { z17.b }, p2, [x24, #6, MUL VL]\n"
- "st1b { z16.b }, p2, [x24, #7, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
+ "st1b { z17.b }, p2, [x22, #1, MUL VL]\n"
+ "zip1 z19.b, z23.b, z18.b\n"
+ "zip2 z18.b, z23.b, z18.b\n"
+ "st1b { z16.b }, p2, [x22, #2, MUL VL]\n"
+ "zip1 z17.b, z22.b, z21.b\n"
+ "zip2 z16.b, z22.b, z21.b\n"
+ "st1b { z20.b }, p2, [x22, #3, MUL VL]\n"
+ "st1b { z19.b }, p2, [x22, #4, MUL VL]\n"
+ "st1b { z18.b }, p2, [x22, #5, MUL VL]\n"
+ "st1b { z17.b }, p2, [x22, #6, MUL VL]\n"
+ "st1b { z16.b }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 2b\n"
"3:" // Main row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
index 75bc57a649..9e1b2dca3e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
@@ -43,40 +43,40 @@ void sme_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t
"ptrue p4.b\n"
"1:" // Main row loop: Head
"mov x24, %x[in]\n"
- "cmp %x[height], #0x1\n"
"add x23, x24, %x[in_stride]\n"
- "mov x22, %x[out]\n"
+ "cmp %x[height], #0x1\n"
"add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"csel x23, x23, %x[pad_row], GT\n"
"sub %x[height], %x[height], #0x2\n"
"mov x21, %x[width]\n"
"2:" // Main row loop: Column loop
"mov x20, x21\n"
- "decw x21, ALL, MUL #8\n"
"whilelt p3.h, XZR, x20\n"
+ "ld1h { z20.h }, p3/Z, [x24]\n"
"dech x20\n"
"whilelt p2.h, XZR, x20\n"
+ "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n"
"dech x20\n"
- "ld1h { z21.h }, p3/Z, [x24]\n"
"whilelt p1.h, XZR, x20\n"
+ "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
"dech x20\n"
- "ld1h { z20.h }, p2/Z, [x24, #1, MUL VL]\n"
"whilelt p0.h, XZR, x20\n"
- "ld1h { z25.h }, p1/Z, [x24, #2, MUL VL]\n"
- "cmp x21, #0x0\n"
"ld1h { z24.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z17.h }, p3/Z, [x23]\n"
+ "decw x21, ALL, MUL #8\n"
+ "cmp x21, #0x0\n"
+ "zip1 z23.h, z20.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, #1, MUL VL]\n"
"addvl x24, x24, #4\n"
- "ld1h { z19.h }, p3/Z, [x23]\n"
- "ld1h { z18.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip2 z22.h, z20.h, z17.h\n"
+ "zip1 z21.h, z19.h, z16.h\n"
"ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "zip2 z20.h, z19.h, z16.h\n"
+ "zip1 z19.h, z18.h, z17.h\n"
"ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
- "zip1 z23.h, z21.h, z19.h\n"
- "zip2 z22.h, z21.h, z19.h\n"
- "zip1 z21.h, z20.h, z18.h\n"
- "zip2 z20.h, z20.h, z18.h\n"
- "zip1 z19.h, z25.h, z17.h\n"
- "zip2 z18.h, z25.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
"zip1 z17.h, z24.h, z16.h\n"
"zip2 z16.h, z24.h, z16.h\n"
"st1h { z23.h }, p4, [x22]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
index 86dcddd07b..7e9b40b0d0 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
@@ -39,314 +39,314 @@ void sve_transpose_interleave_12VL_2x4_fp32bf16(bfloat16 *out, const float *in,
size_t out_stride = 12 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
__asm__ __volatile__(
- "ptrue p2.b\n"
+ "ptrue p6.b\n"
"1:" // Main row loop: Head
"mov x28, %x[in]\n"
- "mov x27, %x[width]\n"
- "cnth x26, ALL, MUL #6\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "mov x25, %x[width]\n"
+ "cnth x24, ALL, MUL #6\n"
+ "add x23, x26, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x25, %x[out]\n"
- "add x24, x28, %x[in_stride]\n"
- "add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add %x[in], x22, %x[in_stride]\n"
- "csel x22, x22, %x[pad_row], GT\n"
- "csel x23, x23, %x[pad_row], GE\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "cmp x27, x26\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "cmp x25, x24\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1w { z16.s }, p2/Z, [x28]\n"
- "ld1w { z22.s }, p2/Z, [x28, #1, MUL VL]\n"
- "mov x21, x25\n"
- "add x25, x25, %x[out_stride]\n"
- "ld1w { z30.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x28, #3, MUL VL]\n"
- "mov x20, x25\n"
- "sub x27, x27, x26\n"
- "ld1w { z23.s }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x28, #5, MUL VL]\n"
- "cmp x27, x26\n"
- "add x25, x25, %x[out_stride]\n"
- "ld1w { z17.s }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1w { z0.s }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1w { z22.s }, p6/Z, [x28]\n"
+ "ld1w { z7.s }, p6/Z, [x28, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z19.s }, p6/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z18.s }, p6/Z, [x28, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "sub x25, x25, x24\n"
+ "ld1w { z5.s }, p6/Z, [x28, #4, MUL VL]\n"
+ "ld1w { z25.s }, p6/Z, [x28, #5, MUL VL]\n"
+ "cmp x25, x24\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z20.s }, p6/Z, [x28, #6, MUL VL]\n"
+ "ld1w { z23.s }, p6/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #12\n"
- "ld1w { z10.s }, p2/Z, [x23]\n"
- "ld1w { z14.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z12.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x23, #3, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x23, #4, MUL VL]\n"
- "ld1w { z31.s }, p2/Z, [x23, #5, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x23, #6, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x23, #7, MUL VL]\n"
+ "ld1w { z4.s }, p6/Z, [x26]\n"
+ "ld1w { z10.s }, p6/Z, [x26, #1, MUL VL]\n"
+ "zip1 z14.s, z22.s, z4.s\n"
+ "zip2 z22.s, z22.s, z4.s\n"
+ "ld1w { z28.s }, p6/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z27.s }, p6/Z, [x26, #3, MUL VL]\n"
+ "zip1 z24.s, z7.s, z10.s\n"
+ "zip2 z15.s, z7.s, z10.s\n"
+ "ld1w { z7.s }, p6/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z2.s }, p6/Z, [x26, #5, MUL VL]\n"
+ "zip1 z9.s, z19.s, z28.s\n"
+ "zip2 z0.s, z19.s, z28.s\n"
+ "ld1w { z19.s }, p6/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z16.s }, p6/Z, [x26, #7, MUL VL]\n"
+ "addvl x26, x26, #12\n"
+ "zip1 z1.s, z18.s, z27.s\n"
+ "ld1w { z30.s }, p6/Z, [x28, #-4, MUL VL]\n"
+ "ld1w { z29.s }, p6/Z, [x28, #-3, MUL VL]\n"
+ "zip2 z17.s, z18.s, z27.s\n"
+ ".inst 0x658ab9d5 // bfcvt z21.h, p6/M, z14.s\n"
+ "ld1w { z31.s }, p6/Z, [x27]\n"
+ "ld1w { z8.s }, p6/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x658abacc // bfcvt z12.h, p6/M, z22.s\n"
+ ".inst 0x658abb0e // bfcvt z14.h, p6/M, z24.s\n"
+ "ld1w { z22.s }, p6/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z28.s }, p6/Z, [x27, #3, MUL VL]\n"
+ ".inst 0x658ab9ea // bfcvt z10.h, p6/M, z15.s\n"
+ ".inst 0x658ab92f // bfcvt z15.h, p6/M, z9.s\n"
+ "ld1w { z27.s }, p6/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z13.s }, p6/Z, [x27, #5, MUL VL]\n"
+ ".inst 0x658ab803 // bfcvt z3.h, p6/M, z0.s\n"
+ ".inst 0x658ab832 // bfcvt z18.h, p6/M, z1.s\n"
+ "ld1w { z26.s }, p6/Z, [x27, #6, MUL VL]\n"
+ "ld1w { z9.s }, p6/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #12\n"
+ ".inst 0x658aba26 // bfcvt z6.h, p6/M, z17.s\n"
+ "ld1w { z1.s }, p6/Z, [x26, #-4, MUL VL]\n"
+ "ld1w { z0.s }, p6/Z, [x26, #-3, MUL VL]\n"
+ "zip1 z17.s, z5.s, z7.s\n"
+ "zip2 z5.s, z5.s, z7.s\n"
+ "ld1w { z24.s }, p6/Z, [x23]\n"
+ "ld1w { z11.s }, p6/Z, [x23, #1, MUL VL]\n"
+ "zip1 z7.s, z31.s, z24.s\n"
+ "zip2 z31.s, z31.s, z24.s\n"
+ "ld1w { z4.s }, p6/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z24.s }, p6/Z, [x23, #3, MUL VL]\n"
+ ".inst 0x648ab8f5 // bfcvtnt z21.h, p6/M, z7.s\n"
+ "zip1 z7.s, z8.s, z11.s\n"
+ "zip2 z11.s, z8.s, z11.s\n"
+ "ld1w { z8.s }, p6/Z, [x23, #4, MUL VL]\n"
+ ".inst 0x648abbec // bfcvtnt z12.h, p6/M, z31.s\n"
+ "ld1w { z31.s }, p6/Z, [x23, #5, MUL VL]\n"
+ ".inst 0x648ab8ee // bfcvtnt z14.h, p6/M, z7.s\n"
+ "ld1w { z7.s }, p6/Z, [x23, #6, MUL VL]\n"
+ ".inst 0x648ab96a // bfcvtnt z10.h, p6/M, z11.s\n"
+ "zip1 z11.s, z22.s, z4.s\n"
+ "zip2 z4.s, z22.s, z4.s\n"
+ "ld1w { z22.s }, p6/Z, [x23, #7, MUL VL]\n"
"addvl x23, x23, #12\n"
- "zip1 z26.s, z16.s, z10.s\n"
- "ld1w { z2.s }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1w { z24.s }, p2/Z, [x28, #-3, MUL VL]\n"
- "zip2 z15.s, z16.s, z10.s\n"
- "zip1 z6.s, z22.s, z14.s\n"
- "ld1w { z27.s }, p2/Z, [x24]\n"
- "ld1w { z18.s }, p2/Z, [x24, #1, MUL VL]\n"
- "zip2 z28.s, z22.s, z14.s\n"
- "zip1 z25.s, z30.s, z12.s\n"
- "ld1w { z21.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x24, #3, MUL VL]\n"
- "zip2 z7.s, z30.s, z12.s\n"
- "zip1 z9.s, z11.s, z13.s\n"
- "ld1w { z4.s }, p2/Z, [x24, #4, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x24, #5, MUL VL]\n"
- "zip2 z16.s, z11.s, z13.s\n"
- ".inst 0x658aab4c // bfcvt z12.h, p2/M, z26.s\n"
- "ld1w { z14.s }, p2/Z, [x24, #6, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x24, #7, MUL VL]\n"
- "addvl x24, x24, #12\n"
- ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
- "ld1w { z26.s }, p2/Z, [x23, #-4, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x23, #-3, MUL VL]\n"
- ".inst 0x658aa8cd // bfcvt z13.h, p2/M, z6.s\n"
- ".inst 0x658aab8a // bfcvt z10.h, p2/M, z28.s\n"
- "ld1w { z28.s }, p2/Z, [x22]\n"
- "ld1w { z8.s }, p2/Z, [x22, #1, MUL VL]\n"
- ".inst 0x658aab39 // bfcvt z25.h, p2/M, z25.s\n"
- ".inst 0x658aa8e6 // bfcvt z6.h, p2/M, z7.s\n"
- "ld1w { z11.s }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0x658aa927 // bfcvt z7.h, p2/M, z9.s\n"
- ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
- "zip1 z9.s, z23.s, z29.s\n"
- "zip2 z23.s, z23.s, z29.s\n"
- "zip1 z29.s, z27.s, z28.s\n"
- "zip2 z27.s, z27.s, z28.s\n"
- "ld1w { z28.s }, p2/Z, [x22, #3, MUL VL]\n"
- ".inst 0x658aa929 // bfcvt z9.h, p2/M, z9.s\n"
- ".inst 0x658aaaf7 // bfcvt z23.h, p2/M, z23.s\n"
- ".inst 0x648aabac // bfcvtnt z12.h, p2/M, z29.s\n"
- "ld1w { z29.s }, p2/Z, [x22, #4, MUL VL]\n"
- ".inst 0x648aab6f // bfcvtnt z15.h, p2/M, z27.s\n"
- "zip1 z27.s, z18.s, z8.s\n"
- "zip2 z8.s, z18.s, z8.s\n"
- "ld1w { z18.s }, p2/Z, [x22, #5, MUL VL]\n"
- ".inst 0x648aab6d // bfcvtnt z13.h, p2/M, z27.s\n"
- "ld1w { z27.s }, p2/Z, [x22, #6, MUL VL]\n"
- ".inst 0x648aa90a // bfcvtnt z10.h, p2/M, z8.s\n"
- "zip1 z8.s, z21.s, z11.s\n"
- "zip2 z21.s, z21.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x22, #7, MUL VL]\n"
- "addvl x22, x22, #12\n"
- ".inst 0x648aa919 // bfcvtnt z25.h, p2/M, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x28, #-2, MUL VL]\n"
- ".inst 0x648aaaa6 // bfcvtnt z6.h, p2/M, z21.s\n"
- "zip1 z21.s, z3.s, z28.s\n"
- "zip2 z3.s, z3.s, z28.s\n"
- "ld1w { z28.s }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x648aaaa7 // bfcvtnt z7.h, p2/M, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x24, #-4, MUL VL]\n"
- ".inst 0x648aa870 // bfcvtnt z16.h, p2/M, z3.s\n"
- "zip1 z3.s, z20.s, z31.s\n"
- "zip2 z31.s, z20.s, z31.s\n"
- "zip1 z20.s, z17.s, z19.s\n"
- "zip2 z17.s, z17.s, z19.s\n"
- "zip1 z19.s, z0.s, z1.s\n"
- "zip2 z1.s, z0.s, z1.s\n"
- "zip1 z0.s, z2.s, z26.s\n"
- "zip2 z2.s, z2.s, z26.s\n"
- "zip1 z26.s, z24.s, z5.s\n"
- "zip2 z24.s, z24.s, z5.s\n"
- "zip1 z5.s, z4.s, z29.s\n"
- "zip2 z4.s, z4.s, z29.s\n"
- "ld1w { z29.s }, p2/Z, [x24, #-3, MUL VL]\n"
- ".inst 0x658aa863 // bfcvt z3.h, p2/M, z3.s\n"
- ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
- ".inst 0x658aaa94 // bfcvt z20.h, p2/M, z20.s\n"
- ".inst 0x658aaa31 // bfcvt z17.h, p2/M, z17.s\n"
- ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
- ".inst 0x658aa821 // bfcvt z1.h, p2/M, z1.s\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- ".inst 0x658aa842 // bfcvt z2.h, p2/M, z2.s\n"
- ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
- ".inst 0x658aab18 // bfcvt z24.h, p2/M, z24.s\n"
- ".inst 0x648aa8a9 // bfcvtnt z9.h, p2/M, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x23, #-2, MUL VL]\n"
- ".inst 0x648aa897 // bfcvtnt z23.h, p2/M, z4.s\n"
- "zip1 z4.s, z22.s, z18.s\n"
- "zip2 z22.s, z22.s, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x23, #-1, MUL VL]\n"
- ".inst 0x648aa883 // bfcvtnt z3.h, p2/M, z4.s\n"
- "ld1w { z4.s }, p2/Z, [x22, #-4, MUL VL]\n"
- ".inst 0x648aaadf // bfcvtnt z31.h, p2/M, z22.s\n"
- "zip1 z22.s, z14.s, z27.s\n"
- "zip2 z14.s, z14.s, z27.s\n"
- "ld1w { z27.s }, p2/Z, [x22, #-3, MUL VL]\n"
- ".inst 0x648aaad4 // bfcvtnt z20.h, p2/M, z22.s\n"
- "ld1w { z22.s }, p2/Z, [x24, #-2, MUL VL]\n"
- ".inst 0x648aa9d1 // bfcvtnt z17.h, p2/M, z14.s\n"
- "zip1 z14.s, z30.s, z11.s\n"
- "zip2 z11.s, z30.s, z11.s\n"
- "ld1w { z30.s }, p2/Z, [x24, #-1, MUL VL]\n"
- ".inst 0x648aa9d3 // bfcvtnt z19.h, p2/M, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x22, #-2, MUL VL]\n"
- ".inst 0x648aa961 // bfcvtnt z1.h, p2/M, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x22, #-1, MUL VL]\n"
- "st1h { z12.h }, p2, [x21]\n"
- "zip1 z12.s, z21.s, z4.s\n"
- "zip2 z21.s, z21.s, z4.s\n"
- "zip1 z4.s, z29.s, z27.s\n"
- "zip2 z29.s, z29.s, z27.s\n"
- "st1h { z15.h }, p2, [x21, #1, MUL VL]\n"
- "zip1 z27.s, z8.s, z5.s\n"
- "zip2 z8.s, z8.s, z5.s\n"
- "st1h { z13.h }, p2, [x21, #2, MUL VL]\n"
- "zip1 z5.s, z28.s, z18.s\n"
- "zip2 z28.s, z28.s, z18.s\n"
- "st1h { z10.h }, p2, [x21, #3, MUL VL]\n"
- "st1h { z25.h }, p2, [x21, #4, MUL VL]\n"
- ".inst 0x648aa980 // bfcvtnt z0.h, p2/M, z12.s\n"
- ".inst 0x648aaaa2 // bfcvtnt z2.h, p2/M, z21.s\n"
- "st1h { z6.h }, p2, [x21, #5, MUL VL]\n"
- ".inst 0x648aa89a // bfcvtnt z26.h, p2/M, z4.s\n"
- ".inst 0x648aabb8 // bfcvtnt z24.h, p2/M, z29.s\n"
- "st1h { z7.h }, p2, [x21, #6, MUL VL]\n"
- ".inst 0x658aab7b // bfcvt z27.h, p2/M, z27.s\n"
- "zip1 z25.s, z22.s, z14.s\n"
- "st1h { z16.h }, p2, [x21, #7, MUL VL]\n"
+ ".inst 0x648ab96f // bfcvtnt z15.h, p6/M, z11.s\n"
+ "ld1w { z11.s }, p6/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x648ab883 // bfcvtnt z3.h, p6/M, z4.s\n"
+ "zip1 z4.s, z28.s, z24.s\n"
+ "zip2 z24.s, z28.s, z24.s\n"
+ "ld1w { z28.s }, p6/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x648ab892 // bfcvtnt z18.h, p6/M, z4.s\n"
+ "ld1w { z4.s }, p6/Z, [x27, #-4, MUL VL]\n"
+ ".inst 0x648abb06 // bfcvtnt z6.h, p6/M, z24.s\n"
+ "zip1 z24.s, z25.s, z2.s\n"
+ "zip2 z25.s, z25.s, z2.s\n"
+ "zip1 z2.s, z20.s, z19.s\n"
+ "zip2 z20.s, z20.s, z19.s\n"
+ "zip1 z19.s, z23.s, z16.s\n"
+ "zip2 z16.s, z23.s, z16.s\n"
+ "zip1 z23.s, z30.s, z1.s\n"
+ "zip2 z30.s, z30.s, z1.s\n"
+ "zip1 z1.s, z29.s, z0.s\n"
+ "zip2 z0.s, z29.s, z0.s\n"
+ ".inst 0x658aba31 // bfcvt z17.h, p6/M, z17.s\n"
+ "zip1 z29.s, z27.s, z8.s\n"
+ ".inst 0x658ab8a5 // bfcvt z5.h, p6/M, z5.s\n"
+ "zip2 z27.s, z27.s, z8.s\n"
+ "ld1w { z8.s }, p6/Z, [x27, #-3, MUL VL]\n"
+ ".inst 0x658abb18 // bfcvt z24.h, p6/M, z24.s\n"
+ ".inst 0x658abb39 // bfcvt z25.h, p6/M, z25.s\n"
+ ".inst 0x658ab842 // bfcvt z2.h, p6/M, z2.s\n"
+ ".inst 0x658aba94 // bfcvt z20.h, p6/M, z20.s\n"
+ ".inst 0x658aba73 // bfcvt z19.h, p6/M, z19.s\n"
+ ".inst 0x658aba10 // bfcvt z16.h, p6/M, z16.s\n"
+ ".inst 0x658abaf7 // bfcvt z23.h, p6/M, z23.s\n"
+ ".inst 0x658abbde // bfcvt z30.h, p6/M, z30.s\n"
+ ".inst 0x658ab821 // bfcvt z1.h, p6/M, z1.s\n"
+ ".inst 0x658ab800 // bfcvt z0.h, p6/M, z0.s\n"
+ ".inst 0x648abbb1 // bfcvtnt z17.h, p6/M, z29.s\n"
+ "ld1w { z29.s }, p6/Z, [x26, #-2, MUL VL]\n"
+ ".inst 0x648abb65 // bfcvtnt z5.h, p6/M, z27.s\n"
+ "zip1 z27.s, z13.s, z31.s\n"
+ "zip2 z31.s, z13.s, z31.s\n"
+ "ld1w { z13.s }, p6/Z, [x26, #-1, MUL VL]\n"
+ ".inst 0x648abb78 // bfcvtnt z24.h, p6/M, z27.s\n"
+ "ld1w { z27.s }, p6/Z, [x23, #-4, MUL VL]\n"
+ ".inst 0x648abbf9 // bfcvtnt z25.h, p6/M, z31.s\n"
+ "zip1 z31.s, z26.s, z7.s\n"
+ "zip2 z26.s, z26.s, z7.s\n"
+ "ld1w { z7.s }, p6/Z, [x23, #-3, MUL VL]\n"
+ ".inst 0x648abbe2 // bfcvtnt z2.h, p6/M, z31.s\n"
+ "ld1w { z31.s }, p6/Z, [x27, #-2, MUL VL]\n"
+ ".inst 0x648abb54 // bfcvtnt z20.h, p6/M, z26.s\n"
+ "zip1 z26.s, z9.s, z22.s\n"
+ "zip2 z9.s, z9.s, z22.s\n"
+ "ld1w { z22.s }, p6/Z, [x27, #-1, MUL VL]\n"
+ ".inst 0x648abb53 // bfcvtnt z19.h, p6/M, z26.s\n"
+ "ld1w { z26.s }, p6/Z, [x23, #-2, MUL VL]\n"
+ ".inst 0x648ab930 // bfcvtnt z16.h, p6/M, z9.s\n"
+ "ld1w { z9.s }, p6/Z, [x23, #-1, MUL VL]\n"
+ "st1h { z21.h }, p6, [x21]\n"
+ "zip1 z21.s, z4.s, z27.s\n"
+ "zip2 z27.s, z4.s, z27.s\n"
+ "zip1 z4.s, z8.s, z7.s\n"
+ "zip2 z8.s, z8.s, z7.s\n"
+ "st1h { z12.h }, p6, [x21, #1, MUL VL]\n"
+ "zip1 z7.s, z11.s, z29.s\n"
+ "zip2 z11.s, z11.s, z29.s\n"
+ "st1h { z14.h }, p6, [x21, #2, MUL VL]\n"
+ "zip1 z29.s, z28.s, z13.s\n"
+ "zip2 z12.s, z28.s, z13.s\n"
+ "st1h { z10.h }, p6, [x21, #3, MUL VL]\n"
+ "st1h { z15.h }, p6, [x21, #4, MUL VL]\n"
+ ".inst 0x648abab7 // bfcvtnt z23.h, p6/M, z21.s\n"
+ ".inst 0x648abb7e // bfcvtnt z30.h, p6/M, z27.s\n"
+ "st1h { z3.h }, p6, [x21, #5, MUL VL]\n"
+ ".inst 0x648ab881 // bfcvtnt z1.h, p6/M, z4.s\n"
+ ".inst 0x648ab900 // bfcvtnt z0.h, p6/M, z8.s\n"
+ "st1h { z18.h }, p6, [x21, #6, MUL VL]\n"
+ ".inst 0x658ab8e8 // bfcvt z8.h, p6/M, z7.s\n"
+ "zip1 z27.s, z31.s, z26.s\n"
+ "st1h { z6.h }, p6, [x21, #7, MUL VL]\n"
"addvl x21, x21, #12\n"
- ".inst 0x658aa906 // bfcvt z6.h, p2/M, z8.s\n"
- "zip2 z4.s, z22.s, z14.s\n"
- ".inst 0x658aa8b2 // bfcvt z18.h, p2/M, z5.s\n"
- "zip1 z22.s, z30.s, z11.s\n"
- ".inst 0x658aab95 // bfcvt z21.h, p2/M, z28.s\n"
- "zip2 z16.s, z30.s, z11.s\n"
- "st1h { z9.h }, p2, [x21, #-4, MUL VL]\n"
- "st1h { z23.h }, p2, [x21, #-3, MUL VL]\n"
- ".inst 0x648aab3b // bfcvtnt z27.h, p2/M, z25.s\n"
- ".inst 0x648aa886 // bfcvtnt z6.h, p2/M, z4.s\n"
- "st1h { z3.h }, p2, [x21, #-2, MUL VL]\n"
- ".inst 0x648aaad2 // bfcvtnt z18.h, p2/M, z22.s\n"
- "st1h { z31.h }, p2, [x21, #-1, MUL VL]\n"
- ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
- "st1h { z20.h }, p2, [x20]\n"
- "st1h { z17.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z19.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z1.h }, p2, [x20, #3, MUL VL]\n"
- "st1h { z0.h }, p2, [x20, #4, MUL VL]\n"
- "st1h { z2.h }, p2, [x20, #5, MUL VL]\n"
- "st1h { z26.h }, p2, [x20, #6, MUL VL]\n"
- "st1h { z24.h }, p2, [x20, #7, MUL VL]\n"
+ ".inst 0x658ab96e // bfcvt z14.h, p6/M, z11.s\n"
+ "zip2 z28.s, z31.s, z26.s\n"
+ ".inst 0x658abbbd // bfcvt z29.h, p6/M, z29.s\n"
+ "zip1 z21.s, z22.s, z9.s\n"
+ "st1h { z17.h }, p6, [x21, #-4, MUL VL]\n"
+ ".inst 0x658ab992 // bfcvt z18.h, p6/M, z12.s\n"
+ "zip2 z17.s, z22.s, z9.s\n"
+ "st1h { z5.h }, p6, [x21, #-3, MUL VL]\n"
+ "st1h { z24.h }, p6, [x21, #-2, MUL VL]\n"
+ ".inst 0x648abb68 // bfcvtnt z8.h, p6/M, z27.s\n"
+ ".inst 0x648abb8e // bfcvtnt z14.h, p6/M, z28.s\n"
+ "st1h { z25.h }, p6, [x21, #-1, MUL VL]\n"
+ ".inst 0x648ababd // bfcvtnt z29.h, p6/M, z21.s\n"
+ ".inst 0x648aba32 // bfcvtnt z18.h, p6/M, z17.s\n"
+ "st1h { z2.h }, p6, [x20]\n"
+ "st1h { z20.h }, p6, [x20, #1, MUL VL]\n"
+ "st1h { z19.h }, p6, [x20, #2, MUL VL]\n"
+ "st1h { z16.h }, p6, [x20, #3, MUL VL]\n"
+ "st1h { z23.h }, p6, [x20, #4, MUL VL]\n"
+ "st1h { z30.h }, p6, [x20, #5, MUL VL]\n"
+ "st1h { z1.h }, p6, [x20, #6, MUL VL]\n"
+ "st1h { z0.h }, p6, [x20, #7, MUL VL]\n"
"addvl x20, x20, #12\n"
- "st1h { z27.h }, p2, [x20, #-4, MUL VL]\n"
- "st1h { z6.h }, p2, [x20, #-3, MUL VL]\n"
- "st1h { z18.h }, p2, [x20, #-2, MUL VL]\n"
- "st1h { z21.h }, p2, [x20, #-1, MUL VL]\n"
+ "st1h { z8.h }, p6, [x20, #-4, MUL VL]\n"
+ "st1h { z14.h }, p6, [x20, #-3, MUL VL]\n"
+ "st1h { z29.h }, p6, [x20, #-2, MUL VL]\n"
+ "st1h { z18.h }, p6, [x20, #-1, MUL VL]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x27, 5f\n"
+ "cbz x25, 5f\n"
"4:" // Main row loop: Column loop
- "mov x21, x27\n"
"mov x20, x25\n"
- "decd x27, ALL, MUL #12\n"
- "add x25, x25, %x[out_stride]\n"
- "whilelt p1.s, XZR, x21\n"
- "decw x21\n"
- "whilelt p0.s, XZR, x21\n"
- "decw x21\n"
- "ld1w { z19.s }, p1/Z, [x28]\n"
- "ld1w { z18.s }, p1/Z, [x23]\n"
- "ld1w { z30.s }, p1/Z, [x24]\n"
- "ld1w { z29.s }, p1/Z, [x22]\n"
- "whilelt p1.s, XZR, x21\n"
- "decw x21\n"
- "ld1w { z21.s }, p0/Z, [x28, #1, MUL VL]\n"
- "ld1w { z17.s }, p0/Z, [x23, #1, MUL VL]\n"
- "ld1w { z28.s }, p0/Z, [x24, #1, MUL VL]\n"
- "ld1w { z27.s }, p0/Z, [x22, #1, MUL VL]\n"
- "zip1 z16.s, z19.s, z18.s\n"
- "zip2 z26.s, z19.s, z18.s\n"
- "whilelt p0.s, XZR, x21\n"
- "decw x21\n"
- "ld1w { z20.s }, p1/Z, [x28, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z25.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z24.s }, p1/Z, [x22, #2, MUL VL]\n"
- "zip1 z18.s, z21.s, z17.s\n"
- "zip2 z23.s, z21.s, z17.s\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "zip1 z9.s, z30.s, z29.s\n"
- "whilelt p1.s, XZR, x21\n"
- "decw x21\n"
- "ld1w { z17.s }, p0/Z, [x28, #3, MUL VL]\n"
- "ld1w { z16.s }, p0/Z, [x23, #3, MUL VL]\n"
- "zip1 z22.s, z20.s, z19.s\n"
- "zip2 z21.s, z20.s, z19.s\n"
- "ld1w { z20.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x658aab48 // bfcvt z8.h, p2/M, z26.s\n"
- "zip2 z7.s, z30.s, z29.s\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z6.s }, p1/Z, [x28, #4, MUL VL]\n"
- "ld1w { z5.s }, p1/Z, [x23, #4, MUL VL]\n"
- ".inst 0x658aaa44 // bfcvt z4.h, p2/M, z18.s\n"
- "zip1 z18.s, z17.s, z16.s\n"
- "zip2 z17.s, z17.s, z16.s\n"
- "ld1w { z3.s }, p1/Z, [x24, #4, MUL VL]\n"
- "ld1w { z2.s }, p1/Z, [x22, #4, MUL VL]\n"
- "zip1 z1.s, z28.s, z27.s\n"
- ".inst 0x658aaae0 // bfcvt z0.h, p2/M, z23.s\n"
- "cmp x27, #0x0\n"
- "ld1w { z31.s }, p0/Z, [x28, #5, MUL VL]\n"
- "ld1w { z16.s }, p0/Z, [x23, #5, MUL VL]\n"
- "ld1w { z30.s }, p0/Z, [x24, #5, MUL VL]\n"
- "zip2 z29.s, z28.s, z27.s\n"
- ".inst 0x658aaadc // bfcvt z28.h, p2/M, z22.s\n"
- "ld1w { z27.s }, p0/Z, [x22, #5, MUL VL]\n"
- "zip1 z23.s, z25.s, z24.s\n"
- ".inst 0x658aaaba // bfcvt z26.h, p2/M, z21.s\n"
+ "whilelt p5.s, XZR, x20\n"
+ "ld1w { z22.s }, p5/Z, [x28]\n"
+ "ld1w { z21.s }, p5/Z, [x26]\n"
+ "decw x20\n"
+ "whilelt p4.s, XZR, x20\n"
+ "ld1w { z20.s }, p4/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x26, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p3.s, XZR, x20\n"
+ "ld1w { z18.s }, p3/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x26, #2, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p2.s, XZR, x20\n"
+ "ld1w { z30.s }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x26, #3, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p1.s, XZR, x20\n"
+ "ld1w { z13.s }, p1/Z, [x28, #4, MUL VL]\n"
+ "ld1w { z29.s }, p5/Z, [x27]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z12.s }, p0/Z, [x28, #5, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z11.s }, p3/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "zip1 z27.s, z22.s, z21.s\n"
+ "zip2 z26.s, z22.s, z21.s\n"
+ "ld1w { z9.s }, p1/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z8.s }, p0/Z, [x26, #5, MUL VL]\n"
+ "zip1 z25.s, z20.s, z19.s\n"
+ "zip2 z24.s, z20.s, z19.s\n"
+ "ld1w { z23.s }, p5/Z, [x23]\n"
+ "ld1w { z22.s }, p4/Z, [x23, #1, MUL VL]\n"
+ "zip1 z21.s, z18.s, z17.s\n"
+ "zip2 z20.s, z18.s, z17.s\n"
+ "ld1w { z19.s }, p3/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #3, MUL VL]\n"
+ "zip1 z17.s, z30.s, z16.s\n"
+ "zip2 z16.s, z30.s, z16.s\n"
+ "ld1w { z7.s }, p1/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z6.s }, p0/Z, [x27, #5, MUL VL]\n"
+ ".inst 0x658abb65 // bfcvt z5.h, p6/M, z27.s\n"
+ "zip1 z4.s, z29.s, z23.s\n"
+ "ld1w { z3.s }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x23, #5, MUL VL]\n"
+ ".inst 0x658abb41 // bfcvt z1.h, p6/M, z26.s\n"
+ "zip2 z0.s, z29.s, z23.s\n"
+ ".inst 0x658abb3f // bfcvt z31.h, p6/M, z25.s\n"
+ "zip1 z30.s, z28.s, z22.s\n"
+ "mov x20, x22\n"
+ "decd x25, ALL, MUL #12\n"
+ ".inst 0x658abb1d // bfcvt z29.h, p6/M, z24.s\n"
+ "zip2 z28.s, z28.s, z22.s\n"
+ "cmp x25, #0x0\n"
"addvl x28, x28, #6\n"
- "zip2 z22.s, z25.s, z24.s\n"
- ".inst 0x658aaa59 // bfcvt z25.h, p2/M, z18.s\n"
- "addvl x24, x24, #6\n"
+ ".inst 0x658ababb // bfcvt z27.h, p6/M, z21.s\n"
+ "zip1 z23.s, z11.s, z19.s\n"
+ "addvl x27, x27, #6\n"
+ "addvl x26, x26, #6\n"
+ ".inst 0x658aba9a // bfcvt z26.h, p6/M, z20.s\n"
+ "zip2 z22.s, z11.s, z19.s\n"
"addvl x23, x23, #6\n"
- "zip1 z21.s, z20.s, z19.s\n"
- ".inst 0x658aaa38 // bfcvt z24.h, p2/M, z17.s\n"
- "addvl x22, x22, #6\n"
- "zip2 z20.s, z20.s, z19.s\n"
- "zip1 z19.s, z6.s, z5.s\n"
- "zip2 z18.s, z6.s, z5.s\n"
- "zip1 z17.s, z31.s, z16.s\n"
- "zip2 z16.s, z31.s, z16.s\n"
- ".inst 0x648aa92a // bfcvtnt z10.h, p2/M, z9.s\n"
- ".inst 0x648aa8e8 // bfcvtnt z8.h, p2/M, z7.s\n"
- ".inst 0x648aa824 // bfcvtnt z4.h, p2/M, z1.s\n"
- ".inst 0x648aaba0 // bfcvtnt z0.h, p2/M, z29.s\n"
- ".inst 0x648aaafc // bfcvtnt z28.h, p2/M, z23.s\n"
- ".inst 0x648aaada // bfcvtnt z26.h, p2/M, z22.s\n"
- ".inst 0x648aaab9 // bfcvtnt z25.h, p2/M, z21.s\n"
- "st1h { z10.h }, p2, [x20]\n"
- ".inst 0x648aaa98 // bfcvtnt z24.h, p2/M, z20.s\n"
- ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
- "zip1 z22.s, z3.s, z2.s\n"
- ".inst 0x658aaa55 // bfcvt z21.h, p2/M, z18.s\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
- "zip2 z20.s, z3.s, z2.s\n"
- ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
- "st1h { z0.h }, p2, [x20, #3, MUL VL]\n"
- "zip1 z18.s, z30.s, z27.s\n"
- ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
- "st1h { z28.h }, p2, [x20, #4, MUL VL]\n"
- "zip2 z16.s, z30.s, z27.s\n"
- "st1h { z26.h }, p2, [x20, #5, MUL VL]\n"
- ".inst 0x648aaad7 // bfcvtnt z23.h, p2/M, z22.s\n"
- "st1h { z25.h }, p2, [x20, #6, MUL VL]\n"
- ".inst 0x648aaa95 // bfcvtnt z21.h, p2/M, z20.s\n"
- "st1h { z24.h }, p2, [x20, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ ".inst 0x658aba39 // bfcvt z25.h, p6/M, z17.s\n"
+ "zip1 z21.s, z10.s, z18.s\n"
+ ".inst 0x658aba18 // bfcvt z24.h, p6/M, z16.s\n"
+ "zip2 z20.s, z10.s, z18.s\n"
+ "zip1 z19.s, z13.s, z9.s\n"
+ "zip2 z18.s, z13.s, z9.s\n"
+ "zip1 z17.s, z12.s, z8.s\n"
+ "zip2 z16.s, z12.s, z8.s\n"
+ ".inst 0x648ab885 // bfcvtnt z5.h, p6/M, z4.s\n"
+ ".inst 0x648ab801 // bfcvtnt z1.h, p6/M, z0.s\n"
+ "st1h { z5.h }, p6, [x20]\n"
+ ".inst 0x648abbdf // bfcvtnt z31.h, p6/M, z30.s\n"
+ ".inst 0x648abb9d // bfcvtnt z29.h, p6/M, z28.s\n"
+ "st1h { z1.h }, p6, [x20, #1, MUL VL]\n"
+ ".inst 0x648abafb // bfcvtnt z27.h, p6/M, z23.s\n"
+ ".inst 0x648abada // bfcvtnt z26.h, p6/M, z22.s\n"
+ "st1h { z31.h }, p6, [x20, #2, MUL VL]\n"
+ ".inst 0x648abab9 // bfcvtnt z25.h, p6/M, z21.s\n"
+ ".inst 0x648aba98 // bfcvtnt z24.h, p6/M, z20.s\n"
+ "st1h { z29.h }, p6, [x20, #3, MUL VL]\n"
+ ".inst 0x658aba77 // bfcvt z23.h, p6/M, z19.s\n"
+ "zip1 z22.s, z7.s, z3.s\n"
+ "st1h { z27.h }, p6, [x20, #4, MUL VL]\n"
+ ".inst 0x658aba55 // bfcvt z21.h, p6/M, z18.s\n"
+ "zip2 z20.s, z7.s, z3.s\n"
+ "st1h { z26.h }, p6, [x20, #5, MUL VL]\n"
+ ".inst 0x658aba33 // bfcvt z19.h, p6/M, z17.s\n"
+ "zip1 z18.s, z6.s, z2.s\n"
+ "st1h { z25.h }, p6, [x20, #6, MUL VL]\n"
+ ".inst 0x658aba11 // bfcvt z17.h, p6/M, z16.s\n"
+ "zip2 z16.s, z6.s, z2.s\n"
+ "st1h { z24.h }, p6, [x20, #7, MUL VL]\n"
"addvl x20, x20, #12\n"
- ".inst 0x648aaa53 // bfcvtnt z19.h, p2/M, z18.s\n"
- ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
- "st1h { z23.h }, p2, [x20, #-4, MUL VL]\n"
- "st1h { z21.h }, p2, [x20, #-3, MUL VL]\n"
- "st1h { z19.h }, p2, [x20, #-2, MUL VL]\n"
- "st1h { z17.h }, p2, [x20, #-1, MUL VL]\n"
+ ".inst 0x648abad7 // bfcvtnt z23.h, p6/M, z22.s\n"
+ ".inst 0x648aba95 // bfcvtnt z21.h, p6/M, z20.s\n"
+ "st1h { z23.h }, p6, [x20, #-4, MUL VL]\n"
+ ".inst 0x648aba53 // bfcvtnt z19.h, p6/M, z18.s\n"
+ ".inst 0x648aba11 // bfcvtnt z17.h, p6/M, z16.s\n"
+ "st1h { z21.h }, p6, [x20, #-3, MUL VL]\n"
+ "st1h { z19.h }, p6, [x20, #-2, MUL VL]\n"
+ "st1h { z17.h }, p6, [x20, #-1, MUL VL]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -354,7 +354,7 @@ void sve_transpose_interleave_12VL_2x4_fp32bf16(bfloat16 *out, const float *in,
"bge 1b\n"
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
index e37879e19b..beddf76c5b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
@@ -40,59 +40,59 @@ void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t widt
"mov x26, %x[in]\n"
"mov x25, %x[width]\n"
"cntw x24, ALL, MUL #2\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x22, x26, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
+ "add x23, x26, %x[in_stride]\n"
+ "add x21, x23, %x[in_stride]\n"
"add x20, x21, %x[in_stride]\n"
"cmp x25, x24\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
"sub x25, x25, x24\n"
"ld1w { z23.s }, p1/Z, [x26]\n"
"ld1w { z22.s }, p1/Z, [x26, #1, MUL VL]\n"
- "addvl x26, x26, #2\n"
- "ld1w { z21.s }, p1/Z, [x22]\n"
- "ld1w { z20.s }, p1/Z, [x22, #1, MUL VL]\n"
"cmp x25, x24\n"
- "addvl x22, x22, #2\n"
+ "ld1w { z21.s }, p1/Z, [x23]\n"
+ "ld1w { z20.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "addvl x23, x23, #2\n"
"ld1w { z19.s }, p1/Z, [x21]\n"
"ld1w { z18.s }, p1/Z, [x21, #1, MUL VL]\n"
"addvl x21, x21, #2\n"
"ld1w { z17.s }, p1/Z, [x20]\n"
"ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
- "st1w { z23.s }, p1, [x23]\n"
+ "st1w { z23.s }, p1, [x22]\n"
"addvl x20, x20, #2\n"
- "st1w { z21.s }, p1, [x23, #1, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z17.s }, p1, [x23, #3, MUL VL]\n"
- "add x23, x23, %x[out_stride]\n"
- "st1w { z22.s }, p1, [x23]\n"
- "st1w { z20.s }, p1, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z16.s }, p1, [x23, #3, MUL VL]\n"
- "add x23, x23, %x[out_stride]\n"
+ "st1w { z21.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z17.s }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z22.s }, p1, [x22]\n"
+ "st1w { z20.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
"cbz x25, 5f\n"
"4:" // Main row loop: Column loop
"whilelt p0.s, XZR, x25\n"
"decw x25\n"
- "cmp x25, #0x0\n"
"ld1w { z19.s }, p0/Z, [x26]\n"
+ "ld1w { z18.s }, p0/Z, [x23]\n"
+ "cmp x25, #0x0\n"
"addvl x26, x26, #1\n"
- "ld1w { z18.s }, p0/Z, [x22]\n"
- "addvl x22, x22, #1\n"
"ld1w { z17.s }, p0/Z, [x21]\n"
- "addvl x21, x21, #1\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
+ "addvl x23, x23, #1\n"
+ "addvl x21, x21, #1\n"
+ "st1w { z19.s }, p1, [x22]\n"
"addvl x20, x20, #1\n"
- "st1w { z19.s }, p1, [x23]\n"
- "st1w { z18.s }, p1, [x23, #1, MUL VL]\n"
- "st1w { z17.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z16.s }, p1, [x23, #3, MUL VL]\n"
- "add x23, x23, %x[out_stride]\n"
+ "st1w { z18.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z17.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x4\n"
@@ -104,32 +104,32 @@ void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t widt
"mov x21, %x[width]\n"
"cntw x20, ALL, MUL #2\n"
"mov x26, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x21, x20\n"
"add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
"sub x21, x21, x20\n"
"ld1w { z17.s }, p1/Z, [x26]\n"
"ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "addvl x26, x26, #2\n"
+ "st1w { z17.s }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
"cmp x21, x20\n"
- "st1w { z17.s }, p1, [x23]\n"
- "add x23, x23, %x[out_stride]\n"
- "st1w { z16.s }, p1, [x23]\n"
- "add x23, x23, %x[out_stride]\n"
+ "st1w { z16.s }, p1, [x22]\n"
+ "addvl x26, x26, #2\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"whilelt p0.s, XZR, x21\n"
"decw x21\n"
- "cmp x21, #0x0\n"
"ld1w { z16.s }, p0/Z, [x26]\n"
+ "st1w { z16.s }, p1, [x22]\n"
+ "cmp x21, #0x0\n"
"addvl x26, x26, #1\n"
- "st1w { z16.s }, p1, [x23]\n"
- "add x23, x23, %x[out_stride]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
index 60ac125bff..1103008fe2 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
@@ -44,135 +44,135 @@ void sve_transpose_interleave_1VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"blt 6f\n"
"1:" // Main row loop: Head
"mov x10, %x[in]\n"
- "mov x9, %x[width]\n"
- "cntb x28, ALL, MUL #2\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x10, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "add x23, x24, %x[in_stride]\n"
- "cmp x9, x28\n"
- "add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "mov x25, %x[width]\n"
+ "cntb x24, ALL, MUL #2\n"
+ "add x23, x26, %x[in_stride]\n"
+ "add x21, x23, %x[in_stride]\n"
"add x20, x21, %x[in_stride]\n"
+ "cmp x25, x24\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1b { z25.b }, p1/Z, [x10]\n"
- "ld1b { z24.b }, p1/Z, [x26]\n"
- "sub x9, x9, x28\n"
- "ld1b { z20.b }, p1/Z, [x25]\n"
- "ld1b { z17.b }, p1/Z, [x24]\n"
- "cmp x9, x28\n"
- "ld1b { z23.b }, p1/Z, [x23]\n"
- "ld1b { z22.b }, p1/Z, [x22]\n"
- "ld1b { z21.b }, p1/Z, [x21]\n"
- "ld1b { z19.b }, p1/Z, [x20]\n"
- "zip1 z18.b, z25.b, z20.b\n"
- "zip1 z16.b, z24.b, z17.b\n"
- "ld1b { z4.b }, p1/Z, [x10, #1, MUL VL]\n"
- "ld1b { z3.b }, p1/Z, [x26, #1, MUL VL]\n"
- "zip2 z2.b, z25.b, z20.b\n"
- "zip2 z1.b, z24.b, z17.b\n"
- "ld1b { z25.b }, p1/Z, [x25, #1, MUL VL]\n"
- "ld1b { z0.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x10]\n"
+ "ld1b { z18.b }, p1/Z, [x9]\n"
+ "sub x25, x25, x24\n"
+ "cmp x25, x24\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z25.b, z20.b, z17.b\n"
+ "zip1 z24.b, z18.b, z16.b\n"
+ "ld1b { z21.b }, p1/Z, [x26]\n"
+ "ld1b { z19.b }, p1/Z, [x23]\n"
+ "zip2 z2.b, z20.b, z17.b\n"
+ "zip2 z1.b, z18.b, z16.b\n"
+ "ld1b { z18.b }, p1/Z, [x21]\n"
+ "ld1b { z17.b }, p1/Z, [x20]\n"
+ "zip1 z20.b, z21.b, z18.b\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "ld1b { z0.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z31.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z30.b, z21.b, z18.b\n"
+ "zip2 z29.b, z19.b, z17.b\n"
+ "ld1b { z23.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z22.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z19.b, z25.b, z24.b\n"
+ "zip1 z18.b, z20.b, z16.b\n"
+ "ld1b { z28.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z27.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip2 z17.b, z25.b, z24.b\n"
+ "zip2 z16.b, z20.b, z16.b\n"
+ "ld1b { z21.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x20, #1, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22]\n"
+ "zip1 z26.b, z0.b, z23.b\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z25.b, z31.b, z22.b\n"
+ "zip1 z24.b, z28.b, z21.b\n"
+ "st1b { z17.b }, p1, [x22]\n"
+ "zip1 z19.b, z27.b, z20.b\n"
+ "zip1 z17.b, z2.b, z1.b\n"
"addvl x10, x10, #2\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z18.b, z30.b, z29.b\n"
+ "zip2 z16.b, z2.b, z1.b\n"
+ "st1b { z17.b }, p1, [x22]\n"
+ "zip2 z17.b, z30.b, z29.b\n"
+ "zip2 z23.b, z0.b, z23.b\n"
+ "addvl x9, x9, #2\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z22.b, z31.b, z22.b\n"
+ "zip2 z21.b, z28.b, z21.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "zip2 z20.b, z27.b, z20.b\n"
+ "zip1 z16.b, z26.b, z25.b\n"
+ "addvl x28, x28, #2\n"
+ "st1b { z17.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z18.b, z24.b, z19.b\n"
+ "zip2 z17.b, z26.b, z25.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "zip2 z16.b, z24.b, z19.b\n"
+ "zip1 z19.b, z23.b, z22.b\n"
+ "addvl x27, x27, #2\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z18.b, z21.b, z20.b\n"
"addvl x26, x26, #2\n"
- "zip1 z20.b, z23.b, z21.b\n"
- "zip1 z17.b, z22.b, z19.b\n"
- "ld1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
- "ld1b { z30.b }, p1/Z, [x22, #1, MUL VL]\n"
- "ld1b { z24.b }, p1/Z, [x21, #1, MUL VL]\n"
- "ld1b { z29.b }, p1/Z, [x20, #1, MUL VL]\n"
- "zip2 z23.b, z23.b, z21.b\n"
- "zip2 z22.b, z22.b, z19.b\n"
- "zip1 z19.b, z18.b, z16.b\n"
- "zip2 z18.b, z18.b, z16.b\n"
- "addvl x25, x25, #2\n"
- "addvl x24, x24, #2\n"
- "zip1 z16.b, z20.b, z17.b\n"
- "zip2 z17.b, z20.b, z17.b\n"
+ "st1b { z17.b }, p1, [x22]\n"
"addvl x23, x23, #2\n"
- "addvl x22, x22, #2\n"
- "zip1 z28.b, z4.b, z25.b\n"
- "zip1 z21.b, z3.b, z0.b\n"
"addvl x21, x21, #2\n"
+ "zip2 z17.b, z23.b, z22.b\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"addvl x20, x20, #2\n"
- "st1b { z19.b }, p1, [x27]\n"
- "zip1 z27.b, z31.b, z24.b\n"
- "zip1 z26.b, z30.b, z29.b\n"
- "st1b { z16.b }, p1, [x27, #1, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 z20.b, z2.b, z1.b\n"
- "zip1 z16.b, z23.b, z22.b\n"
- "st1b { z18.b }, p1, [x27]\n"
- "zip2 z19.b, z2.b, z1.b\n"
- "zip2 z18.b, z23.b, z22.b\n"
- "st1b { z17.b }, p1, [x27, #1, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip2 z25.b, z4.b, z25.b\n"
- "zip2 z17.b, z3.b, z0.b\n"
- "st1b { z20.b }, p1, [x27]\n"
- "zip2 z24.b, z31.b, z24.b\n"
- "zip2 z23.b, z30.b, z29.b\n"
- "st1b { z16.b }, p1, [x27, #1, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 z16.b, z28.b, z21.b\n"
- "zip1 z22.b, z27.b, z26.b\n"
- "st1b { z19.b }, p1, [x27]\n"
- "zip2 z21.b, z28.b, z21.b\n"
- "zip2 z20.b, z27.b, z26.b\n"
- "st1b { z18.b }, p1, [x27, #1, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 z19.b, z25.b, z17.b\n"
- "zip1 z18.b, z24.b, z23.b\n"
- "st1b { z16.b }, p1, [x27]\n"
- "zip2 z17.b, z25.b, z17.b\n"
- "zip2 z16.b, z24.b, z23.b\n"
- "st1b { z22.b }, p1, [x27, #1, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z21.b }, p1, [x27]\n"
- "st1b { z20.b }, p1, [x27, #1, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z19.b }, p1, [x27]\n"
- "st1b { z18.b }, p1, [x27, #1, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z17.b }, p1, [x27]\n"
- "st1b { z16.b }, p1, [x27, #1, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip2 z16.b, z21.b, z20.b\n"
+ "st1b { z19.b }, p1, [x22]\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1b { z17.b }, p1, [x22]\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x9, 5f\n"
+ "cbz x25, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.b, XZR, x9\n"
- "decw x9\n"
- "ld1b { z23.b }, p0/Z, [x10]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1b { z19.b }, p0/Z, [x10]\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "decw x25\n"
+ "ld1b { z17.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z21.b, z19.b, z17.b\n"
+ "zip1 z20.b, z18.b, z16.b\n"
+ "ld1b { z18.b }, p0/Z, [x26]\n"
+ "ld1b { z19.b }, p0/Z, [x23]\n"
+ "cmp x25, #0x0\n"
"incd x10, ALL, MUL #2\n"
- "ld1b { z22.b }, p0/Z, [x26]\n"
+ "ld1b { z17.b }, p0/Z, [x21]\n"
+ "ld1b { z16.b }, p0/Z, [x20]\n"
+ "zip1 z18.b, z18.b, z17.b\n"
+ "zip1 z16.b, z19.b, z16.b\n"
+ "incd x9, ALL, MUL #2\n"
+ "incd x28, ALL, MUL #2\n"
+ "zip1 z17.b, z21.b, z20.b\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "incd x27, ALL, MUL #2\n"
"incd x26, ALL, MUL #2\n"
- "ld1b { z19.b }, p0/Z, [x25]\n"
- "incd x25, ALL, MUL #2\n"
- "ld1b { z17.b }, p0/Z, [x24]\n"
- "incd x24, ALL, MUL #2\n"
- "ld1b { z21.b }, p0/Z, [x23]\n"
- "ld1b { z20.b }, p0/Z, [x22]\n"
- "ld1b { z18.b }, p0/Z, [x21]\n"
- "cmp x9, #0x0\n"
+ "st1b { z17.b }, p1, [x22]\n"
"incd x23, ALL, MUL #2\n"
- "ld1b { z16.b }, p0/Z, [x20]\n"
- "zip1 z19.b, z23.b, z19.b\n"
- "incd x22, ALL, MUL #2\n"
"incd x21, ALL, MUL #2\n"
- "zip1 z17.b, z22.b, z17.b\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
"incd x20, ALL, MUL #2\n"
- "zip1 z18.b, z21.b, z18.b\n"
- "zip1 z16.b, z20.b, z16.b\n"
- "zip1 z17.b, z19.b, z17.b\n"
- "zip1 z16.b, z18.b, z16.b\n"
- "st1b { z17.b }, p1, [x27]\n"
- "st1b { z16.b }, p1, [x27, #1, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x8\n"
@@ -182,88 +182,88 @@ void sve_transpose_interleave_1VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"6:" // Main loop skip
"7:" // Tail row loop: Head
"mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
"mov x21, %x[width]\n"
"cntb x20, ALL, MUL #2\n"
+ "add x27, x28, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x27, %x[out]\n"
- "add x26, x10, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "csel x25, x25, %x[pad_row], GE\n"
+ "add %x[in], x27, %x[in_stride]\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x26, x26, %x[pad_row], GT\n"
+ "csel x9, x9, %x[pad_row], GT\n"
"cmp x21, x20\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
"ld1b { z21.b }, p1/Z, [x10]\n"
- "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z18.b }, p1/Z, [x9]\n"
"sub x21, x21, x20\n"
- "ld1b { z17.b }, p1/Z, [x25]\n"
- "ld1b { z16.b }, p1/Z, [x24]\n"
"cmp x21, x20\n"
- "ld1b { z27.b }, p1/Z, [x10, #1, MUL VL]\n"
- "ld1b { z26.b }, p1/Z, [x26, #1, MUL VL]\n"
- "addvl x10, x10, #2\n"
- "addvl x26, x26, #2\n"
- "ld1b { z25.b }, p1/Z, [x25, #1, MUL VL]\n"
- "ld1b { z24.b }, p1/Z, [x24, #1, MUL VL]\n"
- "addvl x25, x25, #2\n"
- "addvl x24, x24, #2\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
"zip1 z20.b, z21.b, z17.b\n"
- "zip1 z18.b, z19.b, z16.b\n"
- "zip2 z17.b, z21.b, z17.b\n"
- "zip2 z16.b, z19.b, z16.b\n"
- "zip1 z23.b, z27.b, z25.b\n"
- "zip1 z22.b, z26.b, z24.b\n"
- "zip1 z19.b, z20.b, z18.b\n"
- "zip2 z18.b, z20.b, z18.b\n"
- "zip1 z21.b, z17.b, z16.b\n"
- "zip2 z17.b, z17.b, z16.b\n"
- "zip2 z20.b, z27.b, z25.b\n"
- "zip2 z16.b, z26.b, z24.b\n"
- "st1b { z19.b }, p1, [x27]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z18.b }, p1, [x27]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 z19.b, z23.b, z22.b\n"
- "zip2 z18.b, z23.b, z22.b\n"
- "st1b { z21.b }, p1, [x27]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z17.b }, p1, [x27]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 z17.b, z20.b, z16.b\n"
- "zip2 z16.b, z20.b, z16.b\n"
- "st1b { z19.b }, p1, [x27]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z18.b }, p1, [x27]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z17.b }, p1, [x27]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z16.b }, p1, [x27]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip1 z19.b, z18.b, z16.b\n"
+ "ld1b { z24.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z23.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z22.b, z21.b, z17.b\n"
+ "zip2 z21.b, z18.b, z16.b\n"
+ "ld1b { z18.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z17.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z20.b, z24.b, z18.b\n"
+ "zip1 z19.b, z23.b, z17.b\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z16.b, z22.b, z21.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z16.b, z22.b, z21.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z18.b, z24.b, z18.b\n"
+ "zip2 z17.b, z23.b, z17.b\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "addvl x28, x28, #2\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "addvl x27, x27, #2\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"whilelt p0.b, XZR, x21\n"
- "decw x21\n"
"ld1b { z19.b }, p0/Z, [x10]\n"
- "incd x10, ALL, MUL #2\n"
- "ld1b { z18.b }, p0/Z, [x26]\n"
- "incd x26, ALL, MUL #2\n"
- "ld1b { z17.b }, p0/Z, [x25]\n"
- "incd x25, ALL, MUL #2\n"
- "ld1b { z16.b }, p0/Z, [x24]\n"
- "incd x24, ALL, MUL #2\n"
- "cmp x21, #0x0\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "decw x21\n"
+ "ld1b { z17.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
"zip1 z17.b, z19.b, z17.b\n"
"zip1 z16.b, z18.b, z16.b\n"
+ "cmp x21, #0x0\n"
+ "incd x10, ALL, MUL #2\n"
"zip1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [x27]\n"
- "add x27, x27, %x[out_stride]\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "incd x9, ALL, MUL #2\n"
+ "incd x28, ALL, MUL #2\n"
+ "incd x27, ALL, MUL #2\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -272,7 +272,7 @@ void sve_transpose_interleave_1VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"12:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
index e10f818ba2..0e138e4422 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
@@ -34,57 +34,57 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "ptrue p2.b\n"
+ "ptrue p3.b\n"
"blt 4f\n"
"1:" // Main row loop: Head
- "mov x27, %x[in]\n"
- "mov x26, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "mov x25, %x[width]\n"
- "add x24, x27, %x[in_stride]\n"
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add %x[in], x22, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
"2:" // Main row loop: Column loop
- "mov x21, x25\n"
- "mov x20, x26\n"
- "dech x25, ALL, MUL #3\n"
- "add x26, x26, %x[out_stride]\n"
- "whilelt p0.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z27.h }, p0/Z, [x27]\n"
- "ld1h { z26.h }, p0/Z, [x24]\n"
- "ld1h { z25.h }, p0/Z, [x23]\n"
- "ld1h { z24.h }, p0/Z, [x22]\n"
- "whilelt p0.h, XZR, x21\n"
- "cmp x25, #0x0\n"
- "ld1h { z23.h }, p1/Z, [x27, #1, MUL VL]\n"
- "ld1h { z22.h }, p1/Z, [x24, #1, MUL VL]\n"
- "ld1h { z21.h }, p1/Z, [x23, #1, MUL VL]\n"
- "ld1h { z20.h }, p1/Z, [x22, #1, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x27, #2, MUL VL]\n"
- "ld1h { z18.h }, p0/Z, [x24, #2, MUL VL]\n"
- "addvl x27, x27, #3\n"
+ "mov x20, x21\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z27.h }, p2/Z, [x26]\n"
+ "ld1h { z26.h }, p2/Z, [x25]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z25.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z24.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z23.h }, p0/Z, [x26, #2, MUL VL]\n"
+ "ld1h { z22.h }, p0/Z, [x25, #2, MUL VL]\n"
+ "mov x20, x22\n"
+ "dech x21, ALL, MUL #3\n"
+ "ld1h { z21.h }, p2/Z, [x24]\n"
+ "ld1h { z20.h }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #3\n"
+ "ld1h { z17.h }, p1/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x23, #2, MUL VL]\n"
+ "st1h { z27.h }, p3, [x20]\n"
+ "addvl x25, x25, #3\n"
+ "st1h { z25.h }, p3, [x20, #1, MUL VL]\n"
"addvl x24, x24, #3\n"
- "ld1h { z17.h }, p0/Z, [x23, #2, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x22, #2, MUL VL]\n"
- "st1h { z27.h }, p2, [x20]\n"
"addvl x23, x23, #3\n"
- "st1h { z23.h }, p2, [x20, #1, MUL VL]\n"
- "addvl x22, x22, #3\n"
- "st1h { z19.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z26.h }, p2, [x20, #3, MUL VL]\n"
- "st1h { z22.h }, p2, [x20, #4, MUL VL]\n"
- "st1h { z18.h }, p2, [x20, #5, MUL VL]\n"
- "st1h { z25.h }, p2, [x20, #6, MUL VL]\n"
- "st1h { z21.h }, p2, [x20, #7, MUL VL]\n"
+ "st1h { z23.h }, p3, [x20, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z26.h }, p3, [x20, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x20, #4, MUL VL]\n"
+ "st1h { z22.h }, p3, [x20, #5, MUL VL]\n"
+ "st1h { z21.h }, p3, [x20, #6, MUL VL]\n"
+ "st1h { z20.h }, p3, [x20, #7, MUL VL]\n"
"addvl x20, x20, #12\n"
- "st1h { z17.h }, p2, [x20, #-4, MUL VL]\n"
- "st1h { z24.h }, p2, [x20, #-3, MUL VL]\n"
- "st1h { z20.h }, p2, [x20, #-2, MUL VL]\n"
- "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "st1h { z19.h }, p3, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p3, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p3, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p3, [x20, #-1, MUL VL]\n"
"bgt 2b\n"
"3:" // Main row loop: Column loop skip
"cmp %x[height], #0x4\n"
@@ -93,28 +93,28 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt
"cbz %x[height], 8f\n"
"4:" // Main loop skip
"5:" // Tail row loop: Head
- "mov x27, %x[in]\n"
- "mov x26, %x[out]\n"
+ "mov x26, %x[in]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x1\n"
"mov x21, %x[width]\n"
- "add %x[in], x27, %x[in_stride]\n"
"6:" // Tail row loop: Column loop
"mov x20, x21\n"
- "dech x21, ALL, MUL #3\n"
"whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x26]\n"
"dech x20\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x26, #1, MUL VL]\n"
"dech x20\n"
- "ld1h { z18.h }, p0/Z, [x27]\n"
+ "dech x21, ALL, MUL #3\n"
"whilelt p0.h, XZR, x20\n"
"cmp x21, #0x0\n"
- "ld1h { z17.h }, p1/Z, [x27, #1, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x27, #2, MUL VL]\n"
- "addvl x27, x27, #3\n"
- "st1h { z18.h }, p2, [x26]\n"
- "st1h { z17.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z16.h }, p2, [x26, #2, MUL VL]\n"
- "add x26, x26, %x[out_stride]\n"
+ "ld1h { z16.h }, p0/Z, [x26, #2, MUL VL]\n"
+ "st1h { z18.h }, p3, [x22]\n"
+ "addvl x26, x26, #3\n"
+ "st1h { z17.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z16.h }, p3, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 6b\n"
"7:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -123,7 +123,7 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt
"8:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
index e5bfb7bb7c..bc462414be 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
@@ -44,175 +44,175 @@ void sve_transpose_interleave_3VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"blt 6f\n"
"1:" // Main row loop: Head
"mov x10, %x[in]\n"
- "mov x9, %x[width]\n"
- "cntb x28, ALL, MUL #3\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x10, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "add x23, x24, %x[in_stride]\n"
- "cmp x9, x28\n"
- "add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "mov x25, %x[width]\n"
+ "cntb x24, ALL, MUL #3\n"
+ "add x23, x26, %x[in_stride]\n"
+ "add x21, x23, %x[in_stride]\n"
"add x20, x21, %x[in_stride]\n"
+ "cmp x25, x24\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1b { z19.b }, p1/Z, [x10]\n"
- "ld1b { z18.b }, p1/Z, [x26]\n"
- "sub x9, x9, x28\n"
- "ld1b { z17.b }, p1/Z, [x25]\n"
- "ld1b { z16.b }, p1/Z, [x24]\n"
- "cmp x9, x28\n"
- "ld1b { z27.b }, p1/Z, [x23]\n"
- "ld1b { z26.b }, p1/Z, [x22]\n"
- "ld1b { z25.b }, p1/Z, [x21]\n"
- "ld1b { z24.b }, p1/Z, [x20]\n"
- "ld1b { z23.b }, p1/Z, [x10, #1, MUL VL]\n"
- "ld1b { z22.b }, p1/Z, [x26, #1, MUL VL]\n"
- "zip1 z1.b, z19.b, z17.b\n"
- "zip1 z0.b, z18.b, z16.b\n"
- "ld1b { z21.b }, p1/Z, [x25, #1, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [x24, #1, MUL VL]\n"
- "zip2 z15.b, z19.b, z17.b\n"
- "zip2 z14.b, z18.b, z16.b\n"
- "ld1b { z19.b }, p1/Z, [x23, #1, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [x22, #1, MUL VL]\n"
- "zip1 z13.b, z27.b, z25.b\n"
- "zip1 z12.b, z26.b, z24.b\n"
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z20.b }, p1/Z, [x9]\n"
+ "sub x25, x25, x24\n"
+ "cmp x25, x24\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z31.b, z21.b, z17.b\n"
+ "zip1 z22.b, z20.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z18.b }, p1/Z, [x23]\n"
+ "zip2 z14.b, z21.b, z17.b\n"
+ "zip2 z13.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x21]\n"
+ "ld1b { z16.b }, p1/Z, [x20]\n"
+ "zip1 z30.b, z19.b, z17.b\n"
+ "zip1 z29.b, z18.b, z16.b\n"
+ "ld1b { z21.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z12.b, z19.b, z17.b\n"
+ "zip2 z11.b, z18.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z10.b, z21.b, z17.b\n"
+ "zip1 z9.b, z20.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip2 z8.b, z21.b, z17.b\n"
+ "zip2 z7.b, z20.b, z16.b\n"
"ld1b { z17.b }, p1/Z, [x21, #1, MUL VL]\n"
"ld1b { z16.b }, p1/Z, [x20, #1, MUL VL]\n"
- "zip2 z11.b, z27.b, z25.b\n"
- "zip2 z10.b, z26.b, z24.b\n"
- "ld1b { z9.b }, p1/Z, [x10, #2, MUL VL]\n"
- "ld1b { z8.b }, p1/Z, [x26, #2, MUL VL]\n"
- "zip1 z7.b, z23.b, z21.b\n"
- "zip1 z6.b, z22.b, z20.b\n"
- "ld1b { z31.b }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1b { z30.b }, p1/Z, [x24, #2, MUL VL]\n"
- "zip2 z5.b, z23.b, z21.b\n"
- "zip2 z4.b, z22.b, z20.b\n"
- "ld1b { z29.b }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1b { z28.b }, p1/Z, [x22, #2, MUL VL]\n"
- "zip1 z27.b, z19.b, z17.b\n"
- "zip1 z26.b, z18.b, z16.b\n"
- "ld1b { z25.b }, p1/Z, [x21, #2, MUL VL]\n"
- "ld1b { z24.b }, p1/Z, [x20, #2, MUL VL]\n"
- "zip2 z23.b, z19.b, z17.b\n"
- "zip2 z22.b, z18.b, z16.b\n"
- "zip1 z3.b, z9.b, z31.b\n"
- "zip1 z2.b, z8.b, z30.b\n"
+ "zip1 z6.b, z19.b, z17.b\n"
+ "zip1 z5.b, z18.b, z16.b\n"
+ "ld1b { z28.b }, p1/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z27.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "zip2 z4.b, z19.b, z17.b\n"
+ "zip2 z3.b, z18.b, z16.b\n"
+ "ld1b { z26.b }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z25.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "zip1 z2.b, z28.b, z26.b\n"
+ "zip1 z1.b, z27.b, z25.b\n"
+ "ld1b { z24.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1b { z23.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "zip1 z16.b, z31.b, z22.b\n"
+ "zip2 z22.b, z31.b, z22.b\n"
+ "ld1b { z21.b }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x20, #2, MUL VL]\n"
+ "zip1 z0.b, z24.b, z21.b\n"
+ "zip1 z31.b, z23.b, z20.b\n"
+ "zip1 z19.b, z14.b, z13.b\n"
+ "zip1 z18.b, z30.b, z29.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
"addvl x10, x10, #3\n"
+ "zip2 z16.b, z30.b, z29.b\n"
+ "zip1 z17.b, z12.b, z11.b\n"
+ "st1b { z22.b }, p1, [x22, #1, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "st1b { z19.b }, p1, [x22, #2, MUL VL]\n"
+ "zip2 z30.b, z28.b, z26.b\n"
+ "zip2 z29.b, z27.b, z25.b\n"
+ "addvl x28, x28, #3\n"
+ "st1b { z18.b }, p1, [x22, #3, MUL VL]\n"
+ "zip2 z28.b, z24.b, z21.b\n"
+ "zip2 z27.b, z23.b, z20.b\n"
+ "addvl x27, x27, #3\n"
+ "st1b { z16.b }, p1, [x22, #4, MUL VL]\n"
+ "zip2 z21.b, z14.b, z13.b\n"
+ "zip1 z16.b, z10.b, z9.b\n"
"addvl x26, x26, #3\n"
- "zip1 z21.b, z1.b, z0.b\n"
- "zip2 z20.b, z1.b, z0.b\n"
- "addvl x25, x25, #3\n"
- "addvl x24, x24, #3\n"
- "zip1 z1.b, z29.b, z25.b\n"
- "zip1 z0.b, z28.b, z24.b\n"
+ "st1b { z17.b }, p1, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z20.b, z10.b, z9.b\n"
+ "zip2 z19.b, z12.b, z11.b\n"
+ "zip1 z18.b, z6.b, z5.b\n"
+ "zip2 z17.b, z6.b, z5.b\n"
+ "st1b { z21.b }, p1, [x22]\n"
"addvl x23, x23, #3\n"
- "addvl x22, x22, #3\n"
- "zip1 z19.b, z15.b, z14.b\n"
- "zip1 z18.b, z13.b, z12.b\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "zip1 z16.b, z8.b, z7.b\n"
+ "zip2 z26.b, z8.b, z7.b\n"
"addvl x21, x21, #3\n"
+ "st1b { z20.b }, p1, [x22, #2, MUL VL]\n"
+ "zip1 z25.b, z2.b, z1.b\n"
+ "zip1 z24.b, z4.b, z3.b\n"
"addvl x20, x20, #3\n"
- "zip2 z17.b, z13.b, z12.b\n"
- "zip1 z16.b, z11.b, z10.b\n"
- "st1b { z21.b }, p1, [x27]\n"
- "st1b { z20.b }, p1, [x27, #1, MUL VL]\n"
- "zip2 z31.b, z9.b, z31.b\n"
- "zip2 z30.b, z8.b, z30.b\n"
- "st1b { z19.b }, p1, [x27, #2, MUL VL]\n"
- "zip2 z29.b, z29.b, z25.b\n"
- "zip2 z28.b, z28.b, z24.b\n"
- "st1b { z18.b }, p1, [x27, #3, MUL VL]\n"
- "zip2 z21.b, z15.b, z14.b\n"
- "zip1 z20.b, z7.b, z6.b\n"
- "st1b { z17.b }, p1, [x27, #4, MUL VL]\n"
- "zip2 z19.b, z7.b, z6.b\n"
- "zip2 z18.b, z11.b, z10.b\n"
- "st1b { z16.b }, p1, [x27, #5, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 z17.b, z27.b, z26.b\n"
- "zip2 z16.b, z27.b, z26.b\n"
- "st1b { z21.b }, p1, [x27]\n"
- "zip1 z27.b, z5.b, z4.b\n"
- "zip2 z26.b, z5.b, z4.b\n"
- "st1b { z20.b }, p1, [x27, #1, MUL VL]\n"
- "zip1 z25.b, z3.b, z2.b\n"
- "zip1 z24.b, z23.b, z22.b\n"
- "st1b { z19.b }, p1, [x27, #2, MUL VL]\n"
- "zip2 z23.b, z23.b, z22.b\n"
- "zip1 z22.b, z1.b, z0.b\n"
- "st1b { z18.b }, p1, [x27, #3, MUL VL]\n"
- "zip2 z21.b, z3.b, z2.b\n"
- "zip1 z20.b, z31.b, z30.b\n"
- "st1b { z17.b }, p1, [x27, #4, MUL VL]\n"
- "zip2 z19.b, z31.b, z30.b\n"
- "zip2 z18.b, z1.b, z0.b\n"
- "st1b { z16.b }, p1, [x27, #5, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 z17.b, z29.b, z28.b\n"
- "zip2 z16.b, z29.b, z28.b\n"
- "st1b { z27.b }, p1, [x27]\n"
- "st1b { z26.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z25.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z24.b }, p1, [x27, #3, MUL VL]\n"
- "st1b { z23.b }, p1, [x27, #4, MUL VL]\n"
- "st1b { z22.b }, p1, [x27, #5, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z21.b }, p1, [x27]\n"
- "st1b { z20.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z19.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z18.b }, p1, [x27, #3, MUL VL]\n"
- "st1b { z17.b }, p1, [x27, #4, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #5, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "st1b { z19.b }, p1, [x22, #3, MUL VL]\n"
+ "zip2 z23.b, z4.b, z3.b\n"
+ "zip1 z22.b, z0.b, z31.b\n"
+ "st1b { z18.b }, p1, [x22, #4, MUL VL]\n"
+ "zip2 z21.b, z2.b, z1.b\n"
+ "zip1 z20.b, z30.b, z29.b\n"
+ "st1b { z17.b }, p1, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z19.b, z30.b, z29.b\n"
+ "zip2 z18.b, z0.b, z31.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "zip1 z17.b, z28.b, z27.b\n"
+ "zip2 z16.b, z28.b, z27.b\n"
+ "st1b { z26.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z25.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z24.b }, p1, [x22, #3, MUL VL]\n"
+ "st1b { z23.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z22.b }, p1, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1b { z21.b }, p1, [x22]\n"
+ "st1b { z20.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z18.b }, p1, [x22, #3, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x9, 5f\n"
+ "cbz x25, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.b, XZR, x9\n"
- "decw x9, ALL, MUL #3\n"
- "ld1b { z24.b }, p0/Z, [x10]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1b { z19.b }, p0/Z, [x10]\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "decw x25, ALL, MUL #3\n"
+ "ld1b { z17.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z26.b, z19.b, z17.b\n"
+ "zip1 z25.b, z18.b, z16.b\n"
+ "ld1b { z21.b }, p0/Z, [x26]\n"
+ "ld1b { z20.b }, p0/Z, [x23]\n"
+ "zip2 z24.b, z19.b, z17.b\n"
+ "zip2 z19.b, z18.b, z16.b\n"
+ "ld1b { z18.b }, p0/Z, [x21]\n"
+ "ld1b { z16.b }, p0/Z, [x20]\n"
+ "zip1 z23.b, z21.b, z18.b\n"
+ "zip1 z17.b, z20.b, z16.b\n"
+ "zip2 z22.b, z21.b, z18.b\n"
+ "zip2 z16.b, z20.b, z16.b\n"
+ "cmp x25, #0x0\n"
"incd x10, ALL, MUL #6\n"
- "ld1b { z23.b }, p0/Z, [x26]\n"
+ "incd x9, ALL, MUL #6\n"
+ "incd x28, ALL, MUL #6\n"
+ "zip1 z21.b, z26.b, z25.b\n"
+ "zip2 z20.b, z26.b, z25.b\n"
+ "incd x27, ALL, MUL #6\n"
"incd x26, ALL, MUL #6\n"
- "ld1b { z19.b }, p0/Z, [x25]\n"
- "incd x25, ALL, MUL #6\n"
- "ld1b { z18.b }, p0/Z, [x24]\n"
- "incd x24, ALL, MUL #6\n"
- "ld1b { z22.b }, p0/Z, [x23]\n"
- "ld1b { z21.b }, p0/Z, [x22]\n"
- "ld1b { z17.b }, p0/Z, [x21]\n"
- "cmp x9, #0x0\n"
+ "zip1 z19.b, z24.b, z19.b\n"
+ "zip1 z18.b, z23.b, z17.b\n"
"incd x23, ALL, MUL #6\n"
- "ld1b { z16.b }, p0/Z, [x20]\n"
- "zip1 z20.b, z24.b, z19.b\n"
- "zip2 z24.b, z24.b, z19.b\n"
- "incd x22, ALL, MUL #6\n"
- "zip1 z19.b, z23.b, z18.b\n"
- "zip2 z18.b, z23.b, z18.b\n"
"incd x21, ALL, MUL #6\n"
- "incd x20, ALL, MUL #6\n"
- "zip1 z23.b, z22.b, z17.b\n"
- "zip2 z22.b, z22.b, z17.b\n"
- "zip1 z17.b, z21.b, z16.b\n"
- "zip2 z16.b, z21.b, z16.b\n"
- "zip1 z21.b, z20.b, z19.b\n"
- "zip2 z20.b, z20.b, z19.b\n"
- "zip1 z19.b, z24.b, z18.b\n"
- "zip1 z18.b, z23.b, z17.b\n"
"zip2 z17.b, z23.b, z17.b\n"
"zip1 z16.b, z22.b, z16.b\n"
- "st1b { z21.b }, p1, [x27]\n"
- "st1b { z20.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z19.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z18.b }, p1, [x27, #3, MUL VL]\n"
- "st1b { z17.b }, p1, [x27, #4, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #5, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "incd x20, ALL, MUL #6\n"
+ "st1b { z21.b }, p1, [x22]\n"
+ "st1b { z20.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z18.b }, p1, [x22, #3, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x8\n"
@@ -222,106 +222,106 @@ void sve_transpose_interleave_3VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"6:" // Main loop skip
"7:" // Tail row loop: Head
"mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
"mov x21, %x[width]\n"
"cntb x20, ALL, MUL #3\n"
+ "add x27, x28, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x27, %x[out]\n"
- "add x26, x10, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "csel x25, x25, %x[pad_row], GE\n"
+ "add %x[in], x27, %x[in_stride]\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x26, x26, %x[pad_row], GT\n"
+ "csel x9, x9, %x[pad_row], GT\n"
"cmp x21, x20\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1b { z27.b }, p1/Z, [x10]\n"
- "ld1b { z22.b }, p1/Z, [x26]\n"
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z20.b }, p1/Z, [x9]\n"
"sub x21, x21, x20\n"
- "ld1b { z21.b }, p1/Z, [x25]\n"
- "ld1b { z17.b }, p1/Z, [x24]\n"
"cmp x21, x20\n"
- "ld1b { z26.b }, p1/Z, [x10, #1, MUL VL]\n"
- "ld1b { z25.b }, p1/Z, [x26, #1, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [x25, #1, MUL VL]\n"
- "ld1b { z19.b }, p1/Z, [x24, #1, MUL VL]\n"
- "ld1b { z30.b }, p1/Z, [x10, #2, MUL VL]\n"
- "ld1b { z29.b }, p1/Z, [x26, #2, MUL VL]\n"
- "zip1 z18.b, z27.b, z21.b\n"
- "zip1 z16.b, z22.b, z17.b\n"
- "ld1b { z24.b }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1b { z23.b }, p1/Z, [x24, #2, MUL VL]\n"
- "zip2 z28.b, z27.b, z21.b\n"
- "zip2 z17.b, z22.b, z17.b\n"
- "zip1 z22.b, z26.b, z20.b\n"
- "zip1 z21.b, z25.b, z19.b\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z31.b, z21.b, z17.b\n"
+ "zip1 z30.b, z20.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z29.b, z21.b, z17.b\n"
+ "zip2 z28.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z27.b, z19.b, z17.b\n"
+ "zip1 z26.b, z18.b, z16.b\n"
+ "ld1b { z22.b }, p1/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "zip2 z25.b, z19.b, z17.b\n"
+ "zip2 z20.b, z18.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "zip1 z24.b, z22.b, z19.b\n"
+ "zip1 z23.b, z21.b, z18.b\n"
+ "zip1 z16.b, z31.b, z30.b\n"
+ "zip2 z17.b, z31.b, z30.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
"addvl x10, x10, #3\n"
- "addvl x26, x26, #3\n"
- "zip2 z27.b, z26.b, z20.b\n"
- "zip2 z20.b, z25.b, z19.b\n"
- "addvl x25, x25, #3\n"
- "addvl x24, x24, #3\n"
- "zip1 z26.b, z30.b, z24.b\n"
- "zip1 z25.b, z29.b, z23.b\n"
- "zip1 z19.b, z18.b, z16.b\n"
- "zip2 z16.b, z18.b, z16.b\n"
- "zip1 z18.b, z28.b, z17.b\n"
- "zip2 z24.b, z30.b, z24.b\n"
- "zip2 z23.b, z29.b, z23.b\n"
- "zip2 z17.b, z28.b, z17.b\n"
- "st1b { z19.b }, p1, [x27]\n"
- "st1b { z16.b }, p1, [x27, #1, MUL VL]\n"
- "zip1 z16.b, z22.b, z21.b\n"
- "zip2 z22.b, z22.b, z21.b\n"
- "st1b { z18.b }, p1, [x27, #2, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 z21.b, z27.b, z20.b\n"
- "zip2 z20.b, z27.b, z20.b\n"
- "st1b { z17.b }, p1, [x27]\n"
- "zip1 z19.b, z26.b, z25.b\n"
- "zip2 z18.b, z26.b, z25.b\n"
- "st1b { z16.b }, p1, [x27, #1, MUL VL]\n"
- "zip1 z17.b, z24.b, z23.b\n"
- "zip2 z16.b, z24.b, z23.b\n"
- "st1b { z22.b }, p1, [x27, #2, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z21.b }, p1, [x27]\n"
- "st1b { z20.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z19.b }, p1, [x27, #2, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z18.b }, p1, [x27]\n"
- "st1b { z17.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #2, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip1 z16.b, z29.b, z28.b\n"
+ "st1b { z17.b }, p1, [x22, #1, MUL VL]\n"
+ "zip2 z22.b, z22.b, z19.b\n"
+ "addvl x9, x9, #3\n"
+ "st1b { z16.b }, p1, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z21.b, z21.b, z18.b\n"
+ "zip2 z18.b, z29.b, z28.b\n"
+ "zip1 z16.b, z27.b, z26.b\n"
+ "zip2 z17.b, z27.b, z26.b\n"
+ "st1b { z18.b }, p1, [x22]\n"
+ "addvl x28, x28, #3\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "zip1 z16.b, z25.b, z20.b\n"
+ "zip2 z20.b, z25.b, z20.b\n"
+ "addvl x27, x27, #3\n"
+ "st1b { z17.b }, p1, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z19.b, z24.b, z23.b\n"
+ "zip2 z18.b, z24.b, z23.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "zip1 z17.b, z22.b, z21.b\n"
+ "zip2 z16.b, z22.b, z21.b\n"
+ "st1b { z20.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1b { z18.b }, p1, [x22]\n"
+ "st1b { z17.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"whilelt p0.b, XZR, x21\n"
- "decw x21, ALL, MUL #3\n"
"ld1b { z19.b }, p0/Z, [x10]\n"
- "incd x10, ALL, MUL #6\n"
- "ld1b { z18.b }, p0/Z, [x26]\n"
- "incd x26, ALL, MUL #6\n"
- "ld1b { z17.b }, p0/Z, [x25]\n"
- "incd x25, ALL, MUL #6\n"
- "ld1b { z16.b }, p0/Z, [x24]\n"
- "incd x24, ALL, MUL #6\n"
+ "ld1b { z21.b }, p0/Z, [x9]\n"
+ "decw x21, ALL, MUL #3\n"
+ "ld1b { z18.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z20.b, z19.b, z18.b\n"
+ "zip1 z17.b, z21.b, z16.b\n"
+ "zip2 z19.b, z19.b, z18.b\n"
+ "zip2 z16.b, z21.b, z16.b\n"
"cmp x21, #0x0\n"
- "zip1 z20.b, z19.b, z17.b\n"
- "zip2 z19.b, z19.b, z17.b\n"
- "zip1 z17.b, z18.b, z16.b\n"
- "zip2 z16.b, z18.b, z16.b\n"
+ "incd x10, ALL, MUL #6\n"
+ "incd x9, ALL, MUL #6\n"
+ "incd x28, ALL, MUL #6\n"
"zip1 z18.b, z20.b, z17.b\n"
"zip2 z17.b, z20.b, z17.b\n"
+ "incd x27, ALL, MUL #6\n"
"zip1 z16.b, z19.b, z16.b\n"
- "st1b { z18.b }, p1, [x27]\n"
- "st1b { z17.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #2, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "st1b { z18.b }, p1, [x22]\n"
+ "st1b { z17.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -330,7 +330,7 @@ void sve_transpose_interleave_3VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"12:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
index 70eb77ebe5..5cf7139fe4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
@@ -44,104 +44,104 @@ void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t
"blt 6f\n"
"1:" // Main row loop: Head
"mov x12, %x[in]\n"
- "mov x11, %x[width]\n"
- "cnth x10, ALL, MUL #3\n"
- "mov x9, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x28, x12, %x[in_stride]\n"
- "add x27, x28, %x[in_stride]\n"
- "add x26, x27, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "cmp x11, x10\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x27, %x[width]\n"
+ "cnth x26, ALL, MUL #3\n"
+ "add x25, x28, %x[in_stride]\n"
"add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add %x[in], x22, %x[in_stride]\n"
+ "cmp x27, x26\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1h { z18.h }, p2/Z, [x12]\n"
- "ld1h { z29.h }, p2/Z, [x12, #1, MUL VL]\n"
- "mov x21, x9\n"
- "add x9, x9, %x[out_stride]\n"
- "ld1h { z17.h }, p2/Z, [x28]\n"
- "ld1h { z16.h }, p2/Z, [x28, #1, MUL VL]\n"
- "mov x20, x9\n"
- "sub x11, x11, x10\n"
- "ld1h { z21.h }, p2/Z, [x27]\n"
- "ld1h { z28.h }, p2/Z, [x27, #1, MUL VL]\n"
- "cmp x11, x10\n"
- "add x9, x9, %x[out_stride]\n"
- "ld1h { z20.h }, p2/Z, [x26]\n"
- "ld1h { z27.h }, p2/Z, [x26, #1, MUL VL]\n"
- "ld1h { z26.h }, p2/Z, [x25]\n"
- "ld1h { z19.h }, p2/Z, [x24]\n"
- "zip1 z25.h, z18.h, z17.h\n"
- "zip2 z24.h, z18.h, z17.h\n"
- "ld1h { z23.h }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x28, #2, MUL VL]\n"
- "zip1 z22.h, z29.h, z16.h\n"
- "zip2 z8.h, z29.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x26, #2, MUL VL]\n"
- "zip1 z7.h, z21.h, z20.h\n"
- "zip2 z6.h, z21.h, z20.h\n"
- "ld1h { z21.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z5.h }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z4.h, z28.h, z27.h\n"
- "zip1 z3.h, z26.h, z19.h\n"
- "ld1h { z20.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
- "zip2 z1.h, z26.h, z19.h\n"
- "zip1 z0.h, z23.h, z18.h\n"
- "ld1h { z19.h }, p2/Z, [x23]\n"
- "ld1h { z31.h }, p2/Z, [x23, #1, MUL VL]\n"
- "zip2 z30.h, z23.h, z18.h\n"
- "zip2 z29.h, z28.h, z27.h\n"
- "ld1h { z28.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x22]\n"
- "zip1 z27.h, z17.h, z16.h\n"
- "zip2 z26.h, z17.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x22, #2, MUL VL]\n"
- "st1h { z25.h }, p2, [x21]\n"
- "zip1 z25.h, z21.h, z20.h\n"
- "st1h { z24.h }, p2, [x21, #1, MUL VL]\n"
- "zip2 z24.h, z21.h, z20.h\n"
- "zip1 z23.h, z5.h, z2.h\n"
+ "ld1h { z17.h }, p2/Z, [x12]\n"
+ "ld1h { z23.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z16.h }, p2/Z, [x11]\n"
+ "ld1h { z20.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "zip1 z9.h, z17.h, z16.h\n"
+ "zip2 z8.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x10]\n"
+ "ld1h { z22.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z7.h, z23.h, z20.h\n"
+ "mov x20, x22\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z6.h, z17.h, z16.h\n"
+ "zip2 z5.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28]\n"
+ "ld1h { z17.h }, p2/Z, [x25]\n"
+ "zip1 z4.h, z22.h, z21.h\n"
+ "zip1 z3.h, z18.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "zip2 z2.h, z18.h, z17.h\n"
+ "zip2 z1.h, z23.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "zip1 z0.h, z19.h, z16.h\n"
+ "zip2 z31.h, z19.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z30.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "zip2 z29.h, z22.h, z21.h\n"
+ "zip1 z28.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "zip1 z27.h, z20.h, z16.h\n"
+ "zip2 z26.h, z18.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x24]\n"
+ "ld1h { z18.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip2 z25.h, z20.h, z16.h\n"
+ "zip1 z24.h, z30.h, z19.h\n"
+ "ld1h { z23.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x23]\n"
+ "zip1 z22.h, z17.h, z16.h\n"
+ "zip2 z21.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "st1h { z9.h }, p2, [x21]\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
+ "sub x27, x27, x26\n"
+ "cmp x27, x26\n"
+ "zip2 z19.h, z30.h, z19.h\n"
+ "st1h { z7.h }, p2, [x21, #2, MUL VL]\n"
"addvl x12, x12, #3\n"
- "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
- "zip1 z22.h, z19.h, z18.h\n"
- "zip2 z21.h, z19.h, z18.h\n"
+ "addvl x11, x11, #3\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z6.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x10, x10, #3\n"
+ "addvl x9, x9, #3\n"
+ "zip1 z17.h, z23.h, z16.h\n"
+ "st1h { z5.h }, p2, [x21, #4, MUL VL]\n"
"addvl x28, x28, #3\n"
- "st1h { z7.h }, p2, [x21, #3, MUL VL]\n"
- "zip1 z20.h, z31.h, z17.h\n"
- "addvl x27, x27, #3\n"
- "addvl x26, x26, #3\n"
- "st1h { z6.h }, p2, [x21, #4, MUL VL]\n"
"addvl x25, x25, #3\n"
- "addvl x24, x24, #3\n"
- "zip2 z19.h, z5.h, z2.h\n"
+ "zip2 z16.h, z23.h, z16.h\n"
"st1h { z4.h }, p2, [x21, #5, MUL VL]\n"
+ "addvl x24, x24, #3\n"
"addvl x23, x23, #3\n"
- "addvl x22, x22, #3\n"
- "zip2 z18.h, z31.h, z17.h\n"
"st1h { z3.h }, p2, [x21, #6, MUL VL]\n"
- "zip1 z17.h, z28.h, z16.h\n"
- "zip2 z16.h, z28.h, z16.h\n"
- "st1h { z1.h }, p2, [x21, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z2.h }, p2, [x21, #7, MUL VL]\n"
"addvl x21, x21, #12\n"
- "st1h { z25.h }, p2, [x21, #-4, MUL VL]\n"
+ "st1h { z27.h }, p2, [x21, #-4, MUL VL]\n"
"st1h { z22.h }, p2, [x21, #-3, MUL VL]\n"
"st1h { z21.h }, p2, [x21, #-2, MUL VL]\n"
"st1h { z20.h }, p2, [x21, #-1, MUL VL]\n"
- "st1h { z8.h }, p2, [x20]\n"
+ "st1h { z1.h }, p2, [x20]\n"
"st1h { z0.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z30.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z31.h }, p2, [x20, #2, MUL VL]\n"
"st1h { z29.h }, p2, [x20, #3, MUL VL]\n"
- "st1h { z27.h }, p2, [x20, #4, MUL VL]\n"
+ "st1h { z28.h }, p2, [x20, #4, MUL VL]\n"
"st1h { z26.h }, p2, [x20, #5, MUL VL]\n"
- "st1h { z24.h }, p2, [x20, #6, MUL VL]\n"
- "st1h { z23.h }, p2, [x20, #7, MUL VL]\n"
+ "st1h { z25.h }, p2, [x20, #6, MUL VL]\n"
+ "st1h { z24.h }, p2, [x20, #7, MUL VL]\n"
"addvl x20, x20, #12\n"
"st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
"st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
@@ -149,67 +149,67 @@ void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t
"st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x11, 5f\n"
+ "cbz x27, 5f\n"
"4:" // Main row loop: Column loop
- "mov x21, x11\n"
- "mov x20, x9\n"
- "decw x11, ALL, MUL #3\n"
- "add x9, x9, %x[out_stride]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p0.h, XZR, x21\n"
- "cmp x11, #0x0\n"
- "ld1h { z19.h }, p1/Z, [x12]\n"
- "ld1h { z18.h }, p1/Z, [x28]\n"
- "ld1h { z0.h }, p1/Z, [x27]\n"
- "ld1h { z31.h }, p1/Z, [x26]\n"
- "ld1h { z30.h }, p1/Z, [x25]\n"
- "ld1h { z29.h }, p1/Z, [x24]\n"
- "ld1h { z17.h }, p0/Z, [x12, #1, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "mov x20, x27\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z0.h }, p1/Z, [x12]\n"
+ "ld1h { z16.h }, p1/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z21.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z31.h }, p1/Z, [x10]\n"
+ "ld1h { z30.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "decw x27, ALL, MUL #3\n"
+ "ld1h { z18.h }, p1/Z, [x9]\n"
+ "ld1h { z29.h }, p0/Z, [x9, #1, MUL VL]\n"
"addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "ld1h { z28.h }, p1/Z, [x28]\n"
+ "ld1h { z20.h }, p1/Z, [x25]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ld1h { z27.h }, p0/Z, [x28, #1, MUL VL]\n"
"addvl x28, x28, #1\n"
- "ld1h { z22.h }, p0/Z, [x27, #1, MUL VL]\n"
- "ld1h { z21.h }, p0/Z, [x26, #1, MUL VL]\n"
- "addvl x27, x27, #1\n"
- "addvl x26, x26, #1\n"
- "ld1h { z28.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z26.h }, p0/Z, [x25, #1, MUL VL]\n"
"addvl x25, x25, #1\n"
- "ld1h { z27.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z25.h }, p1/Z, [x24]\n"
+ "ld1h { z24.h }, p0/Z, [x24, #1, MUL VL]\n"
"addvl x24, x24, #1\n"
- "ld1h { z26.h }, p1/Z, [x23]\n"
- "ld1h { z25.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "zip1 z17.h, z0.h, z16.h\n"
+ "ld1h { z23.h }, p1/Z, [x23]\n"
+ "ld1h { z22.h }, p0/Z, [x23, #1, MUL VL]\n"
"addvl x23, x23, #1\n"
- "zip1 z20.h, z19.h, z18.h\n"
- "ld1h { z24.h }, p1/Z, [x22]\n"
- "ld1h { z23.h }, p0/Z, [x22, #1, MUL VL]\n"
- "addvl x22, x22, #1\n"
- "zip2 z19.h, z19.h, z18.h\n"
- "zip1 z18.h, z17.h, z16.h\n"
- "zip1 z17.h, z0.h, z31.h\n"
+ "zip2 z16.h, z0.h, z16.h\n"
+ "zip1 z21.h, z21.h, z19.h\n"
+ "zip1 z19.h, z31.h, z18.h\n"
+ "st1h { z17.h }, p2, [x20]\n"
+ "cmp x27, #0x0\n"
+ "zip2 z18.h, z31.h, z18.h\n"
+ "zip1 z17.h, z30.h, z29.h\n"
+ "st1h { z16.h }, p2, [x20, #1, MUL VL]\n"
"incd x12, ALL, MUL #4\n"
+ "zip1 z16.h, z28.h, z20.h\n"
+ "zip2 z20.h, z28.h, z20.h\n"
+ "st1h { z21.h }, p2, [x20, #2, MUL VL]\n"
+ "incd x11, ALL, MUL #4\n"
+ "st1h { z19.h }, p2, [x20, #3, MUL VL]\n"
+ "incd x10, ALL, MUL #4\n"
+ "incd x9, ALL, MUL #4\n"
+ "zip1 z19.h, z27.h, z26.h\n"
+ "st1h { z18.h }, p2, [x20, #4, MUL VL]\n"
"incd x28, ALL, MUL #4\n"
- "zip2 z16.h, z0.h, z31.h\n"
- "zip1 z22.h, z22.h, z21.h\n"
- "st1h { z20.h }, p2, [x20]\n"
- "incd x27, ALL, MUL #4\n"
- "zip1 z21.h, z30.h, z29.h\n"
- "zip2 z20.h, z30.h, z29.h\n"
- "st1h { z19.h }, p2, [x20, #1, MUL VL]\n"
- "incd x26, ALL, MUL #4\n"
- "st1h { z18.h }, p2, [x20, #2, MUL VL]\n"
"incd x25, ALL, MUL #4\n"
+ "zip1 z18.h, z25.h, z23.h\n"
+ "st1h { z17.h }, p2, [x20, #5, MUL VL]\n"
"incd x24, ALL, MUL #4\n"
- "zip1 z19.h, z28.h, z27.h\n"
- "st1h { z17.h }, p2, [x20, #3, MUL VL]\n"
"incd x23, ALL, MUL #4\n"
- "incd x22, ALL, MUL #4\n"
- "zip1 z18.h, z26.h, z24.h\n"
- "st1h { z16.h }, p2, [x20, #4, MUL VL]\n"
- "zip2 z17.h, z26.h, z24.h\n"
- "zip1 z16.h, z25.h, z23.h\n"
- "st1h { z22.h }, p2, [x20, #5, MUL VL]\n"
- "st1h { z21.h }, p2, [x20, #6, MUL VL]\n"
+ "zip2 z17.h, z25.h, z23.h\n"
+ "st1h { z16.h }, p2, [x20, #6, MUL VL]\n"
+ "zip1 z16.h, z24.h, z22.h\n"
+ "add x22, x22, %x[out_stride]\n"
"st1h { z20.h }, p2, [x20, #7, MUL VL]\n"
"addvl x20, x20, #12\n"
"st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
@@ -227,64 +227,64 @@ void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t
"mov x12, %x[in]\n"
"mov x21, %x[width]\n"
"cnth x20, ALL, MUL #3\n"
+ "add x11, x12, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "mov x9, %x[out]\n"
- "sub %x[height], %x[height], #0x2\n"
- "add x28, x12, %x[in_stride]\n"
- "add %x[in], x28, %x[in_stride]\n"
- "csel x28, x28, %x[pad_row], GT\n"
+ "add %x[in], x11, %x[in_stride]\n"
+ "csel x11, x11, %x[pad_row], GT\n"
"cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1h { z19.h }, p2/Z, [x12]\n"
- "ld1h { z23.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x12]\n"
+ "ld1h { z22.h }, p2/Z, [x12, #1, MUL VL]\n"
"sub x21, x21, x20\n"
- "ld1h { z18.h }, p2/Z, [x28]\n"
- "ld1h { z17.h }, p2/Z, [x28, #1, MUL VL]\n"
"cmp x21, x20\n"
- "ld1h { z22.h }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x11]\n"
+ "ld1h { z21.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "zip1 z18.h, z17.h, z16.h\n"
+ "zip2 z17.h, z17.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "zip1 z16.h, z22.h, z21.h\n"
+ "st1h { z18.h }, p2, [x22]\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
"addvl x12, x12, #3\n"
- "addvl x28, x28, #3\n"
- "zip1 z21.h, z19.h, z18.h\n"
- "zip2 z20.h, z19.h, z18.h\n"
- "zip1 z19.h, z23.h, z17.h\n"
- "zip2 z18.h, z23.h, z17.h\n"
- "zip1 z17.h, z22.h, z16.h\n"
- "zip2 z16.h, z22.h, z16.h\n"
- "st1h { z21.h }, p2, [x9]\n"
- "st1h { z20.h }, p2, [x9, #1, MUL VL]\n"
- "st1h { z19.h }, p2, [x9, #2, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
- "st1h { z18.h }, p2, [x9]\n"
- "st1h { z17.h }, p2, [x9, #1, MUL VL]\n"
- "st1h { z16.h }, p2, [x9, #2, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
+ "addvl x11, x11, #3\n"
+ "zip2 z18.h, z22.h, z21.h\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z17.h, z20.h, z19.h\n"
+ "zip2 z16.h, z20.h, z19.h\n"
+ "st1h { z18.h }, p2, [x22]\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"mov x20, x21\n"
- "decw x21, ALL, MUL #3\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x12]\n"
+ "ld1h { z17.h }, p0/Z, [x11]\n"
"dech x20\n"
"whilelt p0.h, XZR, x20\n"
- "cmp x21, #0x0\n"
- "ld1h { z20.h }, p1/Z, [x12]\n"
- "ld1h { z17.h }, p1/Z, [x28]\n"
"ld1h { z19.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "decw x21, ALL, MUL #3\n"
"addvl x12, x12, #1\n"
- "ld1h { z16.h }, p0/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #1\n"
"zip1 z18.h, z20.h, z17.h\n"
"zip2 z17.h, z20.h, z17.h\n"
- "incd x12, ALL, MUL #4\n"
- "incd x28, ALL, MUL #4\n"
+ "addvl x11, x11, #1\n"
+ "cmp x21, #0x0\n"
"zip1 z16.h, z19.h, z16.h\n"
- "st1h { z18.h }, p2, [x9]\n"
- "st1h { z17.h }, p2, [x9, #1, MUL VL]\n"
- "st1h { z16.h }, p2, [x9, #2, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
+ "st1h { z18.h }, p2, [x22]\n"
+ "incd x12, ALL, MUL #4\n"
+ "incd x11, ALL, MUL #4\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -293,7 +293,7 @@ void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t
"12:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
index 539b4946b4..ae2ae8c310 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
@@ -34,67 +34,67 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "ptrue p2.b\n"
+ "ptrue p4.b\n"
"blt 4f\n"
"1:" // Main row loop: Head
- "mov x27, %x[in]\n"
- "mov x26, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "mov x25, %x[width]\n"
- "add x24, x27, %x[in_stride]\n"
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add %x[in], x22, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
"2:" // Main row loop: Column loop
- "mov x21, x25\n"
- "mov x20, x26\n"
- "dech x25, ALL, MUL #4\n"
- "add x26, x26, %x[out_stride]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p0.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z31.h }, p1/Z, [x27]\n"
- "ld1h { z30.h }, p1/Z, [x24]\n"
- "ld1h { z29.h }, p1/Z, [x23]\n"
- "ld1h { z28.h }, p1/Z, [x22]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z27.h }, p0/Z, [x27, #1, MUL VL]\n"
- "ld1h { z26.h }, p0/Z, [x24, #1, MUL VL]\n"
- "ld1h { z25.h }, p0/Z, [x23, #1, MUL VL]\n"
- "ld1h { z24.h }, p0/Z, [x22, #1, MUL VL]\n"
- "whilelt p0.h, XZR, x21\n"
- "cmp x25, #0x0\n"
- "ld1h { z23.h }, p1/Z, [x27, #2, MUL VL]\n"
- "ld1h { z22.h }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1h { z21.h }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1h { z20.h }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x27, #3, MUL VL]\n"
- "ld1h { z18.h }, p0/Z, [x24, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
+ "mov x20, x21\n"
+ "whilelt p3.h, XZR, x20\n"
+ "ld1h { z31.h }, p3/Z, [x26]\n"
+ "ld1h { z30.h }, p3/Z, [x25]\n"
+ "dech x20\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z29.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z28.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z27.h }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x25, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z25.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "ld1h { z24.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "dech x21, ALL, MUL #4\n"
+ "ld1h { z23.h }, p3/Z, [x24]\n"
+ "ld1h { z22.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z21.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z20.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #4\n"
+ "ld1h { z19.h }, p3/Z, [x23]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "addvl x25, x25, #4\n"
"addvl x24, x24, #4\n"
- "ld1h { z17.h }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x22, #3, MUL VL]\n"
- "st1h { z31.h }, p2, [x20]\n"
+ "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "st1h { z31.h }, p4, [x20]\n"
"addvl x23, x23, #4\n"
- "st1h { z27.h }, p2, [x20, #1, MUL VL]\n"
- "addvl x22, x22, #4\n"
- "st1h { z23.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z19.h }, p2, [x20, #3, MUL VL]\n"
- "st1h { z30.h }, p2, [x20, #4, MUL VL]\n"
- "st1h { z26.h }, p2, [x20, #5, MUL VL]\n"
- "st1h { z22.h }, p2, [x20, #6, MUL VL]\n"
- "st1h { z18.h }, p2, [x20, #7, MUL VL]\n"
+ "st1h { z29.h }, p4, [x20, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z27.h }, p4, [x20, #2, MUL VL]\n"
+ "st1h { z25.h }, p4, [x20, #3, MUL VL]\n"
+ "st1h { z30.h }, p4, [x20, #4, MUL VL]\n"
+ "st1h { z28.h }, p4, [x20, #5, MUL VL]\n"
+ "st1h { z26.h }, p4, [x20, #6, MUL VL]\n"
+ "st1h { z24.h }, p4, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
- "st1h { z29.h }, p2, [x20, #-8, MUL VL]\n"
- "st1h { z25.h }, p2, [x20, #-7, MUL VL]\n"
- "st1h { z21.h }, p2, [x20, #-6, MUL VL]\n"
- "st1h { z17.h }, p2, [x20, #-5, MUL VL]\n"
- "st1h { z28.h }, p2, [x20, #-4, MUL VL]\n"
- "st1h { z24.h }, p2, [x20, #-3, MUL VL]\n"
- "st1h { z20.h }, p2, [x20, #-2, MUL VL]\n"
- "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "st1h { z23.h }, p4, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p4, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p4, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p4, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p4, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p4, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p4, [x20, #-1, MUL VL]\n"
"bgt 2b\n"
"3:" // Main row loop: Column loop skip
"cmp %x[height], #0x4\n"
@@ -103,32 +103,32 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
"cbz %x[height], 8f\n"
"4:" // Main loop skip
"5:" // Tail row loop: Head
- "mov x27, %x[in]\n"
- "mov x26, %x[out]\n"
+ "mov x26, %x[in]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x1\n"
"mov x21, %x[width]\n"
- "add %x[in], x27, %x[in_stride]\n"
"6:" // Tail row loop: Column loop
"mov x20, x21\n"
- "dech x21, ALL, MUL #4\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z19.h }, p0/Z, [x26]\n"
"dech x20\n"
"whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x26, #1, MUL VL]\n"
"dech x20\n"
- "ld1h { z19.h }, p1/Z, [x27]\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x26, #2, MUL VL]\n"
"dech x20\n"
- "ld1h { z18.h }, p0/Z, [x27, #1, MUL VL]\n"
+ "dech x21, ALL, MUL #4\n"
"whilelt p0.h, XZR, x20\n"
"cmp x21, #0x0\n"
- "ld1h { z17.h }, p1/Z, [x27, #2, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1h { z19.h }, p2, [x26]\n"
- "st1h { z18.h }, p2, [x26, #1, MUL VL]\n"
- "st1h { z17.h }, p2, [x26, #2, MUL VL]\n"
- "st1h { z16.h }, p2, [x26, #3, MUL VL]\n"
- "add x26, x26, %x[out_stride]\n"
+ "ld1h { z16.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22]\n"
+ "addvl x26, x26, #4\n"
+ "st1h { z18.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 6b\n"
"7:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -137,7 +137,7 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
"8:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
index 5f0b4ea8d6..e87c602b54 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
@@ -44,145 +44,145 @@ void sve_transpose_interleave_4VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"blt 6f\n"
"1:" // Main row loop: Head
"mov x10, %x[in]\n"
- "mov x9, %x[width]\n"
- "cntb x28, ALL, MUL #2\n"
- "mov x27, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x26, x10, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "add x23, x24, %x[in_stride]\n"
- "cmp x9, x28\n"
- "add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "mov x25, %x[width]\n"
+ "cntb x24, ALL, MUL #2\n"
+ "add x23, x26, %x[in_stride]\n"
+ "add x21, x23, %x[in_stride]\n"
"add x20, x21, %x[in_stride]\n"
+ "cmp x25, x24\n"
"add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1b { z19.b }, p1/Z, [x10]\n"
- "ld1b { z18.b }, p1/Z, [x26]\n"
- "sub x9, x9, x28\n"
- "ld1b { z17.b }, p1/Z, [x25]\n"
- "ld1b { z16.b }, p1/Z, [x24]\n"
- "cmp x9, x28\n"
- "ld1b { z24.b }, p1/Z, [x23]\n"
- "ld1b { z23.b }, p1/Z, [x22]\n"
- "ld1b { z22.b }, p1/Z, [x21]\n"
- "ld1b { z21.b }, p1/Z, [x20]\n"
- "ld1b { z29.b }, p1/Z, [x10, #1, MUL VL]\n"
- "ld1b { z28.b }, p1/Z, [x26, #1, MUL VL]\n"
- "zip1 z4.b, z19.b, z17.b\n"
- "zip1 z3.b, z18.b, z16.b\n"
- "ld1b { z27.b }, p1/Z, [x25, #1, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [x24, #1, MUL VL]\n"
- "zip2 z2.b, z19.b, z17.b\n"
- "zip2 z1.b, z18.b, z16.b\n"
- "ld1b { z19.b }, p1/Z, [x23, #1, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [x22, #1, MUL VL]\n"
- "zip1 z26.b, z24.b, z22.b\n"
- "zip1 z25.b, z23.b, z21.b\n"
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z20.b }, p1/Z, [x9]\n"
+ "sub x25, x25, x24\n"
+ "cmp x25, x24\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z4.b, z21.b, z17.b\n"
+ "zip1 z3.b, z20.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z18.b }, p1/Z, [x23]\n"
+ "zip2 z2.b, z21.b, z17.b\n"
+ "zip2 z1.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x21]\n"
+ "ld1b { z16.b }, p1/Z, [x20]\n"
+ "zip1 z0.b, z19.b, z17.b\n"
+ "zip1 z31.b, z18.b, z16.b\n"
+ "ld1b { z24.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z30.b, z19.b, z17.b\n"
+ "zip2 z23.b, z18.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z22.b, z24.b, z17.b\n"
+ "zip1 z21.b, z20.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip2 z29.b, z24.b, z17.b\n"
+ "zip2 z28.b, z20.b, z16.b\n"
"ld1b { z17.b }, p1/Z, [x21, #1, MUL VL]\n"
"ld1b { z16.b }, p1/Z, [x20, #1, MUL VL]\n"
- "zip2 z24.b, z24.b, z22.b\n"
- "zip2 z23.b, z23.b, z21.b\n"
- "zip1 z22.b, z29.b, z27.b\n"
- "zip1 z21.b, z28.b, z20.b\n"
+ "zip1 z27.b, z19.b, z17.b\n"
+ "zip1 z26.b, z18.b, z16.b\n"
+ "zip2 z25.b, z19.b, z17.b\n"
+ "zip2 z24.b, z18.b, z16.b\n"
"addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z16.b, z4.b, z3.b\n"
+ "zip2 z17.b, z4.b, z3.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "addvl x28, x28, #2\n"
+ "zip1 z16.b, z2.b, z1.b\n"
+ "zip2 z20.b, z2.b, z1.b\n"
+ "st1b { z17.b }, p1, [x22, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "zip1 z19.b, z0.b, z31.b\n"
+ "zip2 z18.b, z0.b, z31.b\n"
+ "st1b { z16.b }, p1, [x22, #2, MUL VL]\n"
"addvl x26, x26, #2\n"
- "zip2 z0.b, z29.b, z27.b\n"
- "zip2 z31.b, z28.b, z20.b\n"
- "addvl x25, x25, #2\n"
- "addvl x24, x24, #2\n"
- "zip1 z30.b, z19.b, z17.b\n"
- "zip1 z29.b, z18.b, z16.b\n"
+ "zip1 z17.b, z30.b, z23.b\n"
+ "zip2 z16.b, z30.b, z23.b\n"
+ "st1b { z20.b }, p1, [x22, #3, MUL VL]\n"
"addvl x23, x23, #2\n"
- "addvl x22, x22, #2\n"
- "zip2 z28.b, z19.b, z17.b\n"
- "zip2 z27.b, z18.b, z16.b\n"
+ "st1b { z19.b }, p1, [x22, #4, MUL VL]\n"
"addvl x21, x21, #2\n"
"addvl x20, x20, #2\n"
- "zip1 z20.b, z4.b, z3.b\n"
- "zip2 z19.b, z4.b, z3.b\n"
- "zip1 z18.b, z2.b, z1.b\n"
- "zip2 z17.b, z2.b, z1.b\n"
- "zip1 z16.b, z26.b, z25.b\n"
- "zip2 z26.b, z26.b, z25.b\n"
- "zip1 z25.b, z24.b, z23.b\n"
- "zip2 z24.b, z24.b, z23.b\n"
- "st1b { z20.b }, p1, [x27]\n"
- "st1b { z19.b }, p1, [x27, #1, MUL VL]\n"
"zip1 z23.b, z22.b, z21.b\n"
+ "st1b { z18.b }, p1, [x22, #5, MUL VL]\n"
"zip2 z22.b, z22.b, z21.b\n"
- "st1b { z18.b }, p1, [x27, #2, MUL VL]\n"
- "zip1 z21.b, z0.b, z31.b\n"
- "zip2 z20.b, z0.b, z31.b\n"
- "st1b { z17.b }, p1, [x27, #3, MUL VL]\n"
- "zip1 z19.b, z30.b, z29.b\n"
- "zip2 z18.b, z30.b, z29.b\n"
- "st1b { z16.b }, p1, [x27, #4, MUL VL]\n"
- "zip1 z17.b, z28.b, z27.b\n"
- "zip2 z16.b, z28.b, z27.b\n"
- "st1b { z26.b }, p1, [x27, #5, MUL VL]\n"
- "st1b { z25.b }, p1, [x27, #6, MUL VL]\n"
- "st1b { z24.b }, p1, [x27, #7, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z23.b }, p1, [x27]\n"
- "st1b { z22.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z21.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z20.b }, p1, [x27, #3, MUL VL]\n"
- "st1b { z19.b }, p1, [x27, #4, MUL VL]\n"
- "st1b { z18.b }, p1, [x27, #5, MUL VL]\n"
- "st1b { z17.b }, p1, [x27, #6, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #7, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip1 z21.b, z29.b, z28.b\n"
+ "st1b { z17.b }, p1, [x22, #6, MUL VL]\n"
+ "zip2 z20.b, z29.b, z28.b\n"
+ "zip1 z19.b, z27.b, z26.b\n"
+ "st1b { z16.b }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z18.b, z27.b, z26.b\n"
+ "zip1 z17.b, z25.b, z24.b\n"
+ "zip2 z16.b, z25.b, z24.b\n"
+ "st1b { z23.b }, p1, [x22]\n"
+ "st1b { z22.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z21.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z20.b }, p1, [x22, #3, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z18.b }, p1, [x22, #5, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #6, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x9, 5f\n"
+ "cbz x25, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.b, XZR, x9\n"
- "decw x9, ALL, MUL #4\n"
- "ld1b { z20.b }, p0/Z, [x10]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1b { z19.b }, p0/Z, [x10]\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "decw x25, ALL, MUL #4\n"
+ "ld1b { z17.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z27.b, z19.b, z17.b\n"
+ "zip1 z26.b, z18.b, z16.b\n"
+ "ld1b { z22.b }, p0/Z, [x26]\n"
+ "ld1b { z21.b }, p0/Z, [x23]\n"
+ "zip2 z25.b, z19.b, z17.b\n"
+ "zip2 z20.b, z18.b, z16.b\n"
+ "ld1b { z19.b }, p0/Z, [x21]\n"
+ "ld1b { z16.b }, p0/Z, [x20]\n"
+ "zip1 z18.b, z22.b, z19.b\n"
+ "zip1 z17.b, z21.b, z16.b\n"
+ "zip2 z24.b, z22.b, z19.b\n"
+ "zip2 z16.b, z21.b, z16.b\n"
+ "cmp x25, #0x0\n"
"addvl x10, x10, #1\n"
- "ld1b { z24.b }, p0/Z, [x26]\n"
+ "addvl x9, x9, #1\n"
+ "addvl x28, x28, #1\n"
+ "zip1 z23.b, z27.b, z26.b\n"
+ "zip2 z22.b, z27.b, z26.b\n"
+ "addvl x27, x27, #1\n"
"addvl x26, x26, #1\n"
- "ld1b { z19.b }, p0/Z, [x25]\n"
- "addvl x25, x25, #1\n"
- "ld1b { z18.b }, p0/Z, [x24]\n"
- "addvl x24, x24, #1\n"
- "ld1b { z23.b }, p0/Z, [x23]\n"
- "ld1b { z22.b }, p0/Z, [x22]\n"
- "ld1b { z17.b }, p0/Z, [x21]\n"
- "cmp x9, #0x0\n"
+ "zip1 z21.b, z25.b, z20.b\n"
+ "zip2 z20.b, z25.b, z20.b\n"
"addvl x23, x23, #1\n"
- "ld1b { z16.b }, p0/Z, [x20]\n"
- "zip1 z21.b, z20.b, z19.b\n"
- "zip2 z25.b, z20.b, z19.b\n"
- "addvl x22, x22, #1\n"
- "zip1 z20.b, z24.b, z18.b\n"
- "zip2 z19.b, z24.b, z18.b\n"
"addvl x21, x21, #1\n"
- "addvl x20, x20, #1\n"
- "zip1 z18.b, z23.b, z17.b\n"
- "zip2 z24.b, z23.b, z17.b\n"
- "zip1 z17.b, z22.b, z16.b\n"
- "zip2 z16.b, z22.b, z16.b\n"
- "zip1 z23.b, z21.b, z20.b\n"
- "zip2 z22.b, z21.b, z20.b\n"
- "zip1 z21.b, z25.b, z19.b\n"
- "zip2 z20.b, z25.b, z19.b\n"
"zip1 z19.b, z18.b, z17.b\n"
"zip2 z18.b, z18.b, z17.b\n"
+ "addvl x20, x20, #1\n"
"zip1 z17.b, z24.b, z16.b\n"
"zip2 z16.b, z24.b, z16.b\n"
- "st1b { z23.b }, p1, [x27]\n"
- "st1b { z22.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z21.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z20.b }, p1, [x27, #3, MUL VL]\n"
- "st1b { z19.b }, p1, [x27, #4, MUL VL]\n"
- "st1b { z18.b }, p1, [x27, #5, MUL VL]\n"
- "st1b { z17.b }, p1, [x27, #6, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #7, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "st1b { z23.b }, p1, [x22]\n"
+ "st1b { z22.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z21.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z20.b }, p1, [x22, #3, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z18.b }, p1, [x22, #5, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #6, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x8\n"
@@ -192,90 +192,90 @@ void sve_transpose_interleave_4VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"6:" // Main loop skip
"7:" // Tail row loop: Head
"mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
"mov x21, %x[width]\n"
"cntb x20, ALL, MUL #2\n"
+ "add x27, x28, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x27, %x[out]\n"
- "add x26, x10, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "add x24, x25, %x[in_stride]\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "csel x25, x25, %x[pad_row], GE\n"
+ "add %x[in], x27, %x[in_stride]\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x26, x26, %x[pad_row], GT\n"
+ "csel x9, x9, %x[pad_row], GT\n"
"cmp x21, x20\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1b { z20.b }, p1/Z, [x10]\n"
- "ld1b { z24.b }, p1/Z, [x26]\n"
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z19.b }, p1/Z, [x9]\n"
"sub x21, x21, x20\n"
- "ld1b { z19.b }, p1/Z, [x25]\n"
- "ld1b { z18.b }, p1/Z, [x24]\n"
"cmp x21, x20\n"
- "ld1b { z23.b }, p1/Z, [x10, #1, MUL VL]\n"
- "ld1b { z25.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z26.b, z21.b, z17.b\n"
+ "zip1 z25.b, z19.b, z16.b\n"
+ "ld1b { z20.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z24.b, z21.b, z17.b\n"
+ "zip2 z19.b, z19.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z23.b, z20.b, z17.b\n"
+ "zip1 z22.b, z18.b, z16.b\n"
+ "zip2 z21.b, z20.b, z17.b\n"
+ "zip2 z20.b, z18.b, z16.b\n"
"addvl x10, x10, #2\n"
- "addvl x26, x26, #2\n"
- "ld1b { z22.b }, p1/Z, [x25, #1, MUL VL]\n"
- "ld1b { z17.b }, p1/Z, [x24, #1, MUL VL]\n"
- "addvl x25, x25, #2\n"
- "addvl x24, x24, #2\n"
- "zip1 z21.b, z20.b, z19.b\n"
- "zip1 z16.b, z24.b, z18.b\n"
- "zip2 z20.b, z20.b, z19.b\n"
- "zip2 z19.b, z24.b, z18.b\n"
- "zip1 z24.b, z23.b, z22.b\n"
- "zip1 z18.b, z25.b, z17.b\n"
- "zip2 z23.b, z23.b, z22.b\n"
- "zip2 z22.b, z25.b, z17.b\n"
- "zip1 z17.b, z21.b, z16.b\n"
- "zip2 z16.b, z21.b, z16.b\n"
- "zip1 z21.b, z20.b, z19.b\n"
- "zip2 z20.b, z20.b, z19.b\n"
- "zip1 z19.b, z24.b, z18.b\n"
- "zip2 z18.b, z24.b, z18.b\n"
- "st1b { z17.b }, p1, [x27]\n"
- "st1b { z16.b }, p1, [x27, #1, MUL VL]\n"
- "zip1 z17.b, z23.b, z22.b\n"
- "zip2 z16.b, z23.b, z22.b\n"
- "st1b { z21.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z20.b }, p1, [x27, #3, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z19.b }, p1, [x27]\n"
- "st1b { z18.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z17.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #3, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z16.b, z26.b, z25.b\n"
+ "zip2 z18.b, z26.b, z25.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "addvl x28, x28, #2\n"
+ "zip1 z17.b, z24.b, z19.b\n"
+ "zip2 z16.b, z24.b, z19.b\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "st1b { z17.b }, p1, [x22, #2, MUL VL]\n"
+ "zip1 z19.b, z23.b, z22.b\n"
+ "zip2 z18.b, z23.b, z22.b\n"
+ "st1b { z16.b }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z17.b, z21.b, z20.b\n"
+ "zip2 z16.b, z21.b, z20.b\n"
+ "st1b { z19.b }, p1, [x22]\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"whilelt p0.b, XZR, x21\n"
- "decw x21, ALL, MUL #4\n"
"ld1b { z20.b }, p0/Z, [x10]\n"
- "addvl x10, x10, #1\n"
- "ld1b { z19.b }, p0/Z, [x26]\n"
- "addvl x26, x26, #1\n"
- "ld1b { z17.b }, p0/Z, [x25]\n"
- "addvl x25, x25, #1\n"
- "ld1b { z16.b }, p0/Z, [x24]\n"
- "addvl x24, x24, #1\n"
+ "ld1b { z21.b }, p0/Z, [x9]\n"
+ "decw x21, ALL, MUL #4\n"
+ "ld1b { z19.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z18.b, z20.b, z19.b\n"
+ "zip1 z17.b, z21.b, z16.b\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "zip2 z16.b, z21.b, z16.b\n"
"cmp x21, #0x0\n"
- "zip1 z18.b, z20.b, z17.b\n"
- "zip2 z20.b, z20.b, z17.b\n"
- "zip1 z17.b, z19.b, z16.b\n"
- "zip2 z16.b, z19.b, z16.b\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "addvl x28, x28, #1\n"
"zip1 z19.b, z18.b, z17.b\n"
"zip2 z18.b, z18.b, z17.b\n"
+ "addvl x27, x27, #1\n"
"zip1 z17.b, z20.b, z16.b\n"
"zip2 z16.b, z20.b, z16.b\n"
- "st1b { z19.b }, p1, [x27]\n"
- "st1b { z18.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z17.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #3, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "st1b { z19.b }, p1, [x22]\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
index 56d1c9accc..b753c22b67 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
@@ -44,126 +44,126 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t
"blt 6f\n"
"1:" // Main row loop: Head
"mov x12, %x[in]\n"
- "mov x11, %x[width]\n"
- "cnth x10, ALL, MUL #4\n"
- "mov x9, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x28, x12, %x[in_stride]\n"
- "add x27, x28, %x[in_stride]\n"
- "add x26, x27, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "cmp x11, x10\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x27, %x[width]\n"
+ "cnth x26, ALL, MUL #4\n"
+ "add x25, x28, %x[in_stride]\n"
"add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add %x[in], x22, %x[in_stride]\n"
+ "cmp x27, x26\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1h { z22.h }, p2/Z, [x12]\n"
- "ld1h { z28.h }, p2/Z, [x12, #1, MUL VL]\n"
- "mov x21, x9\n"
- "add x9, x9, %x[out_stride]\n"
- "ld1h { z26.h }, p2/Z, [x28]\n"
- "ld1h { z17.h }, p2/Z, [x28, #1, MUL VL]\n"
- "mov x20, x9\n"
- "sub x11, x11, x10\n"
- "ld1h { z13.h }, p2/Z, [x27]\n"
- "ld1h { z8.h }, p2/Z, [x27, #1, MUL VL]\n"
- "cmp x11, x10\n"
- "add x9, x9, %x[out_stride]\n"
- "ld1h { z30.h }, p2/Z, [x26]\n"
- "ld1h { z31.h }, p2/Z, [x26, #1, MUL VL]\n"
- "ld1h { z15.h }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1h { z0.h }, p2/Z, [x12, #3, MUL VL]\n"
- "zip1 z27.h, z22.h, z26.h\n"
- "zip2 z26.h, z22.h, z26.h\n"
- "ld1h { z4.h }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x28, #3, MUL VL]\n"
- "zip1 z24.h, z28.h, z17.h\n"
- "zip2 z19.h, z28.h, z17.h\n"
- "ld1h { z25.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z21.h }, p2/Z, [x27, #3, MUL VL]\n"
- "zip1 z11.h, z13.h, z30.h\n"
- "zip2 z20.h, z13.h, z30.h\n"
- "ld1h { z18.h }, p2/Z, [x26, #2, MUL VL]\n"
- "ld1h { z1.h }, p2/Z, [x26, #3, MUL VL]\n"
- "zip1 z12.h, z8.h, z31.h\n"
- "zip2 z14.h, z8.h, z31.h\n"
- "ld1h { z23.h }, p2/Z, [x25]\n"
- "ld1h { z28.h }, p2/Z, [x25, #1, MUL VL]\n"
- "zip1 z13.h, z15.h, z4.h\n"
- "zip2 z15.h, z15.h, z4.h\n"
- "ld1h { z2.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z9.h }, p2/Z, [x25, #3, MUL VL]\n"
- "zip1 z8.h, z0.h, z16.h\n"
- "zip2 z10.h, z0.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x12]\n"
+ "ld1h { z20.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "ld1h { z16.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "zip1 z25.h, z18.h, z17.h\n"
+ "zip2 z24.h, z18.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x10]\n"
+ "ld1h { z18.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z23.h, z20.h, z16.h\n"
+ "zip2 z15.h, z20.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9]\n"
+ "ld1h { z16.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z14.h, z19.h, z17.h\n"
+ "zip2 z13.h, z19.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "zip1 z12.h, z18.h, z16.h\n"
+ "zip2 z11.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "zip1 z10.h, z17.h, z16.h\n"
+ "ld1h { z21.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z20.h }, p2/Z, [x10, #3, MUL VL]\n"
+ "zip2 z9.h, z17.h, z16.h\n"
+ "zip1 z8.h, z19.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "zip2 z7.h, z19.h, z18.h\n"
+ "zip1 z6.h, z21.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x28]\n"
+ "ld1h { z18.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "zip2 z5.h, z21.h, z17.h\n"
+ "zip1 z4.h, z20.h, z16.h\n"
+ "ld1h { z22.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x28, #3, MUL VL]\n"
+ "zip2 z2.h, z20.h, z16.h\n"
+ "sub x27, x27, x26\n"
+ "ld1h { z17.h }, p2/Z, [x25]\n"
+ "ld1h { z16.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z1.h, z19.h, z17.h\n"
+ "zip2 z0.h, z19.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z20.h }, p2/Z, [x25, #3, MUL VL]\n"
+ "zip1 z31.h, z18.h, z16.h\n"
+ "zip2 z30.h, z18.h, z16.h\n"
"ld1h { z17.h }, p2/Z, [x24]\n"
- "ld1h { z16.h }, p2/Z, [x24, #1, MUL VL]\n"
- "zip1 z6.h, z25.h, z18.h\n"
- "zip2 z5.h, z25.h, z18.h\n"
- "ld1h { z4.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z7.h }, p2/Z, [x24, #3, MUL VL]\n"
- "zip1 z30.h, z21.h, z1.h\n"
- "zip2 z3.h, z21.h, z1.h\n"
- "ld1h { z22.h }, p2/Z, [x23]\n"
- "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "cmp x27, x26\n"
"addvl x12, x12, #4\n"
+ "ld1h { z29.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z28.h }, p2/Z, [x24, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "addvl x10, x10, #4\n"
+ "ld1h { z16.h }, p2/Z, [x23]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z27.h, z17.h, z16.h\n"
+ "zip2 z26.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x23, #3, MUL VL]\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "zip1 z25.h, z19.h, z18.h\n"
+ "st1h { z24.h }, p2, [x21, #1, MUL VL]\n"
+ "zip2 z24.h, z19.h, z18.h\n"
+ "addvl x9, x9, #4\n"
"addvl x28, x28, #4\n"
- "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z31.h }, p2/Z, [x23, #3, MUL VL]\n"
- "zip1 z1.h, z23.h, z17.h\n"
- "zip2 z0.h, z23.h, z17.h\n"
- "ld1h { z25.h }, p2/Z, [x22]\n"
- "ld1h { z23.h }, p2/Z, [x22, #1, MUL VL]\n"
- "zip1 z29.h, z28.h, z16.h\n"
- "zip2 z28.h, z28.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x22, #3, MUL VL]\n"
- "st1h { z27.h }, p2, [x21]\n"
- "addvl x27, x27, #4\n"
- "st1h { z26.h }, p2, [x21, #1, MUL VL]\n"
- "addvl x26, x26, #4\n"
+ "st1h { z23.h }, p2, [x21, #2, MUL VL]\n"
"addvl x25, x25, #4\n"
- "zip1 z27.h, z2.h, z4.h\n"
- "st1h { z24.h }, p2, [x21, #2, MUL VL]\n"
- "zip1 z26.h, z22.h, z25.h\n"
- "zip2 z25.h, z22.h, z25.h\n"
"addvl x24, x24, #4\n"
- "st1h { z19.h }, p2, [x21, #3, MUL VL]\n"
- "zip1 z24.h, z21.h, z23.h\n"
- "zip2 z23.h, z21.h, z23.h\n"
+ "zip1 z23.h, z22.h, z21.h\n"
+ "st1h { z15.h }, p2, [x21, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
- "st1h { z11.h }, p2, [x21, #4, MUL VL]\n"
- "addvl x22, x22, #4\n"
- "zip2 z22.h, z2.h, z4.h\n"
- "zip1 z21.h, z9.h, z7.h\n"
- "st1h { z20.h }, p2, [x21, #5, MUL VL]\n"
- "zip2 z20.h, z9.h, z7.h\n"
- "zip1 z19.h, z18.h, z17.h\n"
+ "zip2 z22.h, z22.h, z21.h\n"
+ "zip1 z21.h, z3.h, z20.h\n"
+ "st1h { z14.h }, p2, [x21, #4, MUL VL]\n"
+ "zip2 z20.h, z3.h, z20.h\n"
+ "zip1 z19.h, z29.h, z17.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z13.h }, p2, [x21, #5, MUL VL]\n"
+ "zip2 z18.h, z29.h, z17.h\n"
+ "zip1 z17.h, z28.h, z16.h\n"
"st1h { z12.h }, p2, [x21, #6, MUL VL]\n"
- "zip2 z18.h, z18.h, z17.h\n"
- "zip1 z17.h, z31.h, z16.h\n"
- "st1h { z14.h }, p2, [x21, #7, MUL VL]\n"
+ "zip2 z16.h, z28.h, z16.h\n"
+ "st1h { z11.h }, p2, [x21, #7, MUL VL]\n"
"addvl x21, x21, #16\n"
- "zip2 z16.h, z31.h, z16.h\n"
"st1h { z1.h }, p2, [x21, #-8, MUL VL]\n"
"st1h { z0.h }, p2, [x21, #-7, MUL VL]\n"
- "st1h { z29.h }, p2, [x21, #-6, MUL VL]\n"
- "st1h { z28.h }, p2, [x21, #-5, MUL VL]\n"
- "st1h { z26.h }, p2, [x21, #-4, MUL VL]\n"
- "st1h { z25.h }, p2, [x21, #-3, MUL VL]\n"
- "st1h { z24.h }, p2, [x21, #-2, MUL VL]\n"
- "st1h { z23.h }, p2, [x21, #-1, MUL VL]\n"
- "st1h { z13.h }, p2, [x20]\n"
- "st1h { z15.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x21, #-6, MUL VL]\n"
+ "st1h { z30.h }, p2, [x21, #-5, MUL VL]\n"
+ "st1h { z27.h }, p2, [x21, #-4, MUL VL]\n"
+ "st1h { z26.h }, p2, [x21, #-3, MUL VL]\n"
+ "st1h { z25.h }, p2, [x21, #-2, MUL VL]\n"
+ "st1h { z24.h }, p2, [x21, #-1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x20]\n"
+ "st1h { z9.h }, p2, [x20, #1, MUL VL]\n"
"st1h { z8.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z10.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z7.h }, p2, [x20, #3, MUL VL]\n"
"st1h { z6.h }, p2, [x20, #4, MUL VL]\n"
"st1h { z5.h }, p2, [x20, #5, MUL VL]\n"
- "st1h { z30.h }, p2, [x20, #6, MUL VL]\n"
- "st1h { z3.h }, p2, [x20, #7, MUL VL]\n"
+ "st1h { z4.h }, p2, [x20, #6, MUL VL]\n"
+ "st1h { z2.h }, p2, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
- "st1h { z27.h }, p2, [x20, #-8, MUL VL]\n"
+ "st1h { z23.h }, p2, [x20, #-8, MUL VL]\n"
"st1h { z22.h }, p2, [x20, #-7, MUL VL]\n"
"st1h { z21.h }, p2, [x20, #-6, MUL VL]\n"
"st1h { z20.h }, p2, [x20, #-5, MUL VL]\n"
@@ -173,63 +173,63 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t
"st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x11, 5f\n"
+ "cbz x27, 5f\n"
"4:" // Main row loop: Column loop
- "mov x21, x11\n"
- "mov x20, x9\n"
- "decw x11, ALL, MUL #4\n"
- "add x9, x9, %x[out_stride]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p0.h, XZR, x21\n"
- "cmp x11, #0x0\n"
- "ld1h { z21.h }, p1/Z, [x12]\n"
- "ld1h { z20.h }, p1/Z, [x28]\n"
- "ld1h { z19.h }, p1/Z, [x27]\n"
- "ld1h { z18.h }, p1/Z, [x26]\n"
- "ld1h { z1.h }, p1/Z, [x25]\n"
- "ld1h { z0.h }, p1/Z, [x24]\n"
- "ld1h { z17.h }, p0/Z, [x12, #1, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x28, #1, MUL VL]\n"
- "zip1 z23.h, z21.h, z20.h\n"
- "zip2 z31.h, z21.h, z20.h\n"
- "ld1h { z22.h }, p0/Z, [x27, #1, MUL VL]\n"
- "ld1h { z21.h }, p0/Z, [x26, #1, MUL VL]\n"
- "zip1 z30.h, z19.h, z18.h\n"
- "zip2 z29.h, z19.h, z18.h\n"
- "ld1h { z28.h }, p0/Z, [x25, #1, MUL VL]\n"
- "ld1h { z20.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "mov x20, x27\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z22.h }, p1/Z, [x12]\n"
+ "ld1h { z21.h }, p1/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x10]\n"
+ "ld1h { z24.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "decw x27, ALL, MUL #4\n"
+ "ld1h { z17.h }, p1/Z, [x9]\n"
+ "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z31.h, z22.h, z21.h\n"
+ "zip2 z23.h, z22.h, z21.h\n"
+ "ld1h { z30.h }, p1/Z, [x28]\n"
+ "ld1h { z29.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "zip1 z22.h, z20.h, z19.h\n"
+ "zip2 z28.h, z20.h, z19.h\n"
+ "ld1h { z21.h }, p1/Z, [x25]\n"
+ "ld1h { z27.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "zip2 z19.h, z18.h, z17.h\n"
+ "ld1h { z18.h }, p1/Z, [x24]\n"
+ "ld1h { z26.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "zip1 z25.h, z24.h, z16.h\n"
+ "zip2 z24.h, z24.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x23]\n"
+ "ld1h { z16.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x20]\n"
+ "cmp x27, #0x0\n"
+ "st1h { z23.h }, p2, [x20, #1, MUL VL]\n"
"addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "zip1 z23.h, z30.h, z21.h\n"
+ "st1h { z22.h }, p2, [x20, #2, MUL VL]\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "zip2 z22.h, z30.h, z21.h\n"
+ "st1h { z28.h }, p2, [x20, #3, MUL VL]\n"
"addvl x28, x28, #2\n"
- "ld1h { z27.h }, p1/Z, [x23]\n"
- "ld1h { z26.h }, p0/Z, [x23, #1, MUL VL]\n"
- "zip1 z19.h, z17.h, z16.h\n"
- "zip2 z18.h, z17.h, z16.h\n"
- "ld1h { z17.h }, p1/Z, [x22]\n"
- "ld1h { z16.h }, p0/Z, [x22, #1, MUL VL]\n"
- "zip1 z25.h, z22.h, z21.h\n"
- "zip2 z24.h, z22.h, z21.h\n"
- "st1h { z23.h }, p2, [x20]\n"
- "addvl x27, x27, #2\n"
- "addvl x26, x26, #2\n"
- "zip1 z23.h, z1.h, z0.h\n"
- "st1h { z31.h }, p2, [x20, #1, MUL VL]\n"
"addvl x25, x25, #2\n"
+ "zip1 z21.h, z29.h, z27.h\n"
+ "st1h { z20.h }, p2, [x20, #4, MUL VL]\n"
"addvl x24, x24, #2\n"
- "zip2 z22.h, z1.h, z0.h\n"
- "st1h { z19.h }, p2, [x20, #2, MUL VL]\n"
"addvl x23, x23, #2\n"
- "addvl x22, x22, #2\n"
- "zip1 z21.h, z28.h, z20.h\n"
- "st1h { z18.h }, p2, [x20, #3, MUL VL]\n"
- "zip2 z20.h, z28.h, z20.h\n"
- "zip1 z19.h, z27.h, z17.h\n"
- "st1h { z30.h }, p2, [x20, #4, MUL VL]\n"
- "zip2 z18.h, z27.h, z17.h\n"
+ "zip2 z20.h, z29.h, z27.h\n"
+ "st1h { z19.h }, p2, [x20, #5, MUL VL]\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z25.h }, p2, [x20, #6, MUL VL]\n"
"zip1 z17.h, z26.h, z16.h\n"
- "st1h { z29.h }, p2, [x20, #5, MUL VL]\n"
"zip2 z16.h, z26.h, z16.h\n"
- "st1h { z25.h }, p2, [x20, #6, MUL VL]\n"
"st1h { z24.h }, p2, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
"st1h { z23.h }, p2, [x20, #-8, MUL VL]\n"
@@ -251,70 +251,70 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t
"mov x12, %x[in]\n"
"mov x21, %x[width]\n"
"cnth x20, ALL, MUL #4\n"
+ "add x11, x12, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "mov x9, %x[out]\n"
- "sub %x[height], %x[height], #0x2\n"
- "add x28, x12, %x[in_stride]\n"
- "add %x[in], x28, %x[in_stride]\n"
- "csel x28, x28, %x[pad_row], GT\n"
+ "add %x[in], x11, %x[in_stride]\n"
+ "csel x11, x11, %x[pad_row], GT\n"
"cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1h { z21.h }, p2/Z, [x12]\n"
+ "ld1h { z18.h }, p2/Z, [x12]\n"
"ld1h { z20.h }, p2/Z, [x12, #1, MUL VL]\n"
"sub x21, x21, x20\n"
- "ld1h { z16.h }, p2/Z, [x28]\n"
- "ld1h { z19.h }, p2/Z, [x28, #1, MUL VL]\n"
"cmp x21, x20\n"
- "ld1h { z24.h }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1h { z23.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "ld1h { z16.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "zip1 z23.h, z18.h, z17.h\n"
+ "zip2 z19.h, z18.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "zip1 z21.h, z20.h, z16.h\n"
+ "zip2 z20.h, z20.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "st1h { z23.h }, p2, [x22]\n"
"addvl x12, x12, #4\n"
- "ld1h { z18.h }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "zip1 z17.h, z21.h, z16.h\n"
- "zip2 z21.h, z21.h, z16.h\n"
- "zip1 z16.h, z20.h, z19.h\n"
- "zip2 z20.h, z20.h, z19.h\n"
- "st1h { z17.h }, p2, [x9]\n"
- "zip1 z19.h, z24.h, z18.h\n"
- "zip2 z18.h, z24.h, z18.h\n"
- "zip1 z17.h, z23.h, z22.h\n"
- "st1h { z21.h }, p2, [x9, #1, MUL VL]\n"
- "st1h { z16.h }, p2, [x9, #2, MUL VL]\n"
- "zip2 z16.h, z23.h, z22.h\n"
- "st1h { z20.h }, p2, [x9, #3, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
- "st1h { z19.h }, p2, [x9]\n"
- "st1h { z18.h }, p2, [x9, #1, MUL VL]\n"
- "st1h { z17.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z16.h }, p2, [x9, #3, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
+ "st1h { z19.h }, p2, [x22, #1, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z21.h }, p2, [x22, #2, MUL VL]\n"
+ "zip1 z17.h, z22.h, z16.h\n"
+ "zip2 z16.h, z22.h, z16.h\n"
+ "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z19.h }, p2, [x22]\n"
+ "st1h { z18.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"mov x20, x21\n"
- "decw x21, ALL, MUL #4\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x12]\n"
+ "ld1h { z17.h }, p0/Z, [x11]\n"
"dech x20\n"
"whilelt p0.h, XZR, x20\n"
- "cmp x21, #0x0\n"
- "ld1h { z18.h }, p1/Z, [x12]\n"
- "ld1h { z17.h }, p1/Z, [x28]\n"
"ld1h { z20.h }, p0/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
- "ld1h { z16.h }, p0/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
+ "ld1h { z16.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "decw x21, ALL, MUL #4\n"
+ "cmp x21, #0x0\n"
"zip1 z19.h, z18.h, z17.h\n"
"zip2 z18.h, z18.h, z17.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
"zip1 z17.h, z20.h, z16.h\n"
"zip2 z16.h, z20.h, z16.h\n"
- "st1h { z19.h }, p2, [x9]\n"
- "st1h { z18.h }, p2, [x9, #1, MUL VL]\n"
- "st1h { z17.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z16.h }, p2, [x9, #3, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
+ "st1h { z19.h }, p2, [x22]\n"
+ "st1h { z18.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
index 69eccad912..bba8f107d8 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
@@ -42,216 +42,216 @@ void sve_transpose_interleave_6VL_1x8(uint8_t *out, const uint8_t *in, size_t wi
"ptrue p1.b\n"
"1:" // Main row loop: Head
"mov x10, %x[in]\n"
- "cmp %x[height], #0x7\n"
- "mov x9, %x[width]\n"
- "cntb x28, ALL, MUL #3\n"
- "mov x27, %x[out]\n"
- "add x26, x10, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
"add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
- "add %x[in], x20, %x[in_stride]\n"
- "csel x20, x20, %x[pad_row], GT\n"
- "csel x21, x21, %x[pad_row], GE\n"
+ "cmp %x[height], #0x7\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
"cmp %x[height], #0x5\n"
- "csel x22, x22, %x[pad_row], GT\n"
- "csel x23, x23, %x[pad_row], GE\n"
+ "mov x22, %x[width]\n"
+ "cntb x21, ALL, MUL #3\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
"cmp %x[height], #0x3\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "csel x25, x25, %x[pad_row], GE\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x26, x26, %x[pad_row], GT\n"
- "cmp x9, x28\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "cmp x22, x21\n"
+ "mov x20, %x[out]\n"
"sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1b { z1.b }, p1/Z, [x10]\n"
- "ld1b { z0.b }, p1/Z, [x26]\n"
- "sub x9, x9, x28\n"
- "ld1b { z31.b }, p1/Z, [x25]\n"
- "ld1b { z28.b }, p1/Z, [x24]\n"
- "cmp x9, x28\n"
- "ld1b { z27.b }, p1/Z, [x23]\n"
- "ld1b { z26.b }, p1/Z, [x22]\n"
- "ld1b { z25.b }, p1/Z, [x21]\n"
- "ld1b { z24.b }, p1/Z, [x20]\n"
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z25.b }, p1/Z, [x9]\n"
+ "sub x22, x22, x21\n"
+ "cmp x22, x21\n"
+ "ld1b { z20.b }, p1/Z, [x28]\n"
+ "ld1b { z24.b }, p1/Z, [x27]\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "zip1 z7.b, z21.b, z19.b\n"
+ "zip1 z6.b, z25.b, z18.b\n"
+ "ld1b { z17.b }, p1/Z, [x24]\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip1 z28.b, z20.b, z17.b\n"
+ "zip1 z27.b, z24.b, z16.b\n"
"ld1b { z23.b }, p1/Z, [x10, #1, MUL VL]\n"
- "ld1b { z30.b }, p1/Z, [x26, #1, MUL VL]\n"
- "ld1b { z22.b }, p1/Z, [x25, #1, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [x24, #1, MUL VL]\n"
- "zip1 z15.b, z1.b, z27.b\n"
- "zip1 z9.b, z0.b, z26.b\n"
- "ld1b { z20.b }, p1/Z, [x23, #1, MUL VL]\n"
- "ld1b { z19.b }, p1/Z, [x22, #1, MUL VL]\n"
- "zip1 z18.b, z31.b, z25.b\n"
- "zip1 z29.b, z28.b, z24.b\n"
- "ld1b { z17.b }, p1/Z, [x21, #1, MUL VL]\n"
- "ld1b { z16.b }, p1/Z, [x20, #1, MUL VL]\n"
- "zip2 z14.b, z1.b, z27.b\n"
- "zip2 z13.b, z31.b, z25.b\n"
- "ld1b { z8.b }, p1/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p1/Z, [x26, #2, MUL VL]\n"
- "zip2 z12.b, z0.b, z26.b\n"
- "zip2 z6.b, z28.b, z24.b\n"
- "ld1b { z5.b }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1b { z4.b }, p1/Z, [x24, #2, MUL VL]\n"
- "zip1 z3.b, z23.b, z20.b\n"
- "zip1 z11.b, z30.b, z19.b\n"
- "ld1b { z28.b }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1b { z27.b }, p1/Z, [x22, #2, MUL VL]\n"
- "zip1 z2.b, z22.b, z17.b\n"
- "zip1 z1.b, z21.b, z16.b\n"
- "ld1b { z26.b }, p1/Z, [x21, #2, MUL VL]\n"
- "ld1b { z25.b }, p1/Z, [x20, #2, MUL VL]\n"
- "zip2 z24.b, z23.b, z20.b\n"
- "zip2 z23.b, z22.b, z17.b\n"
- "zip2 z22.b, z30.b, z19.b\n"
- "zip2 z21.b, z21.b, z16.b\n"
+ "ld1b { z22.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z5.b, z21.b, z19.b\n"
+ "zip2 z4.b, z20.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip2 z3.b, z25.b, z18.b\n"
+ "zip2 z2.b, z24.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z1.b, z23.b, z19.b\n"
+ "zip1 z15.b, z22.b, z18.b\n"
+ "ld1b { z17.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip1 z0.b, z21.b, z17.b\n"
+ "zip1 z31.b, z20.b, z16.b\n"
+ "ld1b { z26.b }, p1/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z30.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "zip2 z14.b, z23.b, z19.b\n"
+ "zip2 z13.b, z21.b, z17.b\n"
+ "ld1b { z25.b }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "zip2 z12.b, z22.b, z18.b\n"
+ "zip2 z11.b, z20.b, z16.b\n"
+ "ld1b { z23.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1b { z22.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "zip1 z10.b, z26.b, z23.b\n"
+ "zip1 z9.b, z30.b, z22.b\n"
+ "ld1b { z21.b }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1b { z17.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "zip1 z29.b, z25.b, z21.b\n"
+ "zip1 z8.b, z24.b, z17.b\n"
+ "zip1 z19.b, z7.b, z28.b\n"
+ "zip1 z16.b, z6.b, z27.b\n"
"addvl x10, x10, #3\n"
+ "addvl x9, x9, #3\n"
+ "zip2 z28.b, z7.b, z28.b\n"
+ "zip2 z18.b, z6.b, z27.b\n"
+ "addvl x28, x28, #3\n"
+ "addvl x27, x27, #3\n"
+ "zip1 z27.b, z5.b, z4.b\n"
+ "zip1 z20.b, z3.b, z2.b\n"
"addvl x26, x26, #3\n"
- "zip1 z0.b, z8.b, z28.b\n"
- "zip1 z10.b, z7.b, z27.b\n"
"addvl x25, x25, #3\n"
+ "zip2 z7.b, z26.b, z23.b\n"
+ "zip2 z26.b, z25.b, z21.b\n"
"addvl x24, x24, #3\n"
- "zip1 z31.b, z5.b, z26.b\n"
- "zip1 z30.b, z4.b, z25.b\n"
"addvl x23, x23, #3\n"
- "addvl x22, x22, #3\n"
- "zip1 z20.b, z15.b, z18.b\n"
- "zip1 z19.b, z9.b, z29.b\n"
- "addvl x21, x21, #3\n"
- "addvl x20, x20, #3\n"
- "zip2 z18.b, z15.b, z18.b\n"
- "zip2 z16.b, z9.b, z29.b\n"
- "zip1 z29.b, z14.b, z13.b\n"
- "zip1 z17.b, z12.b, z6.b\n"
- "zip2 z9.b, z8.b, z28.b\n"
- "zip2 z28.b, z5.b, z26.b\n"
- "zip2 z8.b, z7.b, z27.b\n"
- "zip2 z27.b, z4.b, z25.b\n"
- "zip2 z7.b, z14.b, z13.b\n"
- "zip2 z6.b, z12.b, z6.b\n"
- "zip1 z5.b, z3.b, z2.b\n"
- "zip1 z4.b, z11.b, z1.b\n"
- "zip2 z3.b, z3.b, z2.b\n"
- "zip2 z2.b, z11.b, z1.b\n"
- "zip1 z26.b, z24.b, z23.b\n"
- "zip1 z25.b, z22.b, z21.b\n"
- "zip2 z24.b, z24.b, z23.b\n"
- "zip2 z23.b, z22.b, z21.b\n"
- "zip1 z1.b, z0.b, z31.b\n"
- "zip1 z22.b, z10.b, z30.b\n"
- "zip1 z21.b, z20.b, z19.b\n"
- "zip2 z20.b, z20.b, z19.b\n"
- "zip1 z19.b, z18.b, z16.b\n"
- "zip2 z18.b, z18.b, z16.b\n"
- "zip1 z16.b, z29.b, z17.b\n"
- "zip2 z17.b, z29.b, z17.b\n"
- "st1b { z21.b }, p1, [x27]\n"
- "st1b { z20.b }, p1, [x27, #1, MUL VL]\n"
- "zip2 z0.b, z0.b, z31.b\n"
- "zip2 z31.b, z10.b, z30.b\n"
- "st1b { z19.b }, p1, [x27, #2, MUL VL]\n"
- "zip1 z30.b, z9.b, z28.b\n"
- "zip1 z29.b, z8.b, z27.b\n"
- "st1b { z18.b }, p1, [x27, #3, MUL VL]\n"
- "zip2 z28.b, z9.b, z28.b\n"
- "zip2 z27.b, z8.b, z27.b\n"
- "st1b { z16.b }, p1, [x27, #4, MUL VL]\n"
- "zip1 z21.b, z7.b, z6.b\n"
- "zip2 z16.b, z7.b, z6.b\n"
- "st1b { z17.b }, p1, [x27, #5, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 z20.b, z5.b, z4.b\n"
- "zip2 z19.b, z5.b, z4.b\n"
+ "zip2 z6.b, z30.b, z22.b\n"
+ "zip2 z25.b, z24.b, z17.b\n"
+ "zip2 z5.b, z5.b, z4.b\n"
+ "zip2 z4.b, z3.b, z2.b\n"
+ "zip1 z3.b, z1.b, z0.b\n"
+ "zip1 z2.b, z15.b, z31.b\n"
+ "zip2 z1.b, z1.b, z0.b\n"
+ "zip2 z0.b, z15.b, z31.b\n"
+ "zip1 z31.b, z14.b, z13.b\n"
+ "zip1 z30.b, z12.b, z11.b\n"
+ "zip2 z24.b, z14.b, z13.b\n"
+ "zip2 z23.b, z12.b, z11.b\n"
+ "zip1 z22.b, z10.b, z29.b\n"
+ "zip1 z21.b, z9.b, z8.b\n"
+ "zip1 z17.b, z19.b, z16.b\n"
+ "zip2 z16.b, z19.b, z16.b\n"
+ "st1b { z17.b }, p1, [x20]\n"
+ "zip1 z19.b, z28.b, z18.b\n"
+ "zip2 z18.b, z28.b, z18.b\n"
+ "st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z17.b, z27.b, z20.b\n"
+ "zip2 z16.b, z27.b, z20.b\n"
+ "st1b { z19.b }, p1, [x20, #2, MUL VL]\n"
+ "st1b { z18.b }, p1, [x20, #3, MUL VL]\n"
+ "zip2 z29.b, z10.b, z29.b\n"
+ "zip2 z20.b, z9.b, z8.b\n"
+ "st1b { z17.b }, p1, [x20, #4, MUL VL]\n"
+ "zip1 z28.b, z7.b, z26.b\n"
+ "zip1 z27.b, z6.b, z25.b\n"
+ "st1b { z16.b }, p1, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip2 z26.b, z7.b, z26.b\n"
+ "zip2 z25.b, z6.b, z25.b\n"
+ "zip1 z17.b, z5.b, z4.b\n"
+ "zip2 z16.b, z5.b, z4.b\n"
+ "st1b { z17.b }, p1, [x20]\n"
"zip1 z18.b, z3.b, z2.b\n"
"zip2 z17.b, z3.b, z2.b\n"
- "st1b { z21.b }, p1, [x27]\n"
- "st1b { z16.b }, p1, [x27, #1, MUL VL]\n"
- "zip1 z16.b, z26.b, z25.b\n"
- "zip2 z26.b, z26.b, z25.b\n"
- "st1b { z20.b }, p1, [x27, #2, MUL VL]\n"
- "zip1 z25.b, z24.b, z23.b\n"
+ "st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z16.b, z1.b, z0.b\n"
+ "zip2 z19.b, z1.b, z0.b\n"
+ "st1b { z18.b }, p1, [x20, #2, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20, #3, MUL VL]\n"
+ "zip1 z18.b, z31.b, z30.b\n"
+ "zip2 z17.b, z31.b, z30.b\n"
+ "st1b { z16.b }, p1, [x20, #4, MUL VL]\n"
+ "zip1 z16.b, z24.b, z23.b\n"
"zip2 z24.b, z24.b, z23.b\n"
- "st1b { z19.b }, p1, [x27, #3, MUL VL]\n"
- "zip1 z23.b, z1.b, z22.b\n"
- "zip2 z22.b, z1.b, z22.b\n"
- "st1b { z18.b }, p1, [x27, #4, MUL VL]\n"
- "zip1 z21.b, z0.b, z31.b\n"
- "zip2 z20.b, z0.b, z31.b\n"
- "st1b { z17.b }, p1, [x27, #5, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "zip1 z19.b, z30.b, z29.b\n"
- "zip2 z18.b, z30.b, z29.b\n"
- "st1b { z16.b }, p1, [x27]\n"
- "zip1 z17.b, z28.b, z27.b\n"
- "zip2 z16.b, z28.b, z27.b\n"
- "st1b { z26.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z25.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z24.b }, p1, [x27, #3, MUL VL]\n"
- "st1b { z23.b }, p1, [x27, #4, MUL VL]\n"
- "st1b { z22.b }, p1, [x27, #5, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z21.b }, p1, [x27]\n"
- "st1b { z20.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z19.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z18.b }, p1, [x27, #3, MUL VL]\n"
- "st1b { z17.b }, p1, [x27, #4, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #5, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "st1b { z19.b }, p1, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 z23.b, z22.b, z21.b\n"
+ "zip2 z22.b, z22.b, z21.b\n"
+ "st1b { z18.b }, p1, [x20]\n"
+ "zip1 z21.b, z29.b, z20.b\n"
+ "zip2 z20.b, z29.b, z20.b\n"
+ "st1b { z17.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z19.b, z28.b, z27.b\n"
+ "zip2 z18.b, z28.b, z27.b\n"
+ "st1b { z16.b }, p1, [x20, #2, MUL VL]\n"
+ "zip1 z17.b, z26.b, z25.b\n"
+ "zip2 z16.b, z26.b, z25.b\n"
+ "st1b { z24.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z23.b }, p1, [x20, #4, MUL VL]\n"
+ "st1b { z22.b }, p1, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "st1b { z21.b }, p1, [x20]\n"
+ "st1b { z20.b }, p1, [x20, #1, MUL VL]\n"
+ "st1b { z19.b }, p1, [x20, #2, MUL VL]\n"
+ "st1b { z18.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20, #4, MUL VL]\n"
+ "st1b { z16.b }, p1, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x9, 5f\n"
+ "cbz x22, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.b, XZR, x9\n"
- "decd x9, ALL, MUL #6\n"
- "ld1b { z21.b }, p0/Z, [x10]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1b { z23.b }, p0/Z, [x10]\n"
+ "ld1b { z27.b }, p0/Z, [x9]\n"
+ "decd x22, ALL, MUL #6\n"
+ "ld1b { z21.b }, p0/Z, [x28]\n"
+ "ld1b { z26.b }, p0/Z, [x27]\n"
+ "cmp x22, #0x0\n"
"incd x10, ALL, MUL #6\n"
- "ld1b { z26.b }, p0/Z, [x26]\n"
+ "ld1b { z20.b }, p0/Z, [x26]\n"
+ "ld1b { z19.b }, p0/Z, [x25]\n"
+ "zip1 z25.b, z23.b, z20.b\n"
+ "zip1 z24.b, z27.b, z19.b\n"
+ "ld1b { z17.b }, p0/Z, [x24]\n"
+ "ld1b { z16.b }, p0/Z, [x23]\n"
+ "zip1 z22.b, z21.b, z17.b\n"
+ "zip1 z18.b, z26.b, z16.b\n"
+ "zip2 z23.b, z23.b, z20.b\n"
+ "zip2 z21.b, z21.b, z17.b\n"
+ "incd x9, ALL, MUL #6\n"
+ "incd x28, ALL, MUL #6\n"
+ "zip2 z20.b, z27.b, z19.b\n"
+ "zip2 z17.b, z26.b, z16.b\n"
+ "incd x27, ALL, MUL #6\n"
"incd x26, ALL, MUL #6\n"
- "ld1b { z20.b }, p0/Z, [x25]\n"
+ "zip1 z19.b, z25.b, z22.b\n"
+ "zip1 z16.b, z24.b, z18.b\n"
"incd x25, ALL, MUL #6\n"
- "ld1b { z25.b }, p0/Z, [x24]\n"
"incd x24, ALL, MUL #6\n"
- "ld1b { z18.b }, p0/Z, [x23]\n"
- "ld1b { z19.b }, p0/Z, [x22]\n"
- "ld1b { z17.b }, p0/Z, [x21]\n"
- "cmp x9, #0x0\n"
+ "zip2 z22.b, z25.b, z22.b\n"
+ "zip2 z18.b, z24.b, z18.b\n"
"incd x23, ALL, MUL #6\n"
- "ld1b { z16.b }, p0/Z, [x20]\n"
- "incd x22, ALL, MUL #6\n"
- "incd x21, ALL, MUL #6\n"
- "zip1 z23.b, z21.b, z18.b\n"
- "zip2 z24.b, z21.b, z18.b\n"
- "incd x20, ALL, MUL #6\n"
- "zip1 z22.b, z20.b, z17.b\n"
- "zip1 z21.b, z26.b, z19.b\n"
- "zip2 z18.b, z20.b, z17.b\n"
- "zip1 z17.b, z25.b, z16.b\n"
- "zip2 z20.b, z26.b, z19.b\n"
- "zip2 z16.b, z25.b, z16.b\n"
- "zip1 z19.b, z23.b, z22.b\n"
- "zip2 z23.b, z23.b, z22.b\n"
- "zip1 z22.b, z24.b, z18.b\n"
- "zip1 z18.b, z21.b, z17.b\n"
- "zip2 z17.b, z21.b, z17.b\n"
- "zip1 z16.b, z20.b, z16.b\n"
- "zip1 z21.b, z19.b, z18.b\n"
- "zip2 z20.b, z19.b, z18.b\n"
- "zip1 z19.b, z23.b, z17.b\n"
- "zip2 z18.b, z23.b, z17.b\n"
- "zip1 z17.b, z22.b, z16.b\n"
- "zip2 z16.b, z22.b, z16.b\n"
- "st1b { z21.b }, p1, [x27]\n"
- "st1b { z20.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z19.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z18.b }, p1, [x27, #3, MUL VL]\n"
- "st1b { z17.b }, p1, [x27, #4, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #5, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "zip1 z21.b, z23.b, z21.b\n"
+ "zip1 z20.b, z20.b, z17.b\n"
+ "zip1 z17.b, z19.b, z16.b\n"
+ "zip2 z16.b, z19.b, z16.b\n"
+ "st1b { z17.b }, p1, [x20]\n"
+ "zip1 z19.b, z22.b, z18.b\n"
+ "zip2 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z17.b, z21.b, z20.b\n"
+ "zip2 z16.b, z21.b, z20.b\n"
+ "st1b { z19.b }, p1, [x20, #2, MUL VL]\n"
+ "st1b { z18.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20, #4, MUL VL]\n"
+ "st1b { z16.b }, p1, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
index 6c0a5c029b..ebfc65be34 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
@@ -44,125 +44,125 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t
"blt 6f\n"
"1:" // Main row loop: Head
"mov x12, %x[in]\n"
- "mov x11, %x[width]\n"
- "cnth x10, ALL, MUL #3\n"
- "mov x9, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x28, x12, %x[in_stride]\n"
- "add x27, x28, %x[in_stride]\n"
- "add x26, x27, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "cmp x11, x10\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x27, %x[width]\n"
+ "cnth x26, ALL, MUL #3\n"
+ "add x25, x28, %x[in_stride]\n"
"add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add %x[in], x22, %x[in_stride]\n"
+ "cmp x27, x26\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1h { z19.h }, p2/Z, [x12]\n"
+ "ld1h { z18.h }, p2/Z, [x12]\n"
"ld1h { z13.h }, p2/Z, [x12, #1, MUL VL]\n"
- "mov x21, x9\n"
- "add x9, x9, %x[out_stride]\n"
- "ld1h { z18.h }, p2/Z, [x28]\n"
- "ld1h { z12.h }, p2/Z, [x28, #1, MUL VL]\n"
- "mov x20, x9\n"
- "sub x11, x11, x10\n"
- "ld1h { z16.h }, p2/Z, [x27]\n"
- "ld1h { z11.h }, p2/Z, [x27, #1, MUL VL]\n"
- "cmp x11, x10\n"
- "add x9, x9, %x[out_stride]\n"
- "ld1h { z17.h }, p2/Z, [x26]\n"
- "ld1h { z10.h }, p2/Z, [x26, #1, MUL VL]\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
- "ld1h { z8.h }, p2/Z, [x24]\n"
- "ld1h { z28.h }, p2/Z, [x23]\n"
- "ld1h { z26.h }, p2/Z, [x22]\n"
- "zip1 z24.h, z19.h, z16.h\n"
- "zip2 z23.h, z19.h, z16.h\n"
- "ld1h { z7.h }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1h { z6.h }, p2/Z, [x28, #2, MUL VL]\n"
- "zip1 z16.h, z18.h, z17.h\n"
- "zip2 z22.h, z18.h, z17.h\n"
- "ld1h { z5.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z4.h }, p2/Z, [x26, #2, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "ld1h { z12.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "sub x27, x27, x26\n"
+ "ld1h { z16.h }, p2/Z, [x10]\n"
+ "ld1h { z11.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z23.h, z18.h, z16.h\n"
+ "zip2 z29.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "ld1h { z10.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z22.h, z17.h, z16.h\n"
+ "zip2 z28.h, z17.h, z16.h\n"
+ "ld1h { z27.h }, p2/Z, [x28]\n"
+ "ld1h { z26.h }, p2/Z, [x25]\n"
"zip1 z21.h, z13.h, z11.h\n"
"zip1 z20.h, z12.h, z10.h\n"
- "ld1h { z3.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z2.h }, p2/Z, [x24, #1, MUL VL]\n"
- "zip1 z19.h, z9.h, z28.h\n"
- "zip1 z18.h, z8.h, z26.h\n"
- "ld1h { z27.h }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1h { z1.h }, p2/Z, [x22, #1, MUL VL]\n"
- "zip1 z17.h, z24.h, z16.h\n"
- "zip2 z16.h, z24.h, z16.h\n"
- "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z31.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x24]\n"
+ "ld1h { z19.h }, p2/Z, [x23]\n"
+ "zip1 z17.h, z27.h, z18.h\n"
+ "zip1 z16.h, z26.h, z19.h\n"
+ "ld1h { z9.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z8.h }, p2/Z, [x11, #2, MUL VL]\n"
"zip1 z25.h, z23.h, z22.h\n"
"zip2 z24.h, z23.h, z22.h\n"
- "ld1h { z30.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z29.h }, p2/Z, [x22, #2, MUL VL]\n"
- "zip1 z23.h, z21.h, z20.h\n"
- "zip2 z22.h, z21.h, z20.h\n"
- "zip1 z21.h, z19.h, z18.h\n"
- "zip2 z20.h, z19.h, z18.h\n"
- "st1h { z17.h }, p2, [x21]\n"
+ "ld1h { z23.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z7.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "zip1 z22.h, z29.h, z28.h\n"
+ "zip2 z6.h, z29.h, z28.h\n"
+ "ld1h { z28.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z5.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z4.h, z21.h, z20.h\n"
+ "zip2 z3.h, z21.h, z20.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z20.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z2.h, z17.h, z16.h\n"
+ "zip2 z1.h, z17.h, z16.h\n"
+ "ld1h { z0.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z31.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "zip2 z18.h, z27.h, z18.h\n"
+ "zip2 z17.h, z26.h, z19.h\n"
+ "ld1h { z30.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z29.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z19.h, z28.h, z21.h\n"
+ "zip1 z16.h, z5.h, z20.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "zip2 z27.h, z13.h, z11.h\n"
+ "zip2 z26.h, z12.h, z10.h\n"
+ "cmp x27, x26\n"
+ "st1h { z24.h }, p2, [x21, #1, MUL VL]\n"
+ "zip1 z25.h, z9.h, z23.h\n"
+ "zip1 z24.h, z8.h, z7.h\n"
"addvl x12, x12, #3\n"
- "zip2 z19.h, z9.h, z28.h\n"
- "zip2 z18.h, z8.h, z26.h\n"
- "st1h { z16.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "zip2 z23.h, z9.h, z23.h\n"
+ "zip2 z22.h, z8.h, z7.h\n"
+ "addvl x11, x11, #3\n"
+ "st1h { z6.h }, p2, [x21, #3, MUL VL]\n"
+ "zip2 z28.h, z28.h, z21.h\n"
+ "zip2 z21.h, z5.h, z20.h\n"
+ "addvl x10, x10, #3\n"
+ "st1h { z4.h }, p2, [x21, #4, MUL VL]\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "addvl x9, x9, #3\n"
+ "st1h { z3.h }, p2, [x21, #5, MUL VL]\n"
+ "zip1 z17.h, z19.h, z16.h\n"
+ "zip2 z16.h, z19.h, z16.h\n"
"addvl x28, x28, #3\n"
- "zip1 z17.h, z3.h, z27.h\n"
- "zip1 z16.h, z2.h, z1.h\n"
- "st1h { z25.h }, p2, [x21, #2, MUL VL]\n"
- "addvl x27, x27, #3\n"
- "st1h { z24.h }, p2, [x21, #3, MUL VL]\n"
- "zip2 z26.h, z13.h, z11.h\n"
- "zip2 z25.h, z12.h, z10.h\n"
- "addvl x26, x26, #3\n"
- "st1h { z23.h }, p2, [x21, #4, MUL VL]\n"
- "zip1 z28.h, z7.h, z5.h\n"
- "zip1 z24.h, z6.h, z4.h\n"
+ "st1h { z2.h }, p2, [x21, #6, MUL VL]\n"
+ "zip1 z19.h, z27.h, z26.h\n"
+ "zip2 z27.h, z27.h, z26.h\n"
"addvl x25, x25, #3\n"
- "st1h { z22.h }, p2, [x21, #5, MUL VL]\n"
- "zip2 z23.h, z7.h, z5.h\n"
- "zip2 z22.h, z6.h, z4.h\n"
- "addvl x24, x24, #3\n"
- "st1h { z21.h }, p2, [x21, #6, MUL VL]\n"
- "zip2 z27.h, z3.h, z27.h\n"
- "zip2 z21.h, z2.h, z1.h\n"
- "addvl x23, x23, #3\n"
- "st1h { z20.h }, p2, [x21, #7, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #7, MUL VL]\n"
"addvl x21, x21, #12\n"
- "zip1 z20.h, z19.h, z18.h\n"
- "zip2 z19.h, z19.h, z18.h\n"
- "zip1 z18.h, z17.h, z16.h\n"
- "zip2 z17.h, z17.h, z16.h\n"
- "addvl x22, x22, #3\n"
- "zip1 z16.h, z26.h, z25.h\n"
- "zip2 z26.h, z26.h, z25.h\n"
+ "zip1 z26.h, z25.h, z24.h\n"
+ "zip2 z25.h, z25.h, z24.h\n"
"st1h { z20.h }, p2, [x21, #-4, MUL VL]\n"
- "st1h { z19.h }, p2, [x21, #-3, MUL VL]\n"
- "zip1 z25.h, z28.h, z24.h\n"
- "zip2 z19.h, z28.h, z24.h\n"
- "st1h { z18.h }, p2, [x21, #-2, MUL VL]\n"
"zip1 z24.h, z23.h, z22.h\n"
"zip2 z23.h, z23.h, z22.h\n"
- "st1h { z17.h }, p2, [x21, #-1, MUL VL]\n"
- "zip1 z22.h, z27.h, z21.h\n"
- "zip2 z21.h, z27.h, z21.h\n"
+ "addvl x24, x24, #3\n"
+ "st1h { z18.h }, p2, [x21, #-3, MUL VL]\n"
+ "zip1 z22.h, z28.h, z21.h\n"
+ "zip2 z21.h, z28.h, z21.h\n"
+ "addvl x23, x23, #3\n"
+ "st1h { z17.h }, p2, [x21, #-2, MUL VL]\n"
"zip1 z18.h, z0.h, z30.h\n"
"zip1 z17.h, z31.h, z29.h\n"
- "st1h { z16.h }, p2, [x20]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z16.h }, p2, [x21, #-1, MUL VL]\n"
"zip2 z20.h, z0.h, z30.h\n"
"zip2 z16.h, z31.h, z29.h\n"
- "st1h { z26.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z25.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z19.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z19.h }, p2, [x20]\n"
"zip1 z19.h, z18.h, z17.h\n"
"zip2 z18.h, z18.h, z17.h\n"
- "st1h { z24.h }, p2, [x20, #4, MUL VL]\n"
+ "st1h { z27.h }, p2, [x20, #1, MUL VL]\n"
"zip1 z17.h, z20.h, z16.h\n"
"zip2 z16.h, z20.h, z16.h\n"
+ "st1h { z26.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z25.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z24.h }, p2, [x20, #4, MUL VL]\n"
"st1h { z23.h }, p2, [x20, #5, MUL VL]\n"
"st1h { z22.h }, p2, [x20, #6, MUL VL]\n"
"st1h { z21.h }, p2, [x20, #7, MUL VL]\n"
@@ -173,79 +173,79 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t
"st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x11, 5f\n"
+ "cbz x27, 5f\n"
"4:" // Main row loop: Column loop
- "mov x21, x11\n"
- "mov x20, x9\n"
- "decd x11, ALL, MUL #6\n"
- "add x9, x9, %x[out_stride]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p0.h, XZR, x21\n"
- "cmp x11, #0x0\n"
- "ld1h { z23.h }, p1/Z, [x12]\n"
- "ld1h { z25.h }, p1/Z, [x28]\n"
- "ld1h { z22.h }, p1/Z, [x27]\n"
- "ld1h { z19.h }, p1/Z, [x26]\n"
- "ld1h { z1.h }, p1/Z, [x25]\n"
- "ld1h { z0.h }, p1/Z, [x24]\n"
- "ld1h { z21.h }, p0/Z, [x12, #1, MUL VL]\n"
- "ld1h { z20.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "mov x20, x27\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z17.h }, p1/Z, [x12]\n"
+ "ld1h { z19.h }, p1/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z22.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z21.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x10]\n"
+ "ld1h { z20.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "zip1 z25.h, z17.h, z16.h\n"
+ "zip2 z24.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x9]\n"
+ "ld1h { z17.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z16.h, z19.h, z18.h\n"
+ "zip2 z19.h, z19.h, z18.h\n"
+ "ld1h { z0.h }, p1/Z, [x28]\n"
+ "ld1h { z31.h }, p1/Z, [x25]\n"
+ "zip1 z23.h, z22.h, z20.h\n"
+ "zip1 z22.h, z21.h, z17.h\n"
+ "ld1h { z30.h }, p1/Z, [x24]\n"
+ "ld1h { z29.h }, p1/Z, [x23]\n"
+ "zip1 z21.h, z0.h, z30.h\n"
+ "zip1 z18.h, z31.h, z29.h\n"
+ "ld1h { z28.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "decd x27, ALL, MUL #6\n"
+ "ld1h { z20.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z26.h }, p0/Z, [x23, #1, MUL VL]\n"
"addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "zip1 z17.h, z25.h, z16.h\n"
+ "zip2 z16.h, z25.h, z16.h\n"
"addvl x28, x28, #1\n"
- "ld1h { z18.h }, p0/Z, [x27, #1, MUL VL]\n"
- "ld1h { z17.h }, p0/Z, [x26, #1, MUL VL]\n"
- "zip1 z24.h, z23.h, z22.h\n"
- "zip1 z16.h, z25.h, z19.h\n"
- "ld1h { z31.h }, p1/Z, [x23]\n"
- "ld1h { z30.h }, p1/Z, [x22]\n"
- "zip2 z23.h, z23.h, z22.h\n"
- "zip2 z19.h, z25.h, z19.h\n"
- "ld1h { z29.h }, p0/Z, [x25, #1, MUL VL]\n"
- "ld1h { z28.h }, p0/Z, [x24, #1, MUL VL]\n"
- "addvl x27, x27, #1\n"
- "addvl x26, x26, #1\n"
- "ld1h { z27.h }, p0/Z, [x23, #1, MUL VL]\n"
- "ld1h { z26.h }, p0/Z, [x22, #1, MUL VL]\n"
- "zip1 z22.h, z21.h, z18.h\n"
- "zip1 z21.h, z20.h, z17.h\n"
- "zip1 z20.h, z1.h, z31.h\n"
- "zip1 z18.h, z0.h, z30.h\n"
"addvl x25, x25, #1\n"
+ "zip1 z25.h, z24.h, z19.h\n"
+ "zip2 z19.h, z24.h, z19.h\n"
"addvl x24, x24, #1\n"
"addvl x23, x23, #1\n"
- "addvl x22, x22, #1\n"
- "zip1 z17.h, z24.h, z16.h\n"
- "zip2 z16.h, z24.h, z16.h\n"
- "zip1 z25.h, z23.h, z19.h\n"
- "zip2 z24.h, z23.h, z19.h\n"
- "incd x12, ALL, MUL #4\n"
- "incd x28, ALL, MUL #4\n"
- "zip1 z19.h, z22.h, z21.h\n"
- "zip2 z23.h, z22.h, z21.h\n"
- "incd x27, ALL, MUL #4\n"
- "incd x26, ALL, MUL #4\n"
- "zip1 z22.h, z20.h, z18.h\n"
- "zip2 z21.h, z20.h, z18.h\n"
+ "zip1 z24.h, z23.h, z22.h\n"
+ "zip2 z23.h, z23.h, z22.h\n"
+ "zip1 z22.h, z21.h, z18.h\n"
+ "zip2 z21.h, z21.h, z18.h\n"
"st1h { z17.h }, p2, [x20]\n"
- "incd x25, ALL, MUL #4\n"
- "zip2 z18.h, z1.h, z31.h\n"
- "zip2 z17.h, z0.h, z30.h\n"
+ "cmp x27, #0x0\n"
+ "zip2 z18.h, z0.h, z30.h\n"
+ "zip2 z17.h, z31.h, z29.h\n"
"st1h { z16.h }, p2, [x20, #1, MUL VL]\n"
- "incd x24, ALL, MUL #4\n"
- "zip1 z20.h, z29.h, z27.h\n"
- "zip1 z16.h, z28.h, z26.h\n"
+ "incd x12, ALL, MUL #4\n"
+ "zip1 z20.h, z28.h, z20.h\n"
+ "zip1 z16.h, z27.h, z26.h\n"
"st1h { z25.h }, p2, [x20, #2, MUL VL]\n"
- "incd x23, ALL, MUL #4\n"
- "st1h { z24.h }, p2, [x20, #3, MUL VL]\n"
- "incd x22, ALL, MUL #4\n"
- "st1h { z19.h }, p2, [x20, #4, MUL VL]\n"
+ "incd x11, ALL, MUL #4\n"
+ "st1h { z19.h }, p2, [x20, #3, MUL VL]\n"
+ "incd x10, ALL, MUL #4\n"
+ "incd x9, ALL, MUL #4\n"
"zip1 z19.h, z18.h, z17.h\n"
+ "st1h { z24.h }, p2, [x20, #4, MUL VL]\n"
+ "incd x28, ALL, MUL #4\n"
+ "incd x25, ALL, MUL #4\n"
"zip2 z18.h, z18.h, z17.h\n"
"st1h { z23.h }, p2, [x20, #5, MUL VL]\n"
+ "incd x24, ALL, MUL #4\n"
+ "incd x23, ALL, MUL #4\n"
"zip1 z17.h, z20.h, z16.h\n"
- "zip2 z16.h, z20.h, z16.h\n"
"st1h { z22.h }, p2, [x20, #6, MUL VL]\n"
+ "zip2 z16.h, z20.h, z16.h\n"
+ "add x22, x22, %x[out_stride]\n"
"st1h { z21.h }, p2, [x20, #7, MUL VL]\n"
"addvl x20, x20, #12\n"
"st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
@@ -261,123 +261,123 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t
"6:" // Main loop skip
"7:" // Tail row loop: Head
"mov x12, %x[in]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
"mov x21, %x[width]\n"
"cnth x20, ALL, MUL #3\n"
+ "add x9, x10, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x9, %x[out]\n"
- "add x28, x12, %x[in_stride]\n"
- "add x27, x28, %x[in_stride]\n"
- "add x26, x27, %x[in_stride]\n"
- "add %x[in], x26, %x[in_stride]\n"
- "csel x26, x26, %x[pad_row], GT\n"
- "csel x27, x27, %x[pad_row], GE\n"
+ "add %x[in], x9, %x[in_stride]\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "csel x10, x10, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x28, x28, %x[pad_row], GT\n"
+ "csel x11, x11, %x[pad_row], GT\n"
"cmp x21, x20\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1h { z22.h }, p2/Z, [x12]\n"
- "ld1h { z28.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x12]\n"
+ "ld1h { z24.h }, p2/Z, [x12, #1, MUL VL]\n"
"sub x21, x21, x20\n"
- "ld1h { z21.h }, p2/Z, [x28]\n"
- "ld1h { z27.h }, p2/Z, [x28, #1, MUL VL]\n"
"cmp x21, x20\n"
- "ld1h { z17.h }, p2/Z, [x27]\n"
- "ld1h { z26.h }, p2/Z, [x27, #1, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x26]\n"
- "ld1h { z20.h }, p2/Z, [x26, #1, MUL VL]\n"
- "ld1h { z25.h }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1h { z29.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "ld1h { z23.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x10]\n"
+ "ld1h { z22.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z31.h, z18.h, z16.h\n"
+ "zip2 z30.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "ld1h { z20.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z29.h, z17.h, z16.h\n"
+ "zip2 z28.h, z17.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "zip1 z27.h, z24.h, z22.h\n"
+ "zip1 z21.h, z23.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "zip2 z26.h, z24.h, z22.h\n"
+ "zip2 z20.h, z23.h, z20.h\n"
+ "zip1 z25.h, z19.h, z17.h\n"
+ "zip1 z24.h, z18.h, z16.h\n"
"addvl x12, x12, #3\n"
- "addvl x28, x28, #3\n"
- "ld1h { z24.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z19.h }, p2/Z, [x26, #2, MUL VL]\n"
- "zip1 z18.h, z22.h, z17.h\n"
- "zip2 z23.h, z22.h, z17.h\n"
- "zip1 z17.h, z21.h, z16.h\n"
- "zip2 z16.h, z21.h, z16.h\n"
- "addvl x27, x27, #3\n"
- "addvl x26, x26, #3\n"
- "zip1 z22.h, z28.h, z26.h\n"
- "zip1 z21.h, z27.h, z20.h\n"
- "zip2 z28.h, z28.h, z26.h\n"
- "zip2 z20.h, z27.h, z20.h\n"
- "zip1 z27.h, z25.h, z24.h\n"
- "zip1 z26.h, z29.h, z19.h\n"
- "zip2 z25.h, z25.h, z24.h\n"
- "zip2 z24.h, z29.h, z19.h\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z18.h, z18.h, z17.h\n"
- "zip1 z17.h, z23.h, z16.h\n"
- "zip2 z16.h, z23.h, z16.h\n"
- "zip1 z23.h, z22.h, z21.h\n"
- "zip2 z22.h, z22.h, z21.h\n"
- "st1h { z19.h }, p2, [x9]\n"
- "st1h { z18.h }, p2, [x9, #1, MUL VL]\n"
- "zip1 z21.h, z28.h, z20.h\n"
- "zip2 z20.h, z28.h, z20.h\n"
- "st1h { z17.h }, p2, [x9, #2, MUL VL]\n"
- "zip1 z19.h, z27.h, z26.h\n"
- "zip2 z18.h, z27.h, z26.h\n"
- "st1h { z16.h }, p2, [x9, #3, MUL VL]\n"
- "zip1 z17.h, z25.h, z24.h\n"
- "zip2 z16.h, z25.h, z24.h\n"
- "st1h { z23.h }, p2, [x9, #4, MUL VL]\n"
- "st1h { z22.h }, p2, [x9, #5, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
- "st1h { z21.h }, p2, [x9]\n"
- "st1h { z20.h }, p2, [x9, #1, MUL VL]\n"
- "st1h { z19.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z18.h }, p2, [x9, #3, MUL VL]\n"
- "st1h { z17.h }, p2, [x9, #4, MUL VL]\n"
- "st1h { z16.h }, p2, [x9, #5, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
+ "addvl x11, x11, #3\n"
+ "zip2 z23.h, z19.h, z17.h\n"
+ "zip2 z22.h, z18.h, z16.h\n"
+ "addvl x10, x10, #3\n"
+ "addvl x9, x9, #3\n"
+ "zip1 z17.h, z31.h, z29.h\n"
+ "zip2 z16.h, z31.h, z29.h\n"
+ "st1h { z17.h }, p2, [x22]\n"
+ "zip1 z19.h, z30.h, z28.h\n"
+ "zip2 z18.h, z30.h, z28.h\n"
+ "st1h { z16.h }, p2, [x22, #1, MUL VL]\n"
+ "zip1 z17.h, z27.h, z21.h\n"
+ "zip2 z16.h, z27.h, z21.h\n"
+ "st1h { z19.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #3, MUL VL]\n"
+ "zip1 z21.h, z26.h, z20.h\n"
+ "zip2 z20.h, z26.h, z20.h\n"
+ "st1h { z17.h }, p2, [x22, #4, MUL VL]\n"
+ "zip1 z19.h, z25.h, z24.h\n"
+ "zip2 z18.h, z25.h, z24.h\n"
+ "st1h { z16.h }, p2, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z17.h, z23.h, z22.h\n"
+ "zip2 z16.h, z23.h, z22.h\n"
+ "st1h { z21.h }, p2, [x22]\n"
+ "st1h { z20.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"mov x20, x21\n"
- "decd x21, ALL, MUL #6\n"
"whilelt p1.h, XZR, x20\n"
+ "ld1h { z22.h }, p1/Z, [x12]\n"
+ "ld1h { z25.h }, p1/Z, [x11]\n"
"dech x20\n"
"whilelt p0.h, XZR, x20\n"
- "cmp x21, #0x0\n"
- "ld1h { z25.h }, p1/Z, [x12]\n"
- "ld1h { z24.h }, p1/Z, [x28]\n"
- "ld1h { z18.h }, p1/Z, [x27]\n"
- "ld1h { z17.h }, p1/Z, [x26]\n"
- "ld1h { z22.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z24.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z21.h }, p1/Z, [x10]\n"
+ "ld1h { z20.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "decd x21, ALL, MUL #6\n"
"addvl x12, x12, #1\n"
- "ld1h { z23.h }, p0/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #1\n"
- "ld1h { z21.h }, p0/Z, [x27, #1, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x26, #1, MUL VL]\n"
- "addvl x27, x27, #1\n"
- "addvl x26, x26, #1\n"
- "zip1 z20.h, z25.h, z18.h\n"
- "zip1 z19.h, z24.h, z17.h\n"
+ "ld1h { z18.h }, p1/Z, [x9]\n"
+ "ld1h { z17.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "zip1 z19.h, z22.h, z21.h\n"
+ "zip1 z16.h, z25.h, z18.h\n"
+ "cmp x21, #0x0\n"
+ "zip2 z22.h, z22.h, z21.h\n"
"zip2 z18.h, z25.h, z18.h\n"
- "zip2 z17.h, z24.h, z17.h\n"
"incd x12, ALL, MUL #4\n"
- "incd x28, ALL, MUL #4\n"
- "zip1 z22.h, z22.h, z21.h\n"
- "zip1 z16.h, z23.h, z16.h\n"
- "incd x27, ALL, MUL #4\n"
- "incd x26, ALL, MUL #4\n"
- "zip1 z21.h, z20.h, z19.h\n"
- "zip2 z20.h, z20.h, z19.h\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z18.h, z18.h, z17.h\n"
- "zip1 z17.h, z22.h, z16.h\n"
- "zip2 z16.h, z22.h, z16.h\n"
- "st1h { z21.h }, p2, [x9]\n"
- "st1h { z20.h }, p2, [x9, #1, MUL VL]\n"
- "st1h { z19.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z18.h }, p2, [x9, #3, MUL VL]\n"
- "st1h { z17.h }, p2, [x9, #4, MUL VL]\n"
- "st1h { z16.h }, p2, [x9, #5, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
+ "incd x11, ALL, MUL #4\n"
+ "zip1 z21.h, z24.h, z20.h\n"
+ "zip1 z20.h, z23.h, z17.h\n"
+ "incd x10, ALL, MUL #4\n"
+ "incd x9, ALL, MUL #4\n"
+ "zip1 z17.h, z19.h, z16.h\n"
+ "zip2 z16.h, z19.h, z16.h\n"
+ "st1h { z17.h }, p2, [x22]\n"
+ "zip1 z19.h, z22.h, z18.h\n"
+ "zip2 z18.h, z22.h, z18.h\n"
+ "st1h { z16.h }, p2, [x22, #1, MUL VL]\n"
+ "zip1 z17.h, z21.h, z20.h\n"
+ "zip2 z16.h, z21.h, z20.h\n"
+ "st1h { z19.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
index 8ff9551c19..f23b9011d0 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
@@ -39,176 +39,176 @@ void sve_transpose_interleave_6VL_2x4_fp32bf16(bfloat16 *out, const float *in, s
size_t out_stride = 6 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
__asm__ __volatile__(
- "ptrue p2.b\n"
+ "ptrue p3.b\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[width]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
"cnth x20, ALL, MUL #3\n"
+ "add x22, x24, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x24, %x[out]\n"
- "add x23, x26, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add %x[in], x21, %x[in_stride]\n"
- "csel x21, x21, %x[pad_row], GT\n"
- "csel x22, x22, %x[pad_row], GE\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x23, x23, %x[pad_row], GT\n"
- "cmp x25, x20\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x23, x20\n"
+ "mov x21, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z24.s }, p2/Z, [x26, #1, MUL VL]\n"
- "sub x25, x25, x20\n"
- "ld1w { z23.s }, p2/Z, [x26, #2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x22]\n"
- "cmp x25, x20\n"
- "ld1w { z22.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x26, #3, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x26, #4, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x26, #5, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x22, #3, MUL VL]\n"
- "zip1 z4.s, z17.s, z16.s\n"
- "zip2 z3.s, z17.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x22, #5, MUL VL]\n"
- "zip1 z2.s, z24.s, z22.s\n"
- "zip2 z1.s, z24.s, z22.s\n"
- "ld1w { z0.s }, p2/Z, [x23]\n"
- "ld1w { z31.s }, p2/Z, [x23, #1, MUL VL]\n"
- "zip1 z30.s, z23.s, z21.s\n"
- "zip2 z29.s, z23.s, z21.s\n"
- "ld1w { z28.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x21]\n"
- "zip1 z26.s, z20.s, z18.s\n"
- "zip2 z25.s, z20.s, z18.s\n"
- "ld1w { z24.s }, p2/Z, [x21, #1, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [x21, #2, MUL VL]\n"
- "zip1 z22.s, z19.s, z17.s\n"
- "zip2 z10.s, z19.s, z17.s\n"
- "ld1w { z21.s }, p2/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x23, #4, MUL VL]\n"
- "zip1 z19.s, z5.s, z16.s\n"
- "zip2 z9.s, z5.s, z16.s\n"
- "ld1w { z8.s }, p2/Z, [x23, #5, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x21, #3, MUL VL]\n"
- ".inst 0x658aa887 // bfcvt z7.h, p2/M, z4.s\n"
- "zip1 z6.s, z0.s, z27.s\n"
- "ld1w { z17.s }, p2/Z, [x21, #4, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x21, #5, MUL VL]\n"
- ".inst 0x658aa865 // bfcvt z5.h, p2/M, z3.s\n"
- "zip2 z4.s, z0.s, z27.s\n"
- ".inst 0x658aa843 // bfcvt z3.h, p2/M, z2.s\n"
- "zip1 z2.s, z31.s, z24.s\n"
+ "ld1w { z17.s }, p3/Z, [x26]\n"
+ "ld1w { z18.s }, p3/Z, [x26, #1, MUL VL]\n"
+ "sub x23, x23, x20\n"
+ "cmp x23, x20\n"
+ "ld1w { z19.s }, p3/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x24]\n"
+ "zip1 z21.s, z17.s, z16.s\n"
+ "zip2 z20.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x24, #2, MUL VL]\n"
+ "zip1 z29.s, z18.s, z17.s\n"
+ "zip2 z28.s, z18.s, z17.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x26, #4, MUL VL]\n"
+ "zip1 z27.s, z19.s, z16.s\n"
+ "zip2 z26.s, z19.s, z16.s\n"
+ "ld1w { z19.s }, p3/Z, [x26, #5, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x24, #3, MUL VL]\n"
+ "zip1 z25.s, z17.s, z16.s\n"
+ "zip2 z24.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x24, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x24, #5, MUL VL]\n"
+ "zip1 z12.s, z18.s, z17.s\n"
+ "zip2 z11.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x25]\n"
+ "ld1w { z23.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "zip1 z10.s, z19.s, z16.s\n"
+ "zip2 z9.s, z19.s, z16.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x22]\n"
+ ".inst 0x658aaea8 // bfcvt z8.h, p3/M, z21.s\n"
+ "zip1 z7.s, z18.s, z17.s\n"
+ "ld1w { z16.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x658aae86 // bfcvt z6.h, p3/M, z20.s\n"
+ "zip2 z5.s, z18.s, z17.s\n"
+ "ld1w { z20.s }, p3/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z19.s }, p3/Z, [x25, #4, MUL VL]\n"
+ ".inst 0x658aafa4 // bfcvt z4.h, p3/M, z29.s\n"
+ "zip1 z3.s, z23.s, z16.s\n"
+ "ld1w { z2.s }, p3/Z, [x25, #5, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x658aaf81 // bfcvt z1.h, p3/M, z28.s\n"
+ "zip2 z0.s, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x658aaf7f // bfcvt z31.h, p3/M, z27.s\n"
+ "zip1 z30.s, z22.s, z21.s\n"
+ ".inst 0x658aaf5d // bfcvt z29.h, p3/M, z26.s\n"
+ "zip2 z28.s, z22.s, z21.s\n"
"addvl x26, x26, #6\n"
- "addvl x23, x23, #6\n"
- ".inst 0x658aa821 // bfcvt z1.h, p2/M, z1.s\n"
- "zip2 z0.s, z31.s, z24.s\n"
+ "addvl x25, x25, #6\n"
+ ".inst 0x658aaf3b // bfcvt z27.h, p3/M, z25.s\n"
+ "zip1 z26.s, z20.s, z18.s\n"
+ "addvl x24, x24, #6\n"
"addvl x22, x22, #6\n"
- "addvl x21, x21, #6\n"
- ".inst 0x658aabdf // bfcvt z31.h, p2/M, z30.s\n"
- "zip1 z30.s, z28.s, z23.s\n"
- ".inst 0x658aabbd // bfcvt z29.h, p2/M, z29.s\n"
- "zip2 z28.s, z28.s, z23.s\n"
- ".inst 0x658aab5b // bfcvt z27.h, p2/M, z26.s\n"
- "zip1 z26.s, z21.s, z18.s\n"
- ".inst 0x658aab39 // bfcvt z25.h, p2/M, z25.s\n"
- "zip2 z24.s, z21.s, z18.s\n"
- ".inst 0x658aaad7 // bfcvt z23.h, p2/M, z22.s\n"
- "zip1 z22.s, z20.s, z17.s\n"
- ".inst 0x658aa955 // bfcvt z21.h, p2/M, z10.s\n"
- "zip2 z20.s, z20.s, z17.s\n"
- ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
- "zip1 z18.s, z8.s, z16.s\n"
- ".inst 0x658aa931 // bfcvt z17.h, p2/M, z9.s\n"
- "zip2 z16.s, z8.s, z16.s\n"
- ".inst 0x648aa8c7 // bfcvtnt z7.h, p2/M, z6.s\n"
- ".inst 0x648aa885 // bfcvtnt z5.h, p2/M, z4.s\n"
- ".inst 0x648aa843 // bfcvtnt z3.h, p2/M, z2.s\n"
- ".inst 0x648aa801 // bfcvtnt z1.h, p2/M, z0.s\n"
- ".inst 0x648aabdf // bfcvtnt z31.h, p2/M, z30.s\n"
- ".inst 0x648aab9d // bfcvtnt z29.h, p2/M, z28.s\n"
- "st1h { z7.h }, p2, [x24]\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x648aab5b // bfcvtnt z27.h, p2/M, z26.s\n"
- ".inst 0x648aab19 // bfcvtnt z25.h, p2/M, z24.s\n"
- "st1h { z3.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aaad7 // bfcvtnt z23.h, p2/M, z22.s\n"
- ".inst 0x648aaa95 // bfcvtnt z21.h, p2/M, z20.s\n"
- "st1h { z1.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x648aaa53 // bfcvtnt z19.h, p2/M, z18.s\n"
- ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
- "st1h { z31.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z29.h }, p2, [x24, #5, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
- "st1h { z27.h }, p2, [x24]\n"
- "st1h { z25.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z23.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z21.h }, p2, [x24, #3, MUL VL]\n"
- "st1h { z19.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z17.h }, p2, [x24, #5, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
+ ".inst 0x658aaf19 // bfcvt z25.h, p3/M, z24.s\n"
+ "zip2 z24.s, z20.s, z18.s\n"
+ ".inst 0x658aad97 // bfcvt z23.h, p3/M, z12.s\n"
+ "zip1 z22.s, z19.s, z17.s\n"
+ ".inst 0x658aad75 // bfcvt z21.h, p3/M, z11.s\n"
+ "zip2 z20.s, z19.s, z17.s\n"
+ ".inst 0x658aad53 // bfcvt z19.h, p3/M, z10.s\n"
+ "zip1 z18.s, z2.s, z16.s\n"
+ ".inst 0x658aad31 // bfcvt z17.h, p3/M, z9.s\n"
+ "zip2 z16.s, z2.s, z16.s\n"
+ ".inst 0x648aace8 // bfcvtnt z8.h, p3/M, z7.s\n"
+ ".inst 0x648aaca6 // bfcvtnt z6.h, p3/M, z5.s\n"
+ "st1h { z8.h }, p3, [x21]\n"
+ ".inst 0x648aac64 // bfcvtnt z4.h, p3/M, z3.s\n"
+ ".inst 0x648aac01 // bfcvtnt z1.h, p3/M, z0.s\n"
+ "st1h { z6.h }, p3, [x21, #1, MUL VL]\n"
+ ".inst 0x648aafdf // bfcvtnt z31.h, p3/M, z30.s\n"
+ ".inst 0x648aaf9d // bfcvtnt z29.h, p3/M, z28.s\n"
+ "st1h { z4.h }, p3, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p3, [x21, #3, MUL VL]\n"
+ ".inst 0x648aaf5b // bfcvtnt z27.h, p3/M, z26.s\n"
+ ".inst 0x648aaf19 // bfcvtnt z25.h, p3/M, z24.s\n"
+ "st1h { z31.h }, p3, [x21, #4, MUL VL]\n"
+ ".inst 0x648aaed7 // bfcvtnt z23.h, p3/M, z22.s\n"
+ ".inst 0x648aae95 // bfcvtnt z21.h, p3/M, z20.s\n"
+ "st1h { z29.h }, p3, [x21, #5, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ ".inst 0x648aae53 // bfcvtnt z19.h, p3/M, z18.s\n"
+ ".inst 0x648aae11 // bfcvtnt z17.h, p3/M, z16.s\n"
+ "st1h { z27.h }, p3, [x21]\n"
+ "st1h { z25.h }, p3, [x21, #1, MUL VL]\n"
+ "st1h { z23.h }, p3, [x21, #2, MUL VL]\n"
+ "st1h { z21.h }, p3, [x21, #3, MUL VL]\n"
+ "st1h { z19.h }, p3, [x21, #4, MUL VL]\n"
+ "st1h { z17.h }, p3, [x21, #5, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x25, 5f\n"
+ "cbz x23, 5f\n"
"4:" // Main row loop: Column loop
- "mov x20, x25\n"
- "decd x25, ALL, MUL #6\n"
- "whilelt p0.s, XZR, x20\n"
+ "mov x20, x23\n"
+ "whilelt p2.s, XZR, x20\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "ld1w { z19.s }, p2/Z, [x24]\n"
"decw x20\n"
"whilelt p1.s, XZR, x20\n"
+ "ld1w { z18.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x24, #1, MUL VL]\n"
"decw x20\n"
- "ld1w { z17.s }, p0/Z, [x26]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- "ld1w { z23.s }, p0/Z, [x23]\n"
- "ld1w { z19.s }, p0/Z, [x21]\n"
"whilelt p0.s, XZR, x20\n"
- "cmp x25, #0x0\n"
- "ld1w { z22.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z25.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x25]\n"
+ "ld1w { z30.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z23.s, z20.s, z19.s\n"
+ "zip2 z22.s, z20.s, z19.s\n"
+ "ld1w { z29.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x22]\n"
+ "zip1 z20.s, z18.s, z17.s\n"
+ "zip2 z19.s, z18.s, z17.s\n"
"ld1w { z18.s }, p1/Z, [x22, #1, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x23, #1, MUL VL]\n"
- "ld1w { z30.s }, p1/Z, [x21, #1, MUL VL]\n"
- "zip1 z21.s, z17.s, z16.s\n"
- "zip2 z17.s, z17.s, z16.s\n"
- "ld1w { z20.s }, p0/Z, [x26, #2, MUL VL]\n"
- "ld1w { z16.s }, p0/Z, [x22, #2, MUL VL]\n"
- "zip1 z29.s, z23.s, z19.s\n"
- "zip2 z28.s, z23.s, z19.s\n"
- "ld1w { z27.s }, p0/Z, [x23, #2, MUL VL]\n"
- "ld1w { z26.s }, p0/Z, [x21, #2, MUL VL]\n"
- "zip1 z19.s, z22.s, z18.s\n"
- "zip2 z18.s, z22.s, z18.s\n"
- ".inst 0x658aaab9 // bfcvt z25.h, p2/M, z21.s\n"
- ".inst 0x658aaa38 // bfcvt z24.h, p2/M, z17.s\n"
+ "ld1w { z28.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "zip1 z17.s, z25.s, z16.s\n"
+ "zip2 z16.s, z25.s, z16.s\n"
+ "decd x23, ALL, MUL #6\n"
+ ".inst 0x658aaefb // bfcvt z27.h, p3/M, z23.s\n"
+ "zip1 z26.s, z24.s, z21.s\n"
+ "cmp x23, #0x0\n"
+ ".inst 0x658aaed9 // bfcvt z25.h, p3/M, z22.s\n"
+ "zip2 z24.s, z24.s, z21.s\n"
"addvl x26, x26, #3\n"
- "addvl x23, x23, #3\n"
- "zip1 z17.s, z20.s, z16.s\n"
- "zip2 z16.s, z20.s, z16.s\n"
+ "addvl x25, x25, #3\n"
+ ".inst 0x658aae97 // bfcvt z23.h, p3/M, z20.s\n"
+ "zip1 z22.s, z30.s, z18.s\n"
+ "addvl x24, x24, #3\n"
"addvl x22, x22, #3\n"
- "addvl x21, x21, #3\n"
- ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
- "zip1 z22.s, z31.s, z30.s\n"
- ".inst 0x658aaa55 // bfcvt z21.h, p2/M, z18.s\n"
- "zip2 z20.s, z31.s, z30.s\n"
- ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
- "zip1 z18.s, z27.s, z26.s\n"
- ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
- "zip2 z16.s, z27.s, z26.s\n"
- ".inst 0x648aabb9 // bfcvtnt z25.h, p2/M, z29.s\n"
- ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
- ".inst 0x648aaad7 // bfcvtnt z23.h, p2/M, z22.s\n"
- ".inst 0x648aaa95 // bfcvtnt z21.h, p2/M, z20.s\n"
- ".inst 0x648aaa53 // bfcvtnt z19.h, p2/M, z18.s\n"
- ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
- "st1h { z25.h }, p2, [x24]\n"
- "st1h { z24.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z23.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z21.h }, p2, [x24, #3, MUL VL]\n"
- "st1h { z19.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z17.h }, p2, [x24, #5, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
+ ".inst 0x658aae75 // bfcvt z21.h, p3/M, z19.s\n"
+ "zip2 z20.s, z30.s, z18.s\n"
+ ".inst 0x658aae33 // bfcvt z19.h, p3/M, z17.s\n"
+ "zip1 z18.s, z29.s, z28.s\n"
+ ".inst 0x658aae11 // bfcvt z17.h, p3/M, z16.s\n"
+ "zip2 z16.s, z29.s, z28.s\n"
+ ".inst 0x648aaf5b // bfcvtnt z27.h, p3/M, z26.s\n"
+ ".inst 0x648aaf19 // bfcvtnt z25.h, p3/M, z24.s\n"
+ "st1h { z27.h }, p3, [x21]\n"
+ ".inst 0x648aaed7 // bfcvtnt z23.h, p3/M, z22.s\n"
+ ".inst 0x648aae95 // bfcvtnt z21.h, p3/M, z20.s\n"
+ "st1h { z25.h }, p3, [x21, #1, MUL VL]\n"
+ ".inst 0x648aae53 // bfcvtnt z19.h, p3/M, z18.s\n"
+ ".inst 0x648aae11 // bfcvtnt z17.h, p3/M, z16.s\n"
+ "st1h { z23.h }, p3, [x21, #2, MUL VL]\n"
+ "st1h { z21.h }, p3, [x21, #3, MUL VL]\n"
+ "st1h { z19.h }, p3, [x21, #4, MUL VL]\n"
+ "st1h { z17.h }, p3, [x21, #5, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -216,7 +216,7 @@ void sve_transpose_interleave_6VL_2x4_fp32bf16(bfloat16 *out, const float *in, s
"bge 1b\n"
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
index 314e3aebbd..ac61301ea4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
@@ -40,160 +40,160 @@ void sve_transpose_interleave_6VL_4x2(uint32_t *out, const uint32_t *in, size_t
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "ptrue p2.b\n"
+ "ptrue p3.b\n"
"blt 6f\n"
"1:" // Main row loop: Head
"mov x28, %x[in]\n"
"mov x27, %x[width]\n"
"cntw x26, ALL, MUL #6\n"
- "mov x25, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x24, x28, %x[in_stride]\n"
+ "add x25, x28, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
"cmp x27, x26\n"
- "add %x[in], x22, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1w { z18.s }, p2/Z, [x28]\n"
- "ld1w { z28.s }, p2/Z, [x28, #1, MUL VL]\n"
- "mov x21, x25\n"
- "add x25, x25, %x[out_stride]\n"
- "ld1w { z22.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z17.s }, p2/Z, [x24]\n"
- "mov x20, x25\n"
+ "ld1w { z18.s }, p3/Z, [x28]\n"
+ "ld1w { z17.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z19.s }, p3/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25]\n"
+ "zip1 z9.s, z18.s, z16.s\n"
+ "zip2 z8.s, z18.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x25, #2, MUL VL]\n"
+ "zip1 z7.s, z17.s, z16.s\n"
+ "zip2 z6.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x24]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "zip1 z5.s, z19.s, z18.s\n"
+ "zip2 z4.s, z19.s, z18.s\n"
+ "ld1w { z18.s }, p3/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x28, #4, MUL VL]\n"
+ "zip1 z3.s, z17.s, z16.s\n"
+ "zip2 z2.s, z17.s, z16.s\n"
+ "ld1w { z20.s }, p3/Z, [x28, #5, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x25, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "zip1 z1.s, z18.s, z17.s\n"
+ "ld1w { z19.s }, p3/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25, #5, MUL VL]\n"
+ "zip2 z0.s, z18.s, z17.s\n"
+ "zip1 z31.s, z21.s, z19.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x24, #2, MUL VL]\n"
+ "zip2 z30.s, z21.s, z19.s\n"
+ "zip1 z29.s, z20.s, z16.s\n"
+ "ld1w { z19.s }, p3/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x24, #4, MUL VL]\n"
+ "zip2 z27.s, z20.s, z16.s\n"
"sub x27, x27, x26\n"
- "ld1w { z16.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z26.s }, p3/Z, [x24, #5, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip1 z25.s, z18.s, z16.s\n"
+ "zip2 z24.s, z18.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x23, #3, MUL VL]\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "zip2 z22.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x23, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23, #5, MUL VL]\n"
+ "st1w { z9.s }, p3, [x21]\n"
+ "zip1 z21.s, z19.s, z18.s\n"
+ "st1w { z8.s }, p3, [x21, #1, MUL VL]\n"
+ "zip2 z20.s, z19.s, z18.s\n"
"cmp x27, x26\n"
- "add x25, x25, %x[out_stride]\n"
- "ld1w { z27.s }, p2/Z, [x23]\n"
- "ld1w { z26.s }, p2/Z, [x22]\n"
- "ld1w { z20.s }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x28, #4, MUL VL]\n"
- "zip1 z8.s, z18.s, z17.s\n"
- "zip2 z25.s, z18.s, z17.s\n"
- "ld1w { z24.s }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #3, MUL VL]\n"
- "zip1 z23.s, z28.s, z16.s\n"
- "zip2 z7.s, z28.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x24, #4, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x24, #5, MUL VL]\n"
- "zip1 z6.s, z22.s, z21.s\n"
- "zip2 z5.s, z22.s, z21.s\n"
- "ld1w { z22.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [x23, #2, MUL VL]\n"
- "zip1 z4.s, z27.s, z26.s\n"
- "zip2 z3.s, z27.s, z26.s\n"
- "ld1w { z2.s }, p2/Z, [x23, #3, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x23, #4, MUL VL]\n"
- "zip1 z0.s, z20.s, z18.s\n"
- "zip2 z31.s, z20.s, z18.s\n"
- "ld1w { z30.s }, p2/Z, [x23, #5, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x22, #1, MUL VL]\n"
- "zip1 z29.s, z19.s, z17.s\n"
- "zip2 z28.s, z19.s, z17.s\n"
- "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x22, #3, MUL VL]\n"
- "zip1 z27.s, z24.s, z16.s\n"
- "zip2 z26.s, z24.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x22, #5, MUL VL]\n"
- "st1w { z8.s }, p2, [x21]\n"
"addvl x28, x28, #6\n"
- "st1w { z25.s }, p2, [x21, #1, MUL VL]\n"
- "zip1 z25.s, z22.s, z20.s\n"
- "zip2 z24.s, z22.s, z20.s\n"
+ "st1w { z7.s }, p3, [x21, #2, MUL VL]\n"
+ "addvl x25, x25, #6\n"
"addvl x24, x24, #6\n"
- "st1w { z23.s }, p2, [x21, #2, MUL VL]\n"
- "zip1 z23.s, z21.s, z19.s\n"
- "zip2 z22.s, z21.s, z19.s\n"
+ "zip1 z19.s, z28.s, z17.s\n"
+ "st1w { z6.s }, p3, [x21, #3, MUL VL]\n"
"addvl x23, x23, #6\n"
- "st1w { z7.s }, p2, [x21, #3, MUL VL]\n"
- "zip1 z21.s, z2.s, z18.s\n"
- "zip2 z20.s, z2.s, z18.s\n"
- "addvl x22, x22, #6\n"
- "st1w { z6.s }, p2, [x21, #4, MUL VL]\n"
- "zip1 z19.s, z1.s, z17.s\n"
- "zip2 z18.s, z1.s, z17.s\n"
- "st1w { z5.s }, p2, [x21, #5, MUL VL]\n"
- "zip1 z17.s, z30.s, z16.s\n"
- "zip2 z16.s, z30.s, z16.s\n"
- "st1w { z4.s }, p2, [x21, #6, MUL VL]\n"
- "st1w { z3.s }, p2, [x21, #7, MUL VL]\n"
+ "zip2 z18.s, z28.s, z17.s\n"
+ "zip1 z17.s, z26.s, z16.s\n"
+ "st1w { z5.s }, p3, [x21, #4, MUL VL]\n"
+ "zip2 z16.s, z26.s, z16.s\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z4.s }, p3, [x21, #5, MUL VL]\n"
+ "st1w { z3.s }, p3, [x21, #6, MUL VL]\n"
+ "st1w { z2.s }, p3, [x21, #7, MUL VL]\n"
"addvl x21, x21, #12\n"
- "st1w { z25.s }, p2, [x21, #-4, MUL VL]\n"
- "st1w { z24.s }, p2, [x21, #-3, MUL VL]\n"
- "st1w { z23.s }, p2, [x21, #-2, MUL VL]\n"
- "st1w { z22.s }, p2, [x21, #-1, MUL VL]\n"
- "st1w { z0.s }, p2, [x20]\n"
- "st1w { z31.s }, p2, [x20, #1, MUL VL]\n"
- "st1w { z29.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z28.s }, p2, [x20, #3, MUL VL]\n"
- "st1w { z27.s }, p2, [x20, #4, MUL VL]\n"
- "st1w { z26.s }, p2, [x20, #5, MUL VL]\n"
- "st1w { z21.s }, p2, [x20, #6, MUL VL]\n"
- "st1w { z20.s }, p2, [x20, #7, MUL VL]\n"
+ "st1w { z25.s }, p3, [x21, #-4, MUL VL]\n"
+ "st1w { z24.s }, p3, [x21, #-3, MUL VL]\n"
+ "st1w { z23.s }, p3, [x21, #-2, MUL VL]\n"
+ "st1w { z22.s }, p3, [x21, #-1, MUL VL]\n"
+ "st1w { z1.s }, p3, [x20]\n"
+ "st1w { z0.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z31.s }, p3, [x20, #2, MUL VL]\n"
+ "st1w { z30.s }, p3, [x20, #3, MUL VL]\n"
+ "st1w { z29.s }, p3, [x20, #4, MUL VL]\n"
+ "st1w { z27.s }, p3, [x20, #5, MUL VL]\n"
+ "st1w { z21.s }, p3, [x20, #6, MUL VL]\n"
+ "st1w { z20.s }, p3, [x20, #7, MUL VL]\n"
"addvl x20, x20, #12\n"
- "st1w { z19.s }, p2, [x20, #-4, MUL VL]\n"
- "st1w { z18.s }, p2, [x20, #-3, MUL VL]\n"
- "st1w { z17.s }, p2, [x20, #-2, MUL VL]\n"
- "st1w { z16.s }, p2, [x20, #-1, MUL VL]\n"
+ "st1w { z19.s }, p3, [x20, #-4, MUL VL]\n"
+ "st1w { z18.s }, p3, [x20, #-3, MUL VL]\n"
+ "st1w { z17.s }, p3, [x20, #-2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x20, #-1, MUL VL]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
"cbz x27, 5f\n"
"4:" // Main row loop: Column loop
- "mov x21, x27\n"
- "mov x20, x25\n"
+ "mov x20, x27\n"
+ "whilelt p2.s, XZR, x20\n"
+ "ld1w { z19.s }, p2/Z, [x28]\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
+ "decw x20\n"
+ "whilelt p1.s, XZR, x20\n"
+ "ld1w { z17.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z22.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z21.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x24]\n"
+ "ld1w { z27.s }, p2/Z, [x23]\n"
+ "mov x20, x22\n"
"decd x27, ALL, MUL #6\n"
- "add x25, x25, %x[out_stride]\n"
- "whilelt p0.s, XZR, x21\n"
- "decw x21\n"
- "whilelt p1.s, XZR, x21\n"
- "decw x21\n"
- "ld1w { z19.s }, p0/Z, [x28]\n"
- "ld1w { z18.s }, p0/Z, [x24]\n"
- "ld1w { z22.s }, p0/Z, [x23]\n"
- "ld1w { z17.s }, p0/Z, [x22]\n"
- "whilelt p0.s, XZR, x21\n"
- "cmp x27, #0x0\n"
- "ld1w { z21.s }, p1/Z, [x28, #1, MUL VL]\n"
- "ld1w { z16.s }, p1/Z, [x24, #1, MUL VL]\n"
- "ld1w { z28.s }, p1/Z, [x23, #1, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z25.s }, p0/Z, [x24, #2, MUL VL]\n"
"zip1 z20.s, z19.s, z18.s\n"
- "zip2 z26.s, z19.s, z18.s\n"
- "ld1w { z19.s }, p0/Z, [x28, #2, MUL VL]\n"
- "ld1w { z18.s }, p0/Z, [x24, #2, MUL VL]\n"
- "zip1 z25.s, z22.s, z17.s\n"
- "zip2 z24.s, z22.s, z17.s\n"
+ "zip2 z19.s, z19.s, z18.s\n"
+ "ld1w { z24.s }, p1/Z, [x23, #1, MUL VL]\n"
"ld1w { z23.s }, p0/Z, [x23, #2, MUL VL]\n"
- "ld1w { z22.s }, p0/Z, [x22, #2, MUL VL]\n"
- "zip1 z17.s, z21.s, z16.s\n"
- "zip2 z16.s, z21.s, z16.s\n"
- "st1w { z20.s }, p2, [x20]\n"
+ "zip1 z18.s, z17.s, z16.s\n"
+ "zip2 z17.s, z17.s, z16.s\n"
+ "zip1 z16.s, z22.s, z21.s\n"
+ "zip2 z22.s, z22.s, z21.s\n"
+ "st1w { z20.s }, p3, [x20]\n"
+ "cmp x27, #0x0\n"
+ "zip1 z21.s, z28.s, z27.s\n"
+ "zip2 z20.s, z28.s, z27.s\n"
+ "st1w { z19.s }, p3, [x20, #1, MUL VL]\n"
"addvl x28, x28, #3\n"
+ "st1w { z18.s }, p3, [x20, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
"addvl x24, x24, #3\n"
- "zip1 z21.s, z28.s, z27.s\n"
- "zip1 z20.s, z19.s, z18.s\n"
- "zip2 z19.s, z19.s, z18.s\n"
- "st1w { z26.s }, p2, [x20, #1, MUL VL]\n"
+ "zip1 z19.s, z26.s, z24.s\n"
+ "st1w { z17.s }, p3, [x20, #3, MUL VL]\n"
"addvl x23, x23, #3\n"
- "st1w { z17.s }, p2, [x20, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
- "zip2 z18.s, z28.s, z27.s\n"
- "zip1 z17.s, z23.s, z22.s\n"
- "st1w { z16.s }, p2, [x20, #3, MUL VL]\n"
- "zip2 z16.s, z23.s, z22.s\n"
- "st1w { z20.s }, p2, [x20, #4, MUL VL]\n"
- "st1w { z19.s }, p2, [x20, #5, MUL VL]\n"
- "st1w { z25.s }, p2, [x20, #6, MUL VL]\n"
- "st1w { z24.s }, p2, [x20, #7, MUL VL]\n"
+ "zip2 z18.s, z26.s, z24.s\n"
+ "zip1 z17.s, z25.s, z23.s\n"
+ "st1w { z16.s }, p3, [x20, #4, MUL VL]\n"
+ "zip2 z16.s, z25.s, z23.s\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z22.s }, p3, [x20, #5, MUL VL]\n"
+ "st1w { z21.s }, p3, [x20, #6, MUL VL]\n"
+ "st1w { z20.s }, p3, [x20, #7, MUL VL]\n"
"addvl x20, x20, #12\n"
- "st1w { z21.s }, p2, [x20, #-4, MUL VL]\n"
- "st1w { z18.s }, p2, [x20, #-3, MUL VL]\n"
- "st1w { z17.s }, p2, [x20, #-2, MUL VL]\n"
- "st1w { z16.s }, p2, [x20, #-1, MUL VL]\n"
+ "st1w { z19.s }, p3, [x20, #-4, MUL VL]\n"
+ "st1w { z18.s }, p3, [x20, #-3, MUL VL]\n"
+ "st1w { z17.s }, p3, [x20, #-2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x20, #-1, MUL VL]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x4\n"
@@ -205,90 +205,90 @@ void sve_transpose_interleave_6VL_4x2(uint32_t *out, const uint32_t *in, size_t
"mov x28, %x[in]\n"
"mov x21, %x[width]\n"
"cntw x20, ALL, MUL #6\n"
+ "add x25, x28, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "mov x25, %x[out]\n"
- "sub %x[height], %x[height], #0x2\n"
- "add x24, x28, %x[in_stride]\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
"cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1w { z17.s }, p2/Z, [x28]\n"
- "ld1w { z23.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x28]\n"
+ "ld1w { z19.s }, p3/Z, [x28, #1, MUL VL]\n"
"sub x21, x21, x20\n"
- "ld1w { z29.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x24]\n"
"cmp x21, x20\n"
- "ld1w { z22.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x24, #3, MUL VL]\n"
- "zip1 z19.s, z17.s, z16.s\n"
- "zip2 z18.s, z17.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x24, #4, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x24, #5, MUL VL]\n"
- "zip1 z16.s, z23.s, z22.s\n"
- "zip2 z24.s, z23.s, z22.s\n"
- "zip1 z23.s, z29.s, z21.s\n"
- "zip2 z22.s, z29.s, z21.s\n"
+ "ld1w { z18.s }, p3/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25]\n"
+ "zip1 z28.s, z17.s, z16.s\n"
+ "zip2 z20.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25, #2, MUL VL]\n"
+ "zip1 z27.s, z19.s, z17.s\n"
+ "zip2 z26.s, z19.s, z17.s\n"
+ "ld1w { z19.s }, p3/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z25.s }, p3/Z, [x28, #4, MUL VL]\n"
+ "zip1 z24.s, z18.s, z16.s\n"
+ "zip2 z23.s, z18.s, z16.s\n"
+ "ld1w { z22.s }, p3/Z, [x28, #5, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x25, #3, MUL VL]\n"
"addvl x28, x28, #6\n"
- "addvl x24, x24, #6\n"
- "st1w { z19.s }, p2, [x25]\n"
- "zip1 z21.s, z28.s, z20.s\n"
- "zip2 z20.s, z28.s, z20.s\n"
- "st1w { z18.s }, p2, [x25, #1, MUL VL]\n"
- "zip1 z19.s, z27.s, z17.s\n"
- "zip2 z18.s, z27.s, z17.s\n"
- "st1w { z16.s }, p2, [x25, #2, MUL VL]\n"
- "zip1 z17.s, z26.s, z25.s\n"
- "zip2 z16.s, z26.s, z25.s\n"
- "st1w { z24.s }, p2, [x25, #3, MUL VL]\n"
- "st1w { z23.s }, p2, [x25, #4, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #5, MUL VL]\n"
- "add x25, x25, %x[out_stride]\n"
- "st1w { z21.s }, p2, [x25]\n"
- "st1w { z20.s }, p2, [x25, #1, MUL VL]\n"
- "st1w { z19.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #3, MUL VL]\n"
- "st1w { z17.s }, p2, [x25, #4, MUL VL]\n"
- "st1w { z16.s }, p2, [x25, #5, MUL VL]\n"
- "add x25, x25, %x[out_stride]\n"
+ "zip1 z21.s, z19.s, z18.s\n"
+ "ld1w { z17.s }, p3/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25, #5, MUL VL]\n"
+ "st1w { z28.s }, p3, [x22]\n"
+ "addvl x25, x25, #6\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "zip2 z20.s, z19.s, z18.s\n"
+ "zip1 z19.s, z25.s, z17.s\n"
+ "st1w { z27.s }, p3, [x22, #2, MUL VL]\n"
+ "zip2 z18.s, z25.s, z17.s\n"
+ "zip1 z17.s, z22.s, z16.s\n"
+ "st1w { z26.s }, p3, [x22, #3, MUL VL]\n"
+ "zip2 z16.s, z22.s, z16.s\n"
+ "st1w { z24.s }, p3, [x22, #4, MUL VL]\n"
+ "st1w { z23.s }, p3, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z21.s }, p3, [x22]\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z19.s }, p3, [x22, #2, MUL VL]\n"
+ "st1w { z18.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z17.s }, p3, [x22, #4, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"mov x20, x21\n"
- "decd x21, ALL, MUL #6\n"
"whilelt p0.s, XZR, x20\n"
+ "ld1w { z20.s }, p0/Z, [x28]\n"
+ "ld1w { z19.s }, p0/Z, [x25]\n"
"decw x20\n"
- "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z18.s }, p0/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z17.s }, p0/Z, [x25, #1, MUL VL]\n"
"decw x20\n"
- "ld1w { z17.s }, p0/Z, [x28]\n"
- "ld1w { z16.s }, p0/Z, [x24]\n"
"whilelt p0.s, XZR, x20\n"
+ "ld1w { z22.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "decd x21, ALL, MUL #6\n"
"cmp x21, #0x0\n"
- "ld1w { z22.s }, p1/Z, [x28, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x24, #1, MUL VL]\n"
- "zip1 z21.s, z17.s, z16.s\n"
- "zip2 z17.s, z17.s, z16.s\n"
- "ld1w { z20.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "zip1 z21.s, z20.s, z19.s\n"
+ "zip2 z20.s, z20.s, z19.s\n"
"addvl x28, x28, #3\n"
- "ld1w { z16.s }, p0/Z, [x24, #2, MUL VL]\n"
- "addvl x24, x24, #3\n"
- "zip1 z19.s, z22.s, z18.s\n"
- "zip2 z18.s, z22.s, z18.s\n"
- "st1w { z21.s }, p2, [x25]\n"
- "st1w { z17.s }, p2, [x25, #1, MUL VL]\n"
- "zip1 z17.s, z20.s, z16.s\n"
- "zip2 z16.s, z20.s, z16.s\n"
- "st1w { z19.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z18.s }, p2, [x25, #3, MUL VL]\n"
- "st1w { z17.s }, p2, [x25, #4, MUL VL]\n"
- "st1w { z16.s }, p2, [x25, #5, MUL VL]\n"
- "add x25, x25, %x[out_stride]\n"
+ "addvl x25, x25, #3\n"
+ "zip1 z19.s, z18.s, z17.s\n"
+ "zip2 z18.s, z18.s, z17.s\n"
+ "zip1 z17.s, z22.s, z16.s\n"
+ "zip2 z16.s, z22.s, z16.s\n"
+ "st1w { z21.s }, p3, [x22]\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z19.s }, p3, [x22, #2, MUL VL]\n"
+ "st1w { z18.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z17.s }, p3, [x22, #4, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -297,7 +297,7 @@ void sve_transpose_interleave_6VL_4x2(uint32_t *out, const uint32_t *in, size_t
"12:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
index ae228d3916..87d5372b57 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
@@ -34,152 +34,152 @@ void sve_transpose_interleave_8VL(uint32_t *out, const uint32_t *in, size_t widt
__asm__ __volatile__(
"cmp %x[height], #0x2\n"
- "ptrue p2.b\n"
+ "ptrue p1.b\n"
"blt 6f\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
"mov x25, %x[width]\n"
"cntw x24, ALL, MUL #16\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x2\n"
- "add x22, x26, %x[in_stride]\n"
- "add %x[in], x22, %x[in_stride]\n"
+ "add x23, x26, %x[in_stride]\n"
"cmp x25, x24\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1w { z15.s }, p2/Z, [x26]\n"
- "ld1w { z14.s }, p2/Z, [x26, #1, MUL VL]\n"
- "mov x21, x23\n"
- "add x23, x23, %x[out_stride]\n"
- "ld1w { z13.s }, p2/Z, [x26, #2, MUL VL]\n"
- "ld1w { z12.s }, p2/Z, [x26, #3, MUL VL]\n"
- "mov x20, x23\n"
+ "ld1w { z15.s }, p1/Z, [x26]\n"
+ "ld1w { z14.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z13.s }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z12.s }, p1/Z, [x26, #3, MUL VL]\n"
+ "mov x20, x22\n"
"sub x25, x25, x24\n"
- "ld1w { z11.s }, p2/Z, [x26, #4, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x26, #5, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x26, #5, MUL VL]\n"
"cmp x25, x24\n"
- "add x23, x23, %x[out_stride]\n"
- "ld1w { z9.s }, p2/Z, [x26, #6, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x26, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z9.s }, p1/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z8.s }, p1/Z, [x26, #7, MUL VL]\n"
"addvl x26, x26, #16\n"
- "ld1w { z7.s }, p2/Z, [x22]\n"
- "ld1w { z6.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z4.s }, p2/Z, [x22, #3, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x22, #5, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x22, #6, MUL VL]\n"
- "ld1w { z0.s }, p2/Z, [x22, #7, MUL VL]\n"
- "addvl x22, x22, #16\n"
- "ld1w { z31.s }, p2/Z, [x26, #-8, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x26, #-7, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x26, #-6, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x26, #-5, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x26, #-4, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x26, #-3, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x26, #-2, MUL VL]\n"
- "ld1w { z24.s }, p2/Z, [x26, #-1, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [x22, #-8, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #-7, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [x22, #-6, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x22, #-5, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x22, #-4, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x22, #-3, MUL VL]\n"
- "ld1w { z17.s }, p2/Z, [x22, #-2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x22, #-1, MUL VL]\n"
- "st1w { z15.s }, p2, [x21]\n"
- "st1w { z14.s }, p2, [x21, #1, MUL VL]\n"
- "st1w { z13.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z12.s }, p2, [x21, #3, MUL VL]\n"
- "st1w { z11.s }, p2, [x21, #4, MUL VL]\n"
- "st1w { z10.s }, p2, [x21, #5, MUL VL]\n"
- "st1w { z9.s }, p2, [x21, #6, MUL VL]\n"
- "st1w { z8.s }, p2, [x21, #7, MUL VL]\n"
+ "ld1w { z7.s }, p1/Z, [x23]\n"
+ "ld1w { z6.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z5.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z4.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z3.s }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1w { z2.s }, p1/Z, [x23, #5, MUL VL]\n"
+ "ld1w { z1.s }, p1/Z, [x23, #6, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x23, #7, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "ld1w { z31.s }, p1/Z, [x26, #-8, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x26, #-7, MUL VL]\n"
+ "ld1w { z29.s }, p1/Z, [x26, #-6, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x26, #-5, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x26, #-4, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x26, #-3, MUL VL]\n"
+ "ld1w { z25.s }, p1/Z, [x26, #-2, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x26, #-1, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x23, #-8, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x23, #-7, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [x23, #-6, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x23, #-5, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #-4, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #-3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x23, #-2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x23, #-1, MUL VL]\n"
+ "st1w { z15.s }, p1, [x21]\n"
+ "st1w { z14.s }, p1, [x21, #1, MUL VL]\n"
+ "st1w { z13.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z12.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z11.s }, p1, [x21, #4, MUL VL]\n"
+ "st1w { z10.s }, p1, [x21, #5, MUL VL]\n"
+ "st1w { z9.s }, p1, [x21, #6, MUL VL]\n"
+ "st1w { z8.s }, p1, [x21, #7, MUL VL]\n"
"addvl x21, x21, #16\n"
- "st1w { z7.s }, p2, [x21, #-8, MUL VL]\n"
- "st1w { z6.s }, p2, [x21, #-7, MUL VL]\n"
- "st1w { z5.s }, p2, [x21, #-6, MUL VL]\n"
- "st1w { z4.s }, p2, [x21, #-5, MUL VL]\n"
- "st1w { z3.s }, p2, [x21, #-4, MUL VL]\n"
- "st1w { z2.s }, p2, [x21, #-3, MUL VL]\n"
- "st1w { z1.s }, p2, [x21, #-2, MUL VL]\n"
- "st1w { z0.s }, p2, [x21, #-1, MUL VL]\n"
- "st1w { z31.s }, p2, [x20]\n"
- "st1w { z30.s }, p2, [x20, #1, MUL VL]\n"
- "st1w { z29.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z28.s }, p2, [x20, #3, MUL VL]\n"
- "st1w { z27.s }, p2, [x20, #4, MUL VL]\n"
- "st1w { z26.s }, p2, [x20, #5, MUL VL]\n"
- "st1w { z25.s }, p2, [x20, #6, MUL VL]\n"
- "st1w { z24.s }, p2, [x20, #7, MUL VL]\n"
+ "st1w { z7.s }, p1, [x21, #-8, MUL VL]\n"
+ "st1w { z6.s }, p1, [x21, #-7, MUL VL]\n"
+ "st1w { z5.s }, p1, [x21, #-6, MUL VL]\n"
+ "st1w { z4.s }, p1, [x21, #-5, MUL VL]\n"
+ "st1w { z3.s }, p1, [x21, #-4, MUL VL]\n"
+ "st1w { z2.s }, p1, [x21, #-3, MUL VL]\n"
+ "st1w { z1.s }, p1, [x21, #-2, MUL VL]\n"
+ "st1w { z0.s }, p1, [x21, #-1, MUL VL]\n"
+ "st1w { z31.s }, p1, [x20]\n"
+ "st1w { z30.s }, p1, [x20, #1, MUL VL]\n"
+ "st1w { z29.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z28.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #4, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #5, MUL VL]\n"
+ "st1w { z25.s }, p1, [x20, #6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
- "st1w { z23.s }, p2, [x20, #-8, MUL VL]\n"
- "st1w { z22.s }, p2, [x20, #-7, MUL VL]\n"
- "st1w { z21.s }, p2, [x20, #-6, MUL VL]\n"
- "st1w { z20.s }, p2, [x20, #-5, MUL VL]\n"
- "st1w { z19.s }, p2, [x20, #-4, MUL VL]\n"
- "st1w { z18.s }, p2, [x20, #-3, MUL VL]\n"
- "st1w { z17.s }, p2, [x20, #-2, MUL VL]\n"
- "st1w { z16.s }, p2, [x20, #-1, MUL VL]\n"
+ "st1w { z23.s }, p1, [x20, #-8, MUL VL]\n"
+ "st1w { z22.s }, p1, [x20, #-7, MUL VL]\n"
+ "st1w { z21.s }, p1, [x20, #-6, MUL VL]\n"
+ "st1w { z20.s }, p1, [x20, #-5, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #-4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x20, #-3, MUL VL]\n"
+ "st1w { z17.s }, p1, [x20, #-2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x20, #-1, MUL VL]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
"cbz x25, 5f\n"
"4:" // Main row loop: Column loop
- "mov x21, x25\n"
- "mov x20, x23\n"
- "decw x25, ALL, MUL #8\n"
- "add x23, x23, %x[out_stride]\n"
- "whilelt p1.s, XZR, x21\n"
- "decw x21\n"
- "whilelt p0.s, XZR, x21\n"
- "decw x21\n"
- "ld1w { z31.s }, p1/Z, [x26]\n"
- "ld1w { z30.s }, p1/Z, [x22]\n"
- "whilelt p1.s, XZR, x21\n"
- "decw x21\n"
+ "mov x20, x25\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z31.s }, p0/Z, [x26]\n"
+ "ld1w { z30.s }, p0/Z, [x23]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
"ld1w { z29.s }, p0/Z, [x26, #1, MUL VL]\n"
- "ld1w { z28.s }, p0/Z, [x22, #1, MUL VL]\n"
- "whilelt p0.s, XZR, x21\n"
- "decw x21\n"
- "ld1w { z27.s }, p1/Z, [x26, #2, MUL VL]\n"
- "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n"
- "whilelt p1.s, XZR, x21\n"
- "decw x21\n"
+ "ld1w { z28.s }, p0/Z, [x23, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z27.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z26.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
"ld1w { z25.s }, p0/Z, [x26, #3, MUL VL]\n"
- "ld1w { z24.s }, p0/Z, [x22, #3, MUL VL]\n"
- "whilelt p0.s, XZR, x21\n"
- "decw x21\n"
- "ld1w { z23.s }, p1/Z, [x26, #4, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x22, #4, MUL VL]\n"
- "whilelt p1.s, XZR, x21\n"
- "decw x21\n"
+ "ld1w { z24.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z23.s }, p0/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [x23, #4, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
"ld1w { z21.s }, p0/Z, [x26, #5, MUL VL]\n"
- "ld1w { z20.s }, p0/Z, [x22, #5, MUL VL]\n"
- "whilelt p0.s, XZR, x21\n"
- "cmp x25, #0x0\n"
- "ld1w { z19.s }, p1/Z, [x26, #6, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x22, #6, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [x23, #5, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z19.s }, p0/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z18.s }, p0/Z, [x23, #6, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
"ld1w { z17.s }, p0/Z, [x26, #7, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x23, #7, MUL VL]\n"
+ "mov x20, x22\n"
+ "decw x25, ALL, MUL #8\n"
+ "st1w { z31.s }, p1, [x20]\n"
+ "st1w { z29.s }, p1, [x20, #1, MUL VL]\n"
+ "cmp x25, #0x0\n"
"addvl x26, x26, #8\n"
- "ld1w { z16.s }, p0/Z, [x22, #7, MUL VL]\n"
- "st1w { z31.s }, p2, [x20]\n"
- "st1w { z29.s }, p2, [x20, #1, MUL VL]\n"
- "addvl x22, x22, #8\n"
- "st1w { z27.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z25.s }, p2, [x20, #3, MUL VL]\n"
- "st1w { z23.s }, p2, [x20, #4, MUL VL]\n"
- "st1w { z21.s }, p2, [x20, #5, MUL VL]\n"
- "st1w { z19.s }, p2, [x20, #6, MUL VL]\n"
- "st1w { z17.s }, p2, [x20, #7, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #2, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z25.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z23.s }, p1, [x20, #4, MUL VL]\n"
+ "st1w { z21.s }, p1, [x20, #5, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #6, MUL VL]\n"
+ "st1w { z17.s }, p1, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
- "st1w { z30.s }, p2, [x20, #-8, MUL VL]\n"
- "st1w { z28.s }, p2, [x20, #-7, MUL VL]\n"
- "st1w { z26.s }, p2, [x20, #-6, MUL VL]\n"
- "st1w { z24.s }, p2, [x20, #-5, MUL VL]\n"
- "st1w { z22.s }, p2, [x20, #-4, MUL VL]\n"
- "st1w { z20.s }, p2, [x20, #-3, MUL VL]\n"
- "st1w { z18.s }, p2, [x20, #-2, MUL VL]\n"
- "st1w { z16.s }, p2, [x20, #-1, MUL VL]\n"
+ "st1w { z30.s }, p1, [x20, #-8, MUL VL]\n"
+ "st1w { z28.s }, p1, [x20, #-7, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #-6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x20, #-5, MUL VL]\n"
+ "st1w { z22.s }, p1, [x20, #-4, MUL VL]\n"
+ "st1w { z20.s }, p1, [x20, #-3, MUL VL]\n"
+ "st1w { z18.s }, p1, [x20, #-2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x20, #-1, MUL VL]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x2\n"
@@ -191,89 +191,89 @@ void sve_transpose_interleave_8VL(uint32_t *out, const uint32_t *in, size_t widt
"mov x21, %x[width]\n"
"cntw x20, ALL, MUL #16\n"
"mov x26, %x[in]\n"
- "mov x23, %x[out]\n"
- "sub %x[height], %x[height], #0x1\n"
"cmp x21, x20\n"
"add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1w { z31.s }, p2/Z, [x26]\n"
- "ld1w { z30.s }, p2/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x26]\n"
+ "ld1w { z30.s }, p1/Z, [x26, #1, MUL VL]\n"
"sub x21, x21, x20\n"
- "ld1w { z29.s }, p2/Z, [x26, #2, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x26, #3, MUL VL]\n"
"cmp x21, x20\n"
- "ld1w { z27.s }, p2/Z, [x26, #4, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x26, #5, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x26, #6, MUL VL]\n"
- "ld1w { z24.s }, p2/Z, [x26, #7, MUL VL]\n"
+ "ld1w { z29.s }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x26, #5, MUL VL]\n"
+ "ld1w { z25.s }, p1/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x26, #7, MUL VL]\n"
"addvl x26, x26, #16\n"
- "ld1w { z23.s }, p2/Z, [x26, #-8, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x26, #-7, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [x26, #-6, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x26, #-5, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x26, #-4, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x26, #-3, MUL VL]\n"
- "ld1w { z17.s }, p2/Z, [x26, #-2, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x26, #-1, MUL VL]\n"
- "st1w { z31.s }, p2, [x23]\n"
- "st1w { z30.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z29.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z28.s }, p2, [x23, #3, MUL VL]\n"
- "st1w { z27.s }, p2, [x23, #4, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #5, MUL VL]\n"
- "st1w { z25.s }, p2, [x23, #6, MUL VL]\n"
- "st1w { z24.s }, p2, [x23, #7, MUL VL]\n"
- "add x23, x23, %x[out_stride]\n"
- "st1w { z23.s }, p2, [x23]\n"
- "st1w { z22.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z20.s }, p2, [x23, #3, MUL VL]\n"
- "st1w { z19.s }, p2, [x23, #4, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #5, MUL VL]\n"
- "st1w { z17.s }, p2, [x23, #6, MUL VL]\n"
- "st1w { z16.s }, p2, [x23, #7, MUL VL]\n"
- "add x23, x23, %x[out_stride]\n"
+ "ld1w { z23.s }, p1/Z, [x26, #-8, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x26, #-7, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [x26, #-6, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x26, #-5, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x26, #-4, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x26, #-3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x26, #-2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x26, #-1, MUL VL]\n"
+ "st1w { z31.s }, p1, [x22]\n"
+ "st1w { z30.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z29.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z28.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #4, MUL VL]\n"
+ "st1w { z26.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z25.s }, p1, [x22, #6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z23.s }, p1, [x22]\n"
+ "st1w { z22.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z21.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z17.s }, p1, [x22, #6, MUL VL]\n"
+ "st1w { z16.s }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"mov x20, x21\n"
- "decw x21, ALL, MUL #8\n"
- "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z23.s }, p0/Z, [x26]\n"
"decw x20\n"
"whilelt p0.s, XZR, x20\n"
+ "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
"decw x20\n"
- "ld1w { z23.s }, p1/Z, [x26]\n"
- "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z21.s }, p0/Z, [x26, #2, MUL VL]\n"
"decw x20\n"
- "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
"whilelt p0.s, XZR, x20\n"
+ "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
"decw x20\n"
- "ld1w { z21.s }, p1/Z, [x26, #2, MUL VL]\n"
- "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z19.s }, p0/Z, [x26, #4, MUL VL]\n"
"decw x20\n"
- "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
"whilelt p0.s, XZR, x20\n"
+ "ld1w { z18.s }, p0/Z, [x26, #5, MUL VL]\n"
"decw x20\n"
- "ld1w { z19.s }, p1/Z, [x26, #4, MUL VL]\n"
- "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z17.s }, p0/Z, [x26, #6, MUL VL]\n"
"decw x20\n"
- "ld1w { z18.s }, p0/Z, [x26, #5, MUL VL]\n"
+ "decw x21, ALL, MUL #8\n"
"whilelt p0.s, XZR, x20\n"
"cmp x21, #0x0\n"
- "ld1w { z17.s }, p1/Z, [x26, #6, MUL VL]\n"
"ld1w { z16.s }, p0/Z, [x26, #7, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22]\n"
"addvl x26, x26, #8\n"
- "st1w { z23.s }, p2, [x23]\n"
- "st1w { z22.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z20.s }, p2, [x23, #3, MUL VL]\n"
- "st1w { z19.s }, p2, [x23, #4, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #5, MUL VL]\n"
- "st1w { z17.s }, p2, [x23, #6, MUL VL]\n"
- "st1w { z16.s }, p2, [x23, #7, MUL VL]\n"
- "add x23, x23, %x[out_stride]\n"
+ "st1w { z22.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z21.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z17.s }, p1, [x22, #6, MUL VL]\n"
+ "st1w { z16.s }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -282,7 +282,7 @@ void sve_transpose_interleave_8VL(uint32_t *out, const uint32_t *in, size_t widt
"12:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
index 03d192c874..35e60a223c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
@@ -42,207 +42,207 @@ void sve_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"ptrue p2.b\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[width]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
"cntb x20, ALL, MUL #8\n"
+ "add x22, x24, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x24, %x[out]\n"
- "add x23, x26, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add %x[in], x21, %x[in_stride]\n"
- "csel x21, x21, %x[pad_row], GT\n"
- "csel x22, x22, %x[pad_row], GE\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x23, x23, %x[pad_row], GT\n"
- "cmp x25, x20\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x23, x20\n"
+ "mov x21, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1b { z22.b }, p2/Z, [x26]\n"
- "ld1b { z14.b }, p2/Z, [x26, #1, MUL VL]\n"
- "sub x25, x25, x20\n"
- "ld1b { z2.b }, p2/Z, [x23]\n"
- "ld1b { z10.b }, p2/Z, [x23, #1, MUL VL]\n"
- "cmp x25, x20\n"
- "ld1b { z27.b }, p2/Z, [x22]\n"
- "ld1b { z21.b }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1b { z13.b }, p2/Z, [x21]\n"
- "ld1b { z28.b }, p2/Z, [x21, #1, MUL VL]\n"
- "ld1b { z11.b }, p2/Z, [x26, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x26, #3, MUL VL]\n"
- "ld1b { z23.b }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x23, #3, MUL VL]\n"
- "zip1 z24.b, z22.b, z27.b\n"
- "zip2 z19.b, z22.b, z27.b\n"
- "ld1b { z5.b }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1b { z3.b }, p2/Z, [x22, #3, MUL VL]\n"
- "zip1 z6.b, z2.b, z13.b\n"
- "zip2 z20.b, z2.b, z13.b\n"
- "ld1b { z25.b }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1b { z30.b }, p2/Z, [x21, #3, MUL VL]\n"
- "zip1 z18.b, z14.b, z21.b\n"
- "zip1 z16.b, z10.b, z28.b\n"
- "ld1b { z26.b }, p2/Z, [x26, #4, MUL VL]\n"
- "ld1b { z22.b }, p2/Z, [x26, #5, MUL VL]\n"
- "zip2 z17.b, z14.b, z21.b\n"
- "zip2 z10.b, z10.b, z28.b\n"
- "ld1b { z27.b }, p2/Z, [x23, #4, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x23, #5, MUL VL]\n"
- "zip1 z21.b, z11.b, z5.b\n"
- "zip2 z12.b, z11.b, z5.b\n"
- "ld1b { z2.b }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x22, #5, MUL VL]\n"
- "zip1 z13.b, z23.b, z25.b\n"
- "zip2 z31.b, z23.b, z25.b\n"
- "ld1b { z25.b }, p2/Z, [x21, #4, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x21, #5, MUL VL]\n"
- "zip1 z15.b, z9.b, z3.b\n"
- "zip1 z11.b, z29.b, z30.b\n"
- "ld1b { z14.b }, p2/Z, [x26, #6, MUL VL]\n"
- "ld1b { z1.b }, p2/Z, [x26, #7, MUL VL]\n"
- "zip2 z23.b, z9.b, z3.b\n"
- "zip2 z0.b, z29.b, z30.b\n"
- "ld1b { z3.b }, p2/Z, [x23, #6, MUL VL]\n"
- "ld1b { z29.b }, p2/Z, [x23, #7, MUL VL]\n"
- "zip1 z9.b, z26.b, z2.b\n"
- "zip2 z26.b, z26.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [x22, #6, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x22, #7, MUL VL]\n"
- "zip1 z30.b, z27.b, z25.b\n"
- "zip2 z28.b, z27.b, z25.b\n"
- "ld1b { z27.b }, p2/Z, [x21, #6, MUL VL]\n"
- "zip1 z25.b, z22.b, z5.b\n"
- "zip2 z5.b, z22.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x26]\n"
+ "ld1b { z24.b }, p2/Z, [x26, #1, MUL VL]\n"
+ "sub x23, x23, x20\n"
+ "cmp x23, x20\n"
+ "ld1b { z31.b }, p2/Z, [x25]\n"
+ "ld1b { z18.b }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1b { z19.b }, p2/Z, [x24]\n"
+ "ld1b { z25.b }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip1 z23.b, z7.b, z19.b\n"
+ "zip2 z20.b, z7.b, z19.b\n"
+ "ld1b { z30.b }, p2/Z, [x22]\n"
+ "ld1b { z3.b }, p2/Z, [x22, #1, MUL VL]\n"
+ "zip1 z21.b, z31.b, z30.b\n"
+ "zip2 z19.b, z31.b, z30.b\n"
+ "ld1b { z16.b }, p2/Z, [x26, #2, MUL VL]\n"
+ "ld1b { z30.b }, p2/Z, [x26, #3, MUL VL]\n"
+ "zip1 z2.b, z24.b, z25.b\n"
+ "zip1 z17.b, z18.b, z3.b\n"
+ "ld1b { z29.b }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x25, #3, MUL VL]\n"
+ "zip2 z22.b, z24.b, z25.b\n"
+ "zip2 z4.b, z18.b, z3.b\n"
+ "ld1b { z0.b }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1b { z3.b }, p2/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.b, z16.b, z0.b\n"
+ "zip2 z14.b, z16.b, z0.b\n"
+ "ld1b { z18.b }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x22, #3, MUL VL]\n"
+ "zip1 z24.b, z29.b, z18.b\n"
+ "zip2 z11.b, z29.b, z18.b\n"
+ "ld1b { z1.b }, p2/Z, [x26, #4, MUL VL]\n"
+ "ld1b { z12.b }, p2/Z, [x26, #5, MUL VL]\n"
+ "zip1 z13.b, z30.b, z3.b\n"
+ "zip1 z15.b, z8.b, z16.b\n"
+ "ld1b { z5.b }, p2/Z, [x25, #4, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x25, #5, MUL VL]\n"
+ "zip2 z31.b, z30.b, z3.b\n"
+ "zip2 z30.b, z8.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x24, #4, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [x24, #5, MUL VL]\n"
+ "zip1 z27.b, z1.b, z16.b\n"
+ "zip2 z10.b, z1.b, z16.b\n"
+ "ld1b { z7.b }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x22, #5, MUL VL]\n"
+ "zip1 z8.b, z5.b, z7.b\n"
+ "zip2 z26.b, z5.b, z7.b\n"
+ "ld1b { z3.b }, p2/Z, [x26, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x26, #7, MUL VL]\n"
+ "zip1 z6.b, z12.b, z18.b\n"
+ "zip1 z5.b, z29.b, z16.b\n"
+ "ld1b { z0.b }, p2/Z, [x25, #6, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x25, #7, MUL VL]\n"
+ "zip2 z12.b, z12.b, z18.b\n"
+ "zip2 z7.b, z29.b, z16.b\n"
+ "ld1b { z1.b }, p2/Z, [x24, #6, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x24, #7, MUL VL]\n"
+ "zip1 z16.b, z23.b, z21.b\n"
+ "zip2 z18.b, z23.b, z21.b\n"
+ "ld1b { z23.b }, p2/Z, [x22, #6, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x22, #7, MUL VL]\n"
+ "st1b { z16.b }, p2, [x21]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "zip1 z19.b, z2.b, z17.b\n"
+ "st1b { z18.b }, p2, [x21, #1, MUL VL]\n"
"addvl x26, x26, #8\n"
- "zip1 z22.b, z7.b, z8.b\n"
- "zip2 z7.b, z7.b, z8.b\n"
- "addvl x23, x23, #8\n"
+ "zip2 z18.b, z2.b, z17.b\n"
+ "zip1 z17.b, z22.b, z4.b\n"
+ "st1b { z16.b }, p2, [x21, #2, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "zip2 z16.b, z22.b, z4.b\n"
+ "st1b { z20.b }, p2, [x21, #3, MUL VL]\n"
+ "zip1 z4.b, z3.b, z1.b\n"
+ "addvl x24, x24, #8\n"
+ "st1b { z19.b }, p2, [x21, #4, MUL VL]\n"
+ "zip1 z22.b, z0.b, z23.b\n"
+ "zip2 z3.b, z3.b, z1.b\n"
"addvl x22, x22, #8\n"
- "zip1 z8.b, z24.b, z6.b\n"
- "zip2 z6.b, z24.b, z6.b\n"
- "ld1b { z24.b }, p2/Z, [x21, #7, MUL VL]\n"
- "addvl x21, x21, #8\n"
- "st1b { z8.b }, p2, [x24]\n"
- "zip1 z8.b, z19.b, z20.b\n"
- "zip2 z20.b, z19.b, z20.b\n"
- "zip1 z19.b, z18.b, z16.b\n"
- "zip2 z18.b, z18.b, z16.b\n"
- "zip1 z16.b, z17.b, z10.b\n"
- "zip2 z17.b, z17.b, z10.b\n"
- "st1b { z6.b }, p2, [x24, #1, MUL VL]\n"
- "st1b { z8.b }, p2, [x24, #2, MUL VL]\n"
- "zip1 z8.b, z14.b, z2.b\n"
- "zip1 z6.b, z3.b, z27.b\n"
- "st1b { z20.b }, p2, [x24, #3, MUL VL]\n"
- "zip2 z10.b, z14.b, z2.b\n"
- "zip2 z14.b, z3.b, z27.b\n"
- "st1b { z19.b }, p2, [x24, #4, MUL VL]\n"
- "zip1 z2.b, z1.b, z4.b\n"
- "zip1 z3.b, z29.b, z24.b\n"
- "st1b { z18.b }, p2, [x24, #5, MUL VL]\n"
- "zip2 z1.b, z1.b, z4.b\n"
- "zip2 z4.b, z29.b, z24.b\n"
- "st1b { z16.b }, p2, [x24, #6, MUL VL]\n"
- "zip1 z24.b, z21.b, z13.b\n"
- "zip2 z16.b, z21.b, z13.b\n"
- "st1b { z17.b }, p2, [x24, #7, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
- "zip1 z29.b, z12.b, z31.b\n"
- "zip2 z21.b, z12.b, z31.b\n"
- "zip1 z20.b, z15.b, z11.b\n"
- "zip2 z19.b, z15.b, z11.b\n"
- "zip1 z18.b, z23.b, z0.b\n"
- "zip2 z17.b, z23.b, z0.b\n"
- "st1b { z24.b }, p2, [x24]\n"
- "st1b { z16.b }, p2, [x24, #1, MUL VL]\n"
- "zip1 z16.b, z9.b, z30.b\n"
- "zip2 z30.b, z9.b, z30.b\n"
- "st1b { z29.b }, p2, [x24, #2, MUL VL]\n"
- "zip1 z29.b, z26.b, z28.b\n"
- "zip2 z28.b, z26.b, z28.b\n"
- "st1b { z21.b }, p2, [x24, #3, MUL VL]\n"
- "zip1 z27.b, z25.b, z22.b\n"
- "zip2 z26.b, z25.b, z22.b\n"
- "st1b { z20.b }, p2, [x24, #4, MUL VL]\n"
- "zip1 z25.b, z5.b, z7.b\n"
- "zip2 z24.b, z5.b, z7.b\n"
- "st1b { z19.b }, p2, [x24, #5, MUL VL]\n"
- "zip1 z23.b, z8.b, z6.b\n"
- "zip2 z22.b, z8.b, z6.b\n"
- "st1b { z18.b }, p2, [x24, #6, MUL VL]\n"
- "zip1 z21.b, z10.b, z14.b\n"
- "zip2 z20.b, z10.b, z14.b\n"
- "st1b { z17.b }, p2, [x24, #7, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
- "zip1 z19.b, z2.b, z3.b\n"
- "zip2 z18.b, z2.b, z3.b\n"
- "st1b { z16.b }, p2, [x24]\n"
- "zip1 z17.b, z1.b, z4.b\n"
- "zip2 z16.b, z1.b, z4.b\n"
- "st1b { z30.b }, p2, [x24, #1, MUL VL]\n"
- "st1b { z29.b }, p2, [x24, #2, MUL VL]\n"
- "st1b { z28.b }, p2, [x24, #3, MUL VL]\n"
- "st1b { z27.b }, p2, [x24, #4, MUL VL]\n"
- "st1b { z26.b }, p2, [x24, #5, MUL VL]\n"
- "st1b { z25.b }, p2, [x24, #6, MUL VL]\n"
- "st1b { z24.b }, p2, [x24, #7, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
- "st1b { z23.b }, p2, [x24]\n"
- "st1b { z22.b }, p2, [x24, #1, MUL VL]\n"
- "st1b { z21.b }, p2, [x24, #2, MUL VL]\n"
- "st1b { z20.b }, p2, [x24, #3, MUL VL]\n"
- "st1b { z19.b }, p2, [x24, #4, MUL VL]\n"
- "st1b { z18.b }, p2, [x24, #5, MUL VL]\n"
- "st1b { z17.b }, p2, [x24, #6, MUL VL]\n"
- "st1b { z16.b }, p2, [x24, #7, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
+ "st1b { z18.b }, p2, [x21, #5, MUL VL]\n"
+ "zip2 z2.b, z0.b, z23.b\n"
+ "zip1 z1.b, z25.b, z29.b\n"
+ "st1b { z17.b }, p2, [x21, #6, MUL VL]\n"
+ "zip1 z0.b, z28.b, z21.b\n"
+ "zip2 z29.b, z25.b, z29.b\n"
+ "st1b { z16.b }, p2, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 z28.b, z28.b, z21.b\n"
+ "zip1 z17.b, z9.b, z24.b\n"
+ "zip2 z16.b, z9.b, z24.b\n"
+ "zip1 z19.b, z14.b, z11.b\n"
+ "st1b { z17.b }, p2, [x21]\n"
+ "zip2 z18.b, z14.b, z11.b\n"
+ "zip1 z17.b, z13.b, z15.b\n"
+ "st1b { z16.b }, p2, [x21, #1, MUL VL]\n"
+ "zip2 z16.b, z13.b, z15.b\n"
+ "zip1 z21.b, z31.b, z30.b\n"
+ "st1b { z19.b }, p2, [x21, #2, MUL VL]\n"
+ "zip2 z20.b, z31.b, z30.b\n"
+ "st1b { z18.b }, p2, [x21, #3, MUL VL]\n"
+ "zip1 z19.b, z27.b, z8.b\n"
+ "st1b { z17.b }, p2, [x21, #4, MUL VL]\n"
+ "zip2 z18.b, z27.b, z8.b\n"
+ "zip1 z17.b, z10.b, z26.b\n"
+ "st1b { z16.b }, p2, [x21, #5, MUL VL]\n"
+ "zip2 z16.b, z10.b, z26.b\n"
+ "zip1 z27.b, z6.b, z5.b\n"
+ "st1b { z21.b }, p2, [x21, #6, MUL VL]\n"
+ "zip2 z26.b, z6.b, z5.b\n"
+ "zip1 z25.b, z12.b, z7.b\n"
+ "st1b { z20.b }, p2, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 z24.b, z12.b, z7.b\n"
+ "zip1 z23.b, z4.b, z22.b\n"
+ "st1b { z19.b }, p2, [x21]\n"
+ "zip2 z22.b, z4.b, z22.b\n"
+ "zip1 z21.b, z3.b, z2.b\n"
+ "st1b { z18.b }, p2, [x21, #1, MUL VL]\n"
+ "zip2 z20.b, z3.b, z2.b\n"
+ "zip1 z19.b, z1.b, z0.b\n"
+ "st1b { z17.b }, p2, [x21, #2, MUL VL]\n"
+ "zip2 z18.b, z1.b, z0.b\n"
+ "zip1 z17.b, z29.b, z28.b\n"
+ "st1b { z16.b }, p2, [x21, #3, MUL VL]\n"
+ "zip2 z16.b, z29.b, z28.b\n"
+ "st1b { z27.b }, p2, [x21, #4, MUL VL]\n"
+ "st1b { z26.b }, p2, [x21, #5, MUL VL]\n"
+ "st1b { z25.b }, p2, [x21, #6, MUL VL]\n"
+ "st1b { z24.b }, p2, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "st1b { z23.b }, p2, [x21]\n"
+ "st1b { z22.b }, p2, [x21, #1, MUL VL]\n"
+ "st1b { z21.b }, p2, [x21, #2, MUL VL]\n"
+ "st1b { z20.b }, p2, [x21, #3, MUL VL]\n"
+ "st1b { z19.b }, p2, [x21, #4, MUL VL]\n"
+ "st1b { z18.b }, p2, [x21, #5, MUL VL]\n"
+ "st1b { z17.b }, p2, [x21, #6, MUL VL]\n"
+ "st1b { z16.b }, p2, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x25, 5f\n"
+ "cbz x23, 5f\n"
"4:" // Main row loop: Column loop
- "mov x20, x25\n"
- "decw x25, ALL, MUL #8\n"
+ "mov x20, x23\n"
"whilelt p1.b, XZR, x20\n"
+ "ld1b { z23.b }, p1/Z, [x26]\n"
+ "ld1b { z22.b }, p1/Z, [x25]\n"
"decb x20\n"
"whilelt p0.b, XZR, x20\n"
- "cmp x25, #0x0\n"
- "ld1b { z20.b }, p1/Z, [x26]\n"
- "ld1b { z19.b }, p1/Z, [x23]\n"
+ "ld1b { z21.b }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z25.b }, p0/Z, [x25, #1, MUL VL]\n"
+ "ld1b { z19.b }, p1/Z, [x24]\n"
+ "ld1b { z20.b }, p0/Z, [x24, #1, MUL VL]\n"
+ "decw x23, ALL, MUL #8\n"
+ "zip1 z24.b, z23.b, z19.b\n"
"ld1b { z18.b }, p1/Z, [x22]\n"
- "ld1b { z17.b }, p1/Z, [x21]\n"
- "ld1b { z24.b }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z16.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "zip1 z17.b, z22.b, z18.b\n"
+ "zip2 z23.b, z23.b, z19.b\n"
+ "zip2 z19.b, z22.b, z18.b\n"
+ "zip1 z22.b, z21.b, z20.b\n"
+ "cmp x23, #0x0\n"
"addvl x26, x26, #2\n"
- "ld1b { z25.b }, p0/Z, [x23, #1, MUL VL]\n"
- "addvl x23, x23, #2\n"
- "ld1b { z23.b }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1b { z16.b }, p0/Z, [x21, #1, MUL VL]\n"
+ "zip1 z18.b, z25.b, z16.b\n"
+ "zip2 z21.b, z21.b, z20.b\n"
+ "addvl x25, x25, #2\n"
+ "addvl x24, x24, #2\n"
+ "zip2 z20.b, z25.b, z16.b\n"
"addvl x22, x22, #2\n"
- "addvl x21, x21, #2\n"
- "zip1 z22.b, z20.b, z18.b\n"
- "zip1 z21.b, z19.b, z17.b\n"
- "zip2 z20.b, z20.b, z18.b\n"
- "zip2 z19.b, z19.b, z17.b\n"
- "zip1 z18.b, z24.b, z23.b\n"
- "zip1 z17.b, z25.b, z16.b\n"
- "zip2 z24.b, z24.b, z23.b\n"
- "zip2 z16.b, z25.b, z16.b\n"
- "zip1 z23.b, z22.b, z21.b\n"
- "zip2 z22.b, z22.b, z21.b\n"
- "zip1 z21.b, z20.b, z19.b\n"
- "zip2 z20.b, z20.b, z19.b\n"
- "zip1 z19.b, z18.b, z17.b\n"
- "zip2 z18.b, z18.b, z17.b\n"
- "zip1 z17.b, z24.b, z16.b\n"
- "zip2 z16.b, z24.b, z16.b\n"
- "st1b { z23.b }, p2, [x24]\n"
- "st1b { z22.b }, p2, [x24, #1, MUL VL]\n"
- "st1b { z21.b }, p2, [x24, #2, MUL VL]\n"
- "st1b { z20.b }, p2, [x24, #3, MUL VL]\n"
- "st1b { z19.b }, p2, [x24, #4, MUL VL]\n"
- "st1b { z18.b }, p2, [x24, #5, MUL VL]\n"
- "st1b { z17.b }, p2, [x24, #6, MUL VL]\n"
- "st1b { z16.b }, p2, [x24, #7, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
+ "zip1 z16.b, z24.b, z17.b\n"
+ "st1b { z16.b }, p2, [x21]\n"
+ "zip2 z16.b, z24.b, z17.b\n"
+ "zip1 z17.b, z23.b, z19.b\n"
+ "st1b { z16.b }, p2, [x21, #1, MUL VL]\n"
+ "zip2 z16.b, z23.b, z19.b\n"
+ "zip1 z19.b, z22.b, z18.b\n"
+ "st1b { z17.b }, p2, [x21, #2, MUL VL]\n"
+ "zip2 z18.b, z22.b, z18.b\n"
+ "zip1 z17.b, z21.b, z20.b\n"
+ "st1b { z16.b }, p2, [x21, #3, MUL VL]\n"
+ "zip2 z16.b, z21.b, z20.b\n"
+ "st1b { z19.b }, p2, [x21, #4, MUL VL]\n"
+ "st1b { z18.b }, p2, [x21, #5, MUL VL]\n"
+ "st1b { z17.b }, p2, [x21, #6, MUL VL]\n"
+ "st1b { z16.b }, p2, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
index 091c0e526b..3e20a5882e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
@@ -42,180 +42,180 @@ void sve_transpose_interleave_8VL_1x8(uint8_t *out, const uint8_t *in, size_t wi
"ptrue p1.b\n"
"1:" // Main row loop: Head
"mov x10, %x[in]\n"
- "cmp %x[height], #0x7\n"
- "mov x9, %x[width]\n"
- "cntb x28, ALL, MUL #2\n"
- "mov x27, %x[out]\n"
- "add x26, x10, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
"add x25, x26, %x[in_stride]\n"
"add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add x20, x21, %x[in_stride]\n"
- "add %x[in], x20, %x[in_stride]\n"
- "csel x20, x20, %x[pad_row], GT\n"
- "csel x21, x21, %x[pad_row], GE\n"
+ "cmp %x[height], #0x7\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
"cmp %x[height], #0x5\n"
- "csel x22, x22, %x[pad_row], GT\n"
- "csel x23, x23, %x[pad_row], GE\n"
+ "mov x22, %x[width]\n"
+ "cntb x21, ALL, MUL #2\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
"cmp %x[height], #0x3\n"
- "csel x24, x24, %x[pad_row], GT\n"
- "csel x25, x25, %x[pad_row], GE\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x26, x26, %x[pad_row], GT\n"
- "cmp x9, x28\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "cmp x22, x21\n"
+ "mov x20, %x[out]\n"
"sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1b { z24.b }, p1/Z, [x10]\n"
- "ld1b { z0.b }, p1/Z, [x26]\n"
- "sub x9, x9, x28\n"
- "ld1b { z31.b }, p1/Z, [x25]\n"
- "ld1b { z30.b }, p1/Z, [x24]\n"
- "cmp x9, x28\n"
- "ld1b { z23.b }, p1/Z, [x23]\n"
- "ld1b { z29.b }, p1/Z, [x22]\n"
- "ld1b { z22.b }, p1/Z, [x21]\n"
- "ld1b { z21.b }, p1/Z, [x20]\n"
- "ld1b { z28.b }, p1/Z, [x10, #1, MUL VL]\n"
- "ld1b { z4.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z23.b }, p1/Z, [x10]\n"
+ "ld1b { z22.b }, p1/Z, [x9]\n"
+ "sub x22, x22, x21\n"
+ "cmp x22, x21\n"
+ "ld1b { z20.b }, p1/Z, [x28]\n"
+ "ld1b { z21.b }, p1/Z, [x27]\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "zip1 z5.b, z23.b, z19.b\n"
+ "zip1 z4.b, z22.b, z18.b\n"
+ "ld1b { z17.b }, p1/Z, [x24]\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip1 z3.b, z20.b, z17.b\n"
+ "zip1 z31.b, z21.b, z16.b\n"
+ "ld1b { z25.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z24.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z2.b, z23.b, z19.b\n"
+ "zip2 z30.b, z20.b, z17.b\n"
+ "ld1b { z23.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip2 z22.b, z22.b, z18.b\n"
+ "zip2 z21.b, z21.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z29.b, z25.b, z19.b\n"
+ "zip1 z28.b, z24.b, z18.b\n"
+ "ld1b { z17.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip1 z27.b, z23.b, z17.b\n"
+ "zip1 z26.b, z20.b, z16.b\n"
+ "zip2 z1.b, z25.b, z19.b\n"
+ "zip2 z25.b, z23.b, z17.b\n"
"addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "zip2 z24.b, z24.b, z18.b\n"
+ "zip2 z16.b, z20.b, z16.b\n"
+ "addvl x28, x28, #2\n"
+ "addvl x27, x27, #2\n"
+ "zip1 z0.b, z5.b, z3.b\n"
+ "zip1 z17.b, z4.b, z31.b\n"
"addvl x26, x26, #2\n"
- "ld1b { z27.b }, p1/Z, [x25, #1, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [x24, #1, MUL VL]\n"
- "zip1 z3.b, z24.b, z23.b\n"
- "zip1 z2.b, z0.b, z29.b\n"
- "ld1b { z19.b }, p1/Z, [x23, #1, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [x22, #1, MUL VL]\n"
- "zip1 z26.b, z31.b, z22.b\n"
- "zip1 z25.b, z30.b, z21.b\n"
- "ld1b { z17.b }, p1/Z, [x21, #1, MUL VL]\n"
- "ld1b { z16.b }, p1/Z, [x20, #1, MUL VL]\n"
- "zip2 z24.b, z24.b, z23.b\n"
- "zip2 z23.b, z31.b, z22.b\n"
- "zip2 z22.b, z0.b, z29.b\n"
- "zip2 z21.b, z30.b, z21.b\n"
"addvl x25, x25, #2\n"
+ "zip2 z20.b, z5.b, z3.b\n"
+ "zip2 z19.b, z4.b, z31.b\n"
"addvl x24, x24, #2\n"
- "zip1 z0.b, z28.b, z19.b\n"
- "zip1 z31.b, z4.b, z18.b\n"
"addvl x23, x23, #2\n"
- "addvl x22, x22, #2\n"
- "zip1 z30.b, z27.b, z17.b\n"
- "zip1 z29.b, z20.b, z16.b\n"
- "addvl x21, x21, #2\n"
- "addvl x20, x20, #2\n"
- "zip2 z1.b, z28.b, z19.b\n"
- "zip2 z28.b, z27.b, z17.b\n"
- "zip2 z27.b, z4.b, z18.b\n"
- "zip2 z20.b, z20.b, z16.b\n"
- "zip1 z19.b, z3.b, z26.b\n"
- "zip1 z18.b, z2.b, z25.b\n"
- "zip2 z17.b, z3.b, z26.b\n"
- "zip2 z16.b, z2.b, z25.b\n"
- "zip1 z26.b, z24.b, z23.b\n"
- "zip1 z25.b, z22.b, z21.b\n"
- "zip2 z24.b, z24.b, z23.b\n"
+ "zip1 z31.b, z2.b, z30.b\n"
+ "zip1 z18.b, z22.b, z21.b\n"
+ "zip2 z30.b, z2.b, z30.b\n"
"zip2 z23.b, z22.b, z21.b\n"
- "zip1 z22.b, z0.b, z30.b\n"
- "zip1 z21.b, z31.b, z29.b\n"
- "zip2 z0.b, z0.b, z30.b\n"
- "zip2 z31.b, z31.b, z29.b\n"
- "zip1 z30.b, z1.b, z28.b\n"
- "zip1 z29.b, z27.b, z20.b\n"
- "zip2 z28.b, z1.b, z28.b\n"
- "zip2 z27.b, z27.b, z20.b\n"
- "zip1 z20.b, z19.b, z18.b\n"
- "zip2 z19.b, z19.b, z18.b\n"
- "zip1 z18.b, z17.b, z16.b\n"
- "zip2 z17.b, z17.b, z16.b\n"
- "zip1 z16.b, z26.b, z25.b\n"
- "zip2 z26.b, z26.b, z25.b\n"
- "zip1 z25.b, z24.b, z23.b\n"
- "zip2 z24.b, z24.b, z23.b\n"
- "st1b { z20.b }, p1, [x27]\n"
- "st1b { z19.b }, p1, [x27, #1, MUL VL]\n"
+ "zip1 z22.b, z29.b, z27.b\n"
+ "zip1 z21.b, z28.b, z26.b\n"
+ "zip2 z29.b, z29.b, z27.b\n"
+ "zip2 z28.b, z28.b, z26.b\n"
+ "zip1 z27.b, z1.b, z25.b\n"
+ "zip1 z26.b, z24.b, z16.b\n"
+ "zip2 z25.b, z1.b, z25.b\n"
+ "zip2 z24.b, z24.b, z16.b\n"
+ "zip1 z16.b, z0.b, z17.b\n"
+ "zip2 z17.b, z0.b, z17.b\n"
+ "st1b { z16.b }, p1, [x20]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "st1b { z17.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z19.b, z31.b, z18.b\n"
+ "zip2 z18.b, z31.b, z18.b\n"
+ "st1b { z16.b }, p1, [x20, #2, MUL VL]\n"
+ "zip1 z17.b, z30.b, z23.b\n"
+ "zip2 z16.b, z30.b, z23.b\n"
+ "st1b { z20.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z19.b }, p1, [x20, #4, MUL VL]\n"
"zip1 z23.b, z22.b, z21.b\n"
"zip2 z22.b, z22.b, z21.b\n"
- "st1b { z18.b }, p1, [x27, #2, MUL VL]\n"
- "zip1 z21.b, z0.b, z31.b\n"
- "zip2 z20.b, z0.b, z31.b\n"
- "st1b { z17.b }, p1, [x27, #3, MUL VL]\n"
- "zip1 z19.b, z30.b, z29.b\n"
- "zip2 z18.b, z30.b, z29.b\n"
- "st1b { z16.b }, p1, [x27, #4, MUL VL]\n"
- "zip1 z17.b, z28.b, z27.b\n"
- "zip2 z16.b, z28.b, z27.b\n"
- "st1b { z26.b }, p1, [x27, #5, MUL VL]\n"
- "st1b { z25.b }, p1, [x27, #6, MUL VL]\n"
- "st1b { z24.b }, p1, [x27, #7, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
- "st1b { z23.b }, p1, [x27]\n"
- "st1b { z22.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z21.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z20.b }, p1, [x27, #3, MUL VL]\n"
- "st1b { z19.b }, p1, [x27, #4, MUL VL]\n"
- "st1b { z18.b }, p1, [x27, #5, MUL VL]\n"
- "st1b { z17.b }, p1, [x27, #6, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #7, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "st1b { z18.b }, p1, [x20, #5, MUL VL]\n"
+ "zip1 z21.b, z29.b, z28.b\n"
+ "zip2 z20.b, z29.b, z28.b\n"
+ "st1b { z17.b }, p1, [x20, #6, MUL VL]\n"
+ "zip1 z19.b, z27.b, z26.b\n"
+ "zip2 z18.b, z27.b, z26.b\n"
+ "st1b { z16.b }, p1, [x20, #7, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 z17.b, z25.b, z24.b\n"
+ "zip2 z16.b, z25.b, z24.b\n"
+ "st1b { z23.b }, p1, [x20]\n"
+ "st1b { z22.b }, p1, [x20, #1, MUL VL]\n"
+ "st1b { z21.b }, p1, [x20, #2, MUL VL]\n"
+ "st1b { z20.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z19.b }, p1, [x20, #4, MUL VL]\n"
+ "st1b { z18.b }, p1, [x20, #5, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20, #6, MUL VL]\n"
+ "st1b { z16.b }, p1, [x20, #7, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x9, 5f\n"
+ "cbz x22, 5f\n"
"4:" // Main row loop: Column loop
- "whilelt p0.b, XZR, x9\n"
- "decd x9, ALL, MUL #8\n"
- "ld1b { z19.b }, p0/Z, [x10]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1b { z25.b }, p0/Z, [x10]\n"
+ "ld1b { z27.b }, p0/Z, [x9]\n"
+ "decd x22, ALL, MUL #8\n"
+ "ld1b { z26.b }, p0/Z, [x28]\n"
+ "ld1b { z24.b }, p0/Z, [x27]\n"
+ "cmp x22, #0x0\n"
"addvl x10, x10, #1\n"
- "ld1b { z26.b }, p0/Z, [x26]\n"
+ "ld1b { z22.b }, p0/Z, [x26]\n"
+ "ld1b { z21.b }, p0/Z, [x25]\n"
+ "zip1 z20.b, z25.b, z22.b\n"
+ "zip1 z23.b, z27.b, z21.b\n"
+ "ld1b { z17.b }, p0/Z, [x24]\n"
+ "ld1b { z16.b }, p0/Z, [x23]\n"
+ "zip1 z19.b, z26.b, z17.b\n"
+ "zip1 z18.b, z24.b, z16.b\n"
+ "zip2 z25.b, z25.b, z22.b\n"
+ "zip2 z22.b, z26.b, z17.b\n"
+ "addvl x9, x9, #1\n"
+ "addvl x28, x28, #1\n"
+ "zip2 z21.b, z27.b, z21.b\n"
+ "zip2 z16.b, z24.b, z16.b\n"
+ "addvl x27, x27, #1\n"
"addvl x26, x26, #1\n"
- "ld1b { z22.b }, p0/Z, [x25]\n"
+ "zip1 z24.b, z20.b, z19.b\n"
+ "zip1 z17.b, z23.b, z18.b\n"
"addvl x25, x25, #1\n"
- "ld1b { z25.b }, p0/Z, [x24]\n"
"addvl x24, x24, #1\n"
- "ld1b { z18.b }, p0/Z, [x23]\n"
- "ld1b { z21.b }, p0/Z, [x22]\n"
- "ld1b { z17.b }, p0/Z, [x21]\n"
- "cmp x9, #0x0\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "zip2 z19.b, z23.b, z18.b\n"
"addvl x23, x23, #1\n"
- "ld1b { z16.b }, p0/Z, [x20]\n"
- "addvl x22, x22, #1\n"
- "addvl x21, x21, #1\n"
- "zip1 z20.b, z19.b, z18.b\n"
- "zip2 z24.b, z19.b, z18.b\n"
- "addvl x20, x20, #1\n"
- "zip1 z19.b, z22.b, z17.b\n"
- "zip1 z18.b, z26.b, z21.b\n"
- "zip2 z23.b, z22.b, z17.b\n"
- "zip1 z17.b, z25.b, z16.b\n"
- "zip2 z22.b, z26.b, z21.b\n"
- "zip2 z16.b, z25.b, z16.b\n"
- "zip1 z21.b, z20.b, z19.b\n"
+ "zip1 z23.b, z25.b, z22.b\n"
+ "zip1 z18.b, z21.b, z16.b\n"
+ "zip2 z22.b, z25.b, z22.b\n"
+ "zip2 z21.b, z21.b, z16.b\n"
+ "zip1 z16.b, z24.b, z17.b\n"
+ "zip2 z17.b, z24.b, z17.b\n"
+ "st1b { z16.b }, p1, [x20]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
"zip2 z20.b, z20.b, z19.b\n"
- "zip1 z25.b, z24.b, z23.b\n"
- "zip1 z19.b, z18.b, z17.b\n"
- "zip2 z18.b, z18.b, z17.b\n"
- "zip1 z17.b, z22.b, z16.b\n"
- "zip2 z24.b, z24.b, z23.b\n"
- "zip2 z23.b, z22.b, z16.b\n"
- "zip1 z16.b, z21.b, z19.b\n"
- "zip2 z22.b, z21.b, z19.b\n"
- "zip1 z21.b, z20.b, z18.b\n"
- "zip2 z20.b, z20.b, z18.b\n"
- "zip1 z19.b, z25.b, z17.b\n"
- "zip2 z18.b, z25.b, z17.b\n"
- "zip1 z17.b, z24.b, z23.b\n"
- "st1b { z16.b }, p1, [x27]\n"
- "zip2 z16.b, z24.b, z23.b\n"
- "st1b { z22.b }, p1, [x27, #1, MUL VL]\n"
- "st1b { z21.b }, p1, [x27, #2, MUL VL]\n"
- "st1b { z20.b }, p1, [x27, #3, MUL VL]\n"
- "st1b { z19.b }, p1, [x27, #4, MUL VL]\n"
- "st1b { z18.b }, p1, [x27, #5, MUL VL]\n"
- "st1b { z17.b }, p1, [x27, #6, MUL VL]\n"
- "st1b { z16.b }, p1, [x27, #7, MUL VL]\n"
- "add x27, x27, %x[out_stride]\n"
+ "st1b { z17.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z19.b, z23.b, z18.b\n"
+ "zip2 z18.b, z23.b, z18.b\n"
+ "st1b { z16.b }, p1, [x20, #2, MUL VL]\n"
+ "zip1 z17.b, z22.b, z21.b\n"
+ "zip2 z16.b, z22.b, z21.b\n"
+ "st1b { z20.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z19.b }, p1, [x20, #4, MUL VL]\n"
+ "st1b { z18.b }, p1, [x20, #5, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20, #6, MUL VL]\n"
+ "st1b { z16.b }, p1, [x20, #7, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -223,7 +223,7 @@ void sve_transpose_interleave_8VL_1x8(uint8_t *out, const uint8_t *in, size_t wi
"bge 1b\n"
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
index 6d436ebcad..eb7312b2e3 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
@@ -40,198 +40,198 @@ void sve_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t
__asm__ __volatile__(
"cmp %x[height], #0x4\n"
- "ptrue p2.b\n"
+ "ptrue p4.b\n"
"blt 6f\n"
"1:" // Main row loop: Head
"mov x28, %x[in]\n"
"mov x27, %x[width]\n"
"cnth x26, ALL, MUL #8\n"
- "mov x25, %x[out]\n"
- "sub %x[height], %x[height], #0x4\n"
- "add x24, x28, %x[in_stride]\n"
+ "add x25, x28, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
"cmp x27, x26\n"
- "add %x[in], x22, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1h { z17.h }, p2/Z, [x28]\n"
- "ld1h { z30.h }, p2/Z, [x28, #1, MUL VL]\n"
- "mov x21, x25\n"
- "add x25, x25, %x[out_stride]\n"
- "ld1h { z28.h }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1h { z1.h }, p2/Z, [x28, #3, MUL VL]\n"
- "mov x20, x25\n"
+ "ld1h { z30.h }, p4/Z, [x28]\n"
+ "ld1h { z12.h }, p4/Z, [x28, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z31.h }, p4/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z18.h }, p4/Z, [x28, #3, MUL VL]\n"
+ "mov x20, x22\n"
"sub x27, x27, x26\n"
- "ld1h { z16.h }, p2/Z, [x24]\n"
- "ld1h { z25.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x25]\n"
+ "ld1h { z17.h }, p4/Z, [x25, #1, MUL VL]\n"
+ "zip1 z3.h, z30.h, z20.h\n"
+ "zip2 z21.h, z30.h, z20.h\n"
+ "ld1h { z26.h }, p4/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z23.h }, p4/Z, [x25, #3, MUL VL]\n"
+ "zip1 z13.h, z12.h, z17.h\n"
+ "zip2 z0.h, z12.h, z17.h\n"
+ "ld1h { z2.h }, p4/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x28, #5, MUL VL]\n"
+ "zip1 z12.h, z31.h, z26.h\n"
+ "zip2 z14.h, z31.h, z26.h\n"
+ "ld1h { z17.h }, p4/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z29.h }, p4/Z, [x28, #7, MUL VL]\n"
+ "zip1 z16.h, z18.h, z23.h\n"
+ "zip2 z15.h, z18.h, z23.h\n"
+ "ld1h { z9.h }, p4/Z, [x25, #4, MUL VL]\n"
+ "ld1h { z18.h }, p4/Z, [x25, #5, MUL VL]\n"
+ "zip1 z11.h, z2.h, z9.h\n"
+ "zip2 z5.h, z2.h, z9.h\n"
+ "ld1h { z7.h }, p4/Z, [x25, #6, MUL VL]\n"
+ "ld1h { z2.h }, p4/Z, [x25, #7, MUL VL]\n"
+ "zip1 z10.h, z24.h, z18.h\n"
+ "zip2 z6.h, z24.h, z18.h\n"
+ "ld1h { z19.h }, p4/Z, [x24]\n"
+ "ld1h { z18.h }, p4/Z, [x24, #1, MUL VL]\n"
+ "zip1 z9.h, z17.h, z7.h\n"
+ "zip2 z4.h, z17.h, z7.h\n"
+ "ld1h { z24.h }, p4/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z22.h }, p4/Z, [x24, #3, MUL VL]\n"
+ "zip1 z7.h, z29.h, z2.h\n"
+ "zip2 z8.h, z29.h, z2.h\n"
+ "ld1h { z25.h }, p4/Z, [x24, #4, MUL VL]\n"
+ "ld1h { z17.h }, p4/Z, [x24, #5, MUL VL]\n"
"cmp x27, x26\n"
- "add x25, x25, %x[out_stride]\n"
- "ld1h { z24.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x24, #3, MUL VL]\n"
- "ld1h { z21.h }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1h { z20.h }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1h { z23.h }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1h { z19.h }, p2/Z, [x28, #7, MUL VL]\n"
- "zip1 z31.h, z17.h, z16.h\n"
- "zip2 z29.h, z17.h, z16.h\n"
- "ld1h { z18.h }, p2/Z, [x24, #4, MUL VL]\n"
- "ld1h { z17.h }, p2/Z, [x24, #5, MUL VL]\n"
- "zip1 z27.h, z30.h, z25.h\n"
- "zip2 z26.h, z30.h, z25.h\n"
- "ld1h { z16.h }, p2/Z, [x24, #6, MUL VL]\n"
- "ld1h { z0.h }, p2/Z, [x24, #7, MUL VL]\n"
- "zip1 z14.h, z28.h, z24.h\n"
- "zip2 z15.h, z28.h, z24.h\n"
- "ld1h { z30.h }, p2/Z, [x23]\n"
- "ld1h { z28.h }, p2/Z, [x23, #1, MUL VL]\n"
- "zip1 z13.h, z1.h, z22.h\n"
- "zip2 z12.h, z1.h, z22.h\n"
- "ld1h { z25.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z24.h }, p2/Z, [x23, #3, MUL VL]\n"
- "zip1 z11.h, z21.h, z18.h\n"
- "zip2 z10.h, z21.h, z18.h\n"
- "ld1h { z9.h }, p2/Z, [x23, #4, MUL VL]\n"
- "ld1h { z8.h }, p2/Z, [x23, #5, MUL VL]\n"
- "zip1 z7.h, z20.h, z17.h\n"
- "zip2 z6.h, z20.h, z17.h\n"
- "ld1h { z5.h }, p2/Z, [x23, #6, MUL VL]\n"
- "ld1h { z4.h }, p2/Z, [x23, #7, MUL VL]\n"
- "zip1 z3.h, z23.h, z16.h\n"
- "zip2 z2.h, z23.h, z16.h\n"
- "ld1h { z23.h }, p2/Z, [x22]\n"
- "ld1h { z22.h }, p2/Z, [x22, #1, MUL VL]\n"
- "zip1 z1.h, z19.h, z0.h\n"
- "zip2 z0.h, z19.h, z0.h\n"
- "ld1h { z21.h }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1h { z20.h }, p2/Z, [x22, #3, MUL VL]\n"
"addvl x28, x28, #8\n"
+ "ld1h { z2.h }, p4/Z, [x24, #6, MUL VL]\n"
+ "ld1h { z30.h }, p4/Z, [x24, #7, MUL VL]\n"
+ "addvl x25, x25, #8\n"
"addvl x24, x24, #8\n"
- "ld1h { z19.h }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x22, #5, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x23]\n"
+ "ld1h { z27.h }, p4/Z, [x23, #1, MUL VL]\n"
+ "zip1 z31.h, z19.h, z20.h\n"
+ "zip2 z29.h, z19.h, z20.h\n"
+ "ld1h { z26.h }, p4/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z23.h }, p4/Z, [x23, #3, MUL VL]\n"
+ "zip1 z28.h, z18.h, z27.h\n"
+ "zip2 z1.h, z18.h, z27.h\n"
+ "ld1h { z20.h }, p4/Z, [x23, #4, MUL VL]\n"
+ "ld1h { z19.h }, p4/Z, [x23, #5, MUL VL]\n"
+ "zip1 z27.h, z24.h, z26.h\n"
+ "zip2 z26.h, z24.h, z26.h\n"
+ "ld1h { z18.h }, p4/Z, [x23, #6, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x23, #7, MUL VL]\n"
+ "st1h { z3.h }, p4, [x21]\n"
+ "zip1 z3.h, z22.h, z23.h\n"
+ "st1h { z21.h }, p4, [x21, #1, MUL VL]\n"
+ "zip2 z22.h, z22.h, z23.h\n"
"addvl x23, x23, #8\n"
- "ld1h { z17.h }, p2/Z, [x22, #6, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x22, #7, MUL VL]\n"
- "st1h { z31.h }, p2, [x21]\n"
- "zip1 z31.h, z30.h, z23.h\n"
- "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
- "zip2 z30.h, z30.h, z23.h\n"
- "zip1 z29.h, z28.h, z22.h\n"
- "addvl x22, x22, #8\n"
- "st1h { z27.h }, p2, [x21, #2, MUL VL]\n"
- "zip2 z28.h, z28.h, z22.h\n"
- "zip1 z27.h, z25.h, z21.h\n"
- "st1h { z26.h }, p2, [x21, #3, MUL VL]\n"
- "zip2 z26.h, z25.h, z21.h\n"
- "zip1 z25.h, z24.h, z20.h\n"
- "st1h { z14.h }, p2, [x21, #4, MUL VL]\n"
- "zip2 z24.h, z24.h, z20.h\n"
- "zip1 z23.h, z9.h, z19.h\n"
- "st1h { z15.h }, p2, [x21, #5, MUL VL]\n"
- "zip2 z22.h, z9.h, z19.h\n"
- "zip1 z21.h, z8.h, z18.h\n"
- "st1h { z13.h }, p2, [x21, #6, MUL VL]\n"
- "zip2 z20.h, z8.h, z18.h\n"
- "zip1 z19.h, z5.h, z17.h\n"
- "st1h { z12.h }, p2, [x21, #7, MUL VL]\n"
+ "zip1 z23.h, z25.h, z20.h\n"
+ "st1h { z13.h }, p4, [x21, #2, MUL VL]\n"
+ "zip2 z25.h, z25.h, z20.h\n"
+ "zip1 z21.h, z17.h, z19.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z0.h }, p4, [x21, #3, MUL VL]\n"
+ "zip2 z20.h, z17.h, z19.h\n"
+ "zip1 z19.h, z2.h, z18.h\n"
+ "st1h { z12.h }, p4, [x21, #4, MUL VL]\n"
+ "zip2 z18.h, z2.h, z18.h\n"
+ "zip1 z17.h, z30.h, z24.h\n"
+ "st1h { z14.h }, p4, [x21, #5, MUL VL]\n"
+ "zip2 z13.h, z30.h, z24.h\n"
+ "st1h { z16.h }, p4, [x21, #6, MUL VL]\n"
+ "st1h { z15.h }, p4, [x21, #7, MUL VL]\n"
"addvl x21, x21, #16\n"
- "zip2 z18.h, z5.h, z17.h\n"
- "zip1 z17.h, z4.h, z16.h\n"
- "zip2 z16.h, z4.h, z16.h\n"
- "st1h { z31.h }, p2, [x21, #-8, MUL VL]\n"
- "st1h { z30.h }, p2, [x21, #-7, MUL VL]\n"
- "st1h { z29.h }, p2, [x21, #-6, MUL VL]\n"
- "st1h { z28.h }, p2, [x21, #-5, MUL VL]\n"
- "st1h { z27.h }, p2, [x21, #-4, MUL VL]\n"
- "st1h { z26.h }, p2, [x21, #-3, MUL VL]\n"
- "st1h { z25.h }, p2, [x21, #-2, MUL VL]\n"
- "st1h { z24.h }, p2, [x21, #-1, MUL VL]\n"
- "st1h { z11.h }, p2, [x20]\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z7.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z6.h }, p2, [x20, #3, MUL VL]\n"
- "st1h { z3.h }, p2, [x20, #4, MUL VL]\n"
- "st1h { z2.h }, p2, [x20, #5, MUL VL]\n"
- "st1h { z1.h }, p2, [x20, #6, MUL VL]\n"
- "st1h { z0.h }, p2, [x20, #7, MUL VL]\n"
+ "st1h { z31.h }, p4, [x21, #-8, MUL VL]\n"
+ "st1h { z29.h }, p4, [x21, #-7, MUL VL]\n"
+ "st1h { z28.h }, p4, [x21, #-6, MUL VL]\n"
+ "st1h { z1.h }, p4, [x21, #-5, MUL VL]\n"
+ "st1h { z27.h }, p4, [x21, #-4, MUL VL]\n"
+ "st1h { z26.h }, p4, [x21, #-3, MUL VL]\n"
+ "st1h { z3.h }, p4, [x21, #-2, MUL VL]\n"
+ "st1h { z22.h }, p4, [x21, #-1, MUL VL]\n"
+ "st1h { z11.h }, p4, [x20]\n"
+ "st1h { z5.h }, p4, [x20, #1, MUL VL]\n"
+ "st1h { z10.h }, p4, [x20, #2, MUL VL]\n"
+ "st1h { z6.h }, p4, [x20, #3, MUL VL]\n"
+ "st1h { z9.h }, p4, [x20, #4, MUL VL]\n"
+ "st1h { z4.h }, p4, [x20, #5, MUL VL]\n"
+ "st1h { z7.h }, p4, [x20, #6, MUL VL]\n"
+ "st1h { z8.h }, p4, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
- "st1h { z23.h }, p2, [x20, #-8, MUL VL]\n"
- "st1h { z22.h }, p2, [x20, #-7, MUL VL]\n"
- "st1h { z21.h }, p2, [x20, #-6, MUL VL]\n"
- "st1h { z20.h }, p2, [x20, #-5, MUL VL]\n"
- "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
- "st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
- "st1h { z17.h }, p2, [x20, #-2, MUL VL]\n"
- "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "st1h { z23.h }, p4, [x20, #-8, MUL VL]\n"
+ "st1h { z25.h }, p4, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p4, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p4, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p4, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p4, [x20, #-2, MUL VL]\n"
+ "st1h { z13.h }, p4, [x20, #-1, MUL VL]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
"cbz x27, 5f\n"
"4:" // Main row loop: Column loop
- "mov x21, x27\n"
- "mov x20, x25\n"
+ "mov x20, x27\n"
+ "whilelt p3.h, XZR, x20\n"
+ "ld1h { z20.h }, p3/Z, [x28]\n"
+ "ld1h { z19.h }, p3/Z, [x25]\n"
+ "dech x20\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z18.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z25.h }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x25, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z0.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z24.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "mov x20, x22\n"
"decw x27, ALL, MUL #8\n"
- "add x25, x25, %x[out_stride]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p0.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z20.h }, p1/Z, [x28]\n"
- "ld1h { z16.h }, p1/Z, [x24]\n"
- "ld1h { z23.h }, p1/Z, [x23]\n"
- "ld1h { z19.h }, p1/Z, [x22]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "ld1h { z18.h }, p0/Z, [x28, #1, MUL VL]\n"
- "ld1h { z17.h }, p0/Z, [x24, #1, MUL VL]\n"
- "ld1h { z1.h }, p0/Z, [x23, #1, MUL VL]\n"
- "ld1h { z0.h }, p0/Z, [x22, #1, MUL VL]\n"
- "zip1 z22.h, z20.h, z16.h\n"
- "zip2 z21.h, z20.h, z16.h\n"
- "whilelt p0.h, XZR, x21\n"
- "ld1h { z20.h }, p1/Z, [x28, #2, MUL VL]\n"
- "ld1h { z16.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z31.h }, p3/Z, [x24]\n"
+ "ld1h { z30.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z29.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z28.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "zip1 z23.h, z20.h, z19.h\n"
+ "zip2 z22.h, z20.h, z19.h\n"
+ "ld1h { z21.h }, p3/Z, [x23]\n"
+ "ld1h { z27.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "zip2 z19.h, z18.h, z17.h\n"
+ "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z26.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "zip1 z17.h, z25.h, z16.h\n"
+ "zip2 z16.h, z25.h, z16.h\n"
+ "zip1 z25.h, z0.h, z24.h\n"
+ "zip2 z24.h, z0.h, z24.h\n"
+ "st1h { z23.h }, p4, [x20]\n"
"cmp x27, #0x0\n"
- "ld1h { z31.h }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1h { z30.h }, p1/Z, [x22, #2, MUL VL]\n"
- "zip1 z29.h, z18.h, z17.h\n"
- "zip2 z28.h, z18.h, z17.h\n"
- "zip1 z27.h, z23.h, z19.h\n"
- "zip2 z26.h, z23.h, z19.h\n"
- "ld1h { z19.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "st1h { z22.h }, p4, [x20, #1, MUL VL]\n"
"addvl x28, x28, #4\n"
- "ld1h { z18.h }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1h { z25.h }, p0/Z, [x23, #3, MUL VL]\n"
- "zip1 z17.h, z20.h, z16.h\n"
- "zip2 z24.h, z20.h, z16.h\n"
- "ld1h { z16.h }, p0/Z, [x22, #3, MUL VL]\n"
- "st1h { z22.h }, p2, [x20]\n"
+ "addvl x25, x25, #4\n"
+ "zip1 z23.h, z31.h, z21.h\n"
+ "st1h { z20.h }, p4, [x20, #2, MUL VL]\n"
"addvl x24, x24, #4\n"
"addvl x23, x23, #4\n"
- "st1h { z21.h }, p2, [x20, #1, MUL VL]\n"
- "addvl x22, x22, #4\n"
- "zip1 z23.h, z1.h, z0.h\n"
- "zip2 z22.h, z1.h, z0.h\n"
- "zip1 z21.h, z19.h, z18.h\n"
- "zip2 z20.h, z19.h, z18.h\n"
- "st1h { z29.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z28.h }, p2, [x20, #3, MUL VL]\n"
- "zip1 z19.h, z31.h, z30.h\n"
- "zip2 z18.h, z31.h, z30.h\n"
- "st1h { z17.h }, p2, [x20, #4, MUL VL]\n"
- "zip1 z17.h, z25.h, z16.h\n"
- "zip2 z16.h, z25.h, z16.h\n"
- "st1h { z24.h }, p2, [x20, #5, MUL VL]\n"
- "st1h { z21.h }, p2, [x20, #6, MUL VL]\n"
- "st1h { z20.h }, p2, [x20, #7, MUL VL]\n"
+ "zip2 z22.h, z31.h, z21.h\n"
+ "st1h { z19.h }, p4, [x20, #3, MUL VL]\n"
+ "zip1 z21.h, z30.h, z27.h\n"
+ "zip2 z20.h, z30.h, z27.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z17.h }, p4, [x20, #4, MUL VL]\n"
+ "zip1 z19.h, z29.h, z18.h\n"
+ "zip2 z18.h, z29.h, z18.h\n"
+ "st1h { z16.h }, p4, [x20, #5, MUL VL]\n"
+ "zip1 z17.h, z28.h, z26.h\n"
+ "zip2 z16.h, z28.h, z26.h\n"
+ "st1h { z25.h }, p4, [x20, #6, MUL VL]\n"
+ "st1h { z24.h }, p4, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
- "st1h { z27.h }, p2, [x20, #-8, MUL VL]\n"
- "st1h { z26.h }, p2, [x20, #-7, MUL VL]\n"
- "st1h { z23.h }, p2, [x20, #-6, MUL VL]\n"
- "st1h { z22.h }, p2, [x20, #-5, MUL VL]\n"
- "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
- "st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
- "st1h { z17.h }, p2, [x20, #-2, MUL VL]\n"
- "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "st1h { z23.h }, p4, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p4, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p4, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p4, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p4, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p4, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p4, [x20, #-1, MUL VL]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x4\n"
@@ -243,110 +243,110 @@ void sve_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t
"mov x28, %x[in]\n"
"mov x21, %x[width]\n"
"cnth x20, ALL, MUL #8\n"
+ "add x25, x28, %x[in_stride]\n"
"cmp %x[height], #0x1\n"
- "mov x25, %x[out]\n"
- "sub %x[height], %x[height], #0x2\n"
- "add x24, x28, %x[in_stride]\n"
- "add %x[in], x24, %x[in_stride]\n"
- "csel x24, x24, %x[pad_row], GT\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
"cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1h { z19.h }, p2/Z, [x28]\n"
- "ld1h { z18.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z17.h }, p4/Z, [x28]\n"
+ "ld1h { z20.h }, p4/Z, [x28, #1, MUL VL]\n"
"sub x21, x21, x20\n"
- "ld1h { z26.h }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1h { z0.h }, p2/Z, [x28, #3, MUL VL]\n"
"cmp x21, x20\n"
- "ld1h { z17.h }, p2/Z, [x24]\n"
- "ld1h { z16.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z25.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z24.h }, p2/Z, [x24, #3, MUL VL]\n"
- "ld1h { z31.h }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1h { z30.h }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1h { z29.h }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1h { z28.h }, p2/Z, [x28, #7, MUL VL]\n"
- "zip1 z23.h, z19.h, z17.h\n"
- "zip2 z22.h, z19.h, z17.h\n"
- "ld1h { z21.h }, p2/Z, [x24, #4, MUL VL]\n"
- "ld1h { z20.h }, p2/Z, [x24, #5, MUL VL]\n"
- "zip1 z19.h, z18.h, z16.h\n"
- "zip2 z18.h, z18.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x24, #6, MUL VL]\n"
- "ld1h { z27.h }, p2/Z, [x24, #7, MUL VL]\n"
- "zip1 z16.h, z26.h, z25.h\n"
- "zip2 z26.h, z26.h, z25.h\n"
- "zip1 z25.h, z0.h, z24.h\n"
- "zip2 z24.h, z0.h, z24.h\n"
- "st1h { z23.h }, p2, [x25]\n"
+ "ld1h { z23.h }, p4/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z19.h }, p4/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x25]\n"
+ "ld1h { z18.h }, p4/Z, [x25, #1, MUL VL]\n"
+ "zip1 z0.h, z17.h, z16.h\n"
+ "zip2 z22.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p4/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x25, #3, MUL VL]\n"
+ "zip1 z31.h, z20.h, z18.h\n"
+ "zip2 z30.h, z20.h, z18.h\n"
+ "ld1h { z21.h }, p4/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x28, #5, MUL VL]\n"
+ "zip1 z29.h, z23.h, z17.h\n"
+ "zip2 z28.h, z23.h, z17.h\n"
+ "ld1h { z27.h }, p4/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z26.h }, p4/Z, [x28, #7, MUL VL]\n"
+ "zip1 z25.h, z19.h, z16.h\n"
+ "zip2 z24.h, z19.h, z16.h\n"
+ "ld1h { z19.h }, p4/Z, [x25, #4, MUL VL]\n"
+ "ld1h { z18.h }, p4/Z, [x25, #5, MUL VL]\n"
"addvl x28, x28, #8\n"
- "st1h { z22.h }, p2, [x25, #1, MUL VL]\n"
- "addvl x24, x24, #8\n"
- "zip1 z23.h, z31.h, z21.h\n"
- "zip2 z22.h, z31.h, z21.h\n"
- "st1h { z19.h }, p2, [x25, #2, MUL VL]\n"
- "zip1 z21.h, z30.h, z20.h\n"
- "zip2 z20.h, z30.h, z20.h\n"
- "st1h { z18.h }, p2, [x25, #3, MUL VL]\n"
- "zip1 z19.h, z29.h, z17.h\n"
- "zip2 z18.h, z29.h, z17.h\n"
- "st1h { z16.h }, p2, [x25, #4, MUL VL]\n"
- "zip1 z17.h, z28.h, z27.h\n"
- "zip2 z16.h, z28.h, z27.h\n"
- "st1h { z26.h }, p2, [x25, #5, MUL VL]\n"
- "st1h { z25.h }, p2, [x25, #6, MUL VL]\n"
- "st1h { z24.h }, p2, [x25, #7, MUL VL]\n"
- "add x25, x25, %x[out_stride]\n"
- "st1h { z23.h }, p2, [x25]\n"
- "st1h { z22.h }, p2, [x25, #1, MUL VL]\n"
- "st1h { z21.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z20.h }, p2, [x25, #3, MUL VL]\n"
- "st1h { z19.h }, p2, [x25, #4, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #5, MUL VL]\n"
- "st1h { z17.h }, p2, [x25, #6, MUL VL]\n"
- "st1h { z16.h }, p2, [x25, #7, MUL VL]\n"
- "add x25, x25, %x[out_stride]\n"
+ "zip1 z23.h, z21.h, z19.h\n"
+ "ld1h { z17.h }, p4/Z, [x25, #6, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x25, #7, MUL VL]\n"
+ "st1h { z0.h }, p4, [x22]\n"
+ "addvl x25, x25, #8\n"
+ "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+ "zip2 z22.h, z21.h, z19.h\n"
+ "zip1 z21.h, z20.h, z18.h\n"
+ "st1h { z31.h }, p4, [x22, #2, MUL VL]\n"
+ "zip2 z20.h, z20.h, z18.h\n"
+ "zip1 z19.h, z27.h, z17.h\n"
+ "st1h { z30.h }, p4, [x22, #3, MUL VL]\n"
+ "zip2 z18.h, z27.h, z17.h\n"
+ "zip1 z17.h, z26.h, z16.h\n"
+ "st1h { z29.h }, p4, [x22, #4, MUL VL]\n"
+ "zip2 z16.h, z26.h, z16.h\n"
+ "st1h { z28.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z25.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z24.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z23.h }, p4, [x22]\n"
+ "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z21.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"mov x20, x21\n"
- "decw x21, ALL, MUL #8\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z22.h }, p0/Z, [x28]\n"
+ "ld1h { z21.h }, p0/Z, [x25]\n"
"dech x20\n"
"whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x25, #1, MUL VL]\n"
"dech x20\n"
- "ld1h { z18.h }, p1/Z, [x28]\n"
- "ld1h { z17.h }, p1/Z, [x24]\n"
- "whilelt p1.h, XZR, x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z17.h }, p0/Z, [x25, #2, MUL VL]\n"
"dech x20\n"
- "ld1h { z20.h }, p0/Z, [x28, #1, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x24, #1, MUL VL]\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z23.h, z18.h, z17.h\n"
"whilelt p0.h, XZR, x20\n"
+ "ld1h { z24.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "decw x21, ALL, MUL #8\n"
"cmp x21, #0x0\n"
- "ld1h { z18.h }, p1/Z, [x28, #2, MUL VL]\n"
- "ld1h { z17.h }, p1/Z, [x24, #2, MUL VL]\n"
- "zip1 z22.h, z20.h, z16.h\n"
- "zip2 z21.h, z20.h, z16.h\n"
- "ld1h { z20.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "zip1 z16.h, z22.h, z21.h\n"
+ "zip2 z22.h, z22.h, z21.h\n"
"addvl x28, x28, #4\n"
- "ld1h { z16.h }, p0/Z, [x24, #3, MUL VL]\n"
- "addvl x24, x24, #4\n"
- "st1h { z19.h }, p2, [x25]\n"
+ "addvl x25, x25, #4\n"
+ "zip1 z21.h, z20.h, z19.h\n"
+ "zip2 z20.h, z20.h, z19.h\n"
"zip1 z19.h, z18.h, z17.h\n"
"zip2 z18.h, z18.h, z17.h\n"
- "st1h { z23.h }, p2, [x25, #1, MUL VL]\n"
- "zip1 z17.h, z20.h, z16.h\n"
- "zip2 z16.h, z20.h, z16.h\n"
- "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z21.h }, p2, [x25, #3, MUL VL]\n"
- "st1h { z19.h }, p2, [x25, #4, MUL VL]\n"
- "st1h { z18.h }, p2, [x25, #5, MUL VL]\n"
- "st1h { z17.h }, p2, [x25, #6, MUL VL]\n"
- "st1h { z16.h }, p2, [x25, #7, MUL VL]\n"
- "add x25, x25, %x[out_stride]\n"
+ "st1h { z16.h }, p4, [x22]\n"
+ "zip1 z17.h, z24.h, z23.h\n"
+ "zip2 z16.h, z24.h, z23.h\n"
+ "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z21.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -355,7 +355,7 @@ void sve_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t
"12:" // Done
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
index 8ed1879643..1d2c0742ea 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
@@ -44,155 +44,155 @@ void sve_transpose_interleave_8VL_2x4(uint16_t *out, const uint16_t *in, size_t
"blt 6f\n"
"1:" // Main row loop: Head
"mov x12, %x[in]\n"
- "mov x11, %x[width]\n"
- "cnth x10, ALL, MUL #4\n"
- "mov x9, %x[out]\n"
- "sub %x[height], %x[height], #0x8\n"
- "add x28, x12, %x[in_stride]\n"
- "add x27, x28, %x[in_stride]\n"
- "add x26, x27, %x[in_stride]\n"
- "add x25, x26, %x[in_stride]\n"
- "cmp x11, x10\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x27, %x[width]\n"
+ "cnth x26, ALL, MUL #4\n"
+ "add x25, x28, %x[in_stride]\n"
"add x24, x25, %x[in_stride]\n"
"add x23, x24, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add %x[in], x22, %x[in_stride]\n"
+ "cmp x27, x26\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1h { z22.h }, p2/Z, [x12]\n"
- "ld1h { z2.h }, p2/Z, [x12, #1, MUL VL]\n"
- "mov x21, x9\n"
- "add x9, x9, %x[out_stride]\n"
- "ld1h { z23.h }, p2/Z, [x28]\n"
- "ld1h { z5.h }, p2/Z, [x28, #1, MUL VL]\n"
- "mov x20, x9\n"
- "sub x11, x11, x10\n"
- "ld1h { z3.h }, p2/Z, [x27]\n"
- "ld1h { z8.h }, p2/Z, [x27, #1, MUL VL]\n"
- "cmp x11, x10\n"
- "add x9, x9, %x[out_stride]\n"
- "ld1h { z25.h }, p2/Z, [x26]\n"
- "ld1h { z26.h }, p2/Z, [x26, #1, MUL VL]\n"
- "ld1h { z6.h }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1h { z12.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1h { z21.h }, p2/Z, [x12]\n"
+ "ld1h { z17.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z31.h }, p2/Z, [x11]\n"
+ "ld1h { z5.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "sub x27, x27, x26\n"
+ "ld1h { z15.h }, p2/Z, [x10]\n"
+ "ld1h { z28.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z24.h, z21.h, z15.h\n"
+ "zip2 z29.h, z21.h, z15.h\n"
+ "ld1h { z6.h }, p2/Z, [x9]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z16.h, z31.h, z6.h\n"
+ "zip2 z18.h, z31.h, z6.h\n"
+ "ld1h { z3.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z25.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "zip1 z20.h, z17.h, z28.h\n"
+ "zip1 z7.h, z5.h, z4.h\n"
+ "ld1h { z27.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "zip2 z2.h, z17.h, z28.h\n"
+ "zip2 z19.h, z5.h, z4.h\n"
+ "ld1h { z28.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x10, #3, MUL VL]\n"
+ "zip1 z21.h, z24.h, z16.h\n"
+ "zip2 z24.h, z24.h, z16.h\n"
+ "ld1h { z5.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "zip1 z14.h, z29.h, z18.h\n"
+ "zip2 z12.h, z29.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x28]\n"
+ "ld1h { z31.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "zip1 z11.h, z20.h, z7.h\n"
+ "zip2 z13.h, z20.h, z7.h\n"
+ "ld1h { z4.h }, p2/Z, [x25]\n"
+ "ld1h { z26.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z15.h, z2.h, z19.h\n"
+ "zip2 z10.h, z2.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x24]\n"
+ "ld1h { z30.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip1 z19.h, z18.h, z16.h\n"
+ "zip2 z18.h, z18.h, z16.h\n"
+ "ld1h { z8.h }, p2/Z, [x23]\n"
+ "ld1h { z29.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z20.h, z4.h, z8.h\n"
+ "zip2 z0.h, z4.h, z8.h\n"
+ "ld1h { z6.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z8.h }, p2/Z, [x28, #3, MUL VL]\n"
+ "zip1 z23.h, z31.h, z30.h\n"
+ "zip1 z16.h, z26.h, z29.h\n"
+ "ld1h { z9.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z7.h }, p2/Z, [x25, #3, MUL VL]\n"
+ "zip2 z31.h, z31.h, z30.h\n"
+ "zip2 z30.h, z26.h, z29.h\n"
+ "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x24, #3, MUL VL]\n"
+ "zip1 z29.h, z3.h, z28.h\n"
+ "zip1 z4.h, z27.h, z5.h\n"
+ "zip2 z28.h, z3.h, z28.h\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z27.h, z27.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x23, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x21]\n"
+ "zip1 z21.h, z25.h, z17.h\n"
+ "zip2 z25.h, z25.h, z17.h\n"
+ "cmp x27, x26\n"
+ "st1h { z24.h }, p2, [x21, #1, MUL VL]\n"
+ "zip1 z24.h, z22.h, z1.h\n"
+ "zip2 z22.h, z22.h, z1.h\n"
"addvl x12, x12, #4\n"
- "ld1h { z15.h }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1h { z11.h }, p2/Z, [x28, #3, MUL VL]\n"
- "zip1 z18.h, z22.h, z3.h\n"
- "zip2 z24.h, z22.h, z3.h\n"
- "ld1h { z14.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z13.h }, p2/Z, [x27, #3, MUL VL]\n"
- "zip1 z4.h, z23.h, z25.h\n"
- "zip2 z16.h, z23.h, z25.h\n"
- "ld1h { z7.h }, p2/Z, [x26, #2, MUL VL]\n"
- "ld1h { z23.h }, p2/Z, [x26, #3, MUL VL]\n"
- "zip1 z21.h, z2.h, z8.h\n"
- "zip1 z22.h, z5.h, z26.h\n"
- "ld1h { z1.h }, p2/Z, [x25]\n"
- "ld1h { z25.h }, p2/Z, [x25, #1, MUL VL]\n"
- "zip2 z10.h, z2.h, z8.h\n"
- "zip2 z5.h, z5.h, z26.h\n"
- "ld1h { z19.h }, p2/Z, [x24]\n"
- "ld1h { z31.h }, p2/Z, [x24, #1, MUL VL]\n"
- "zip1 z17.h, z18.h, z4.h\n"
- "zip2 z28.h, z18.h, z4.h\n"
- "ld1h { z3.h }, p2/Z, [x23]\n"
- "ld1h { z18.h }, p2/Z, [x23, #1, MUL VL]\n"
- "zip1 z0.h, z24.h, z16.h\n"
- "zip2 z29.h, z24.h, z16.h\n"
- "ld1h { z2.h }, p2/Z, [x22]\n"
- "ld1h { z16.h }, p2/Z, [x22, #1, MUL VL]\n"
- "zip1 z27.h, z21.h, z22.h\n"
- "zip2 z26.h, z21.h, z22.h\n"
- "ld1h { z8.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z4.h }, p2/Z, [x25, #3, MUL VL]\n"
- "zip1 z24.h, z10.h, z5.h\n"
- "zip2 z20.h, z10.h, z5.h\n"
- "ld1h { z30.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z5.h }, p2/Z, [x24, #3, MUL VL]\n"
- "zip1 z21.h, z1.h, z3.h\n"
- "zip2 z9.h, z1.h, z3.h\n"
- "ld1h { z10.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z3.h }, p2/Z, [x23, #3, MUL VL]\n"
- "zip1 z22.h, z19.h, z2.h\n"
- "zip2 z19.h, z19.h, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1h { z1.h }, p2/Z, [x22, #3, MUL VL]\n"
- "st1h { z17.h }, p2, [x21]\n"
- "zip1 z17.h, z25.h, z18.h\n"
- "zip2 z25.h, z25.h, z18.h\n"
- "zip1 z18.h, z31.h, z16.h\n"
- "st1h { z28.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x21, #2, MUL VL]\n"
+ "zip1 z17.h, z19.h, z20.h\n"
+ "zip2 z20.h, z19.h, z20.h\n"
+ "addvl x11, x11, #4\n"
+ "st1h { z12.h }, p2, [x21, #3, MUL VL]\n"
+ "zip1 z19.h, z18.h, z0.h\n"
+ "zip2 z18.h, z18.h, z0.h\n"
+ "addvl x10, x10, #4\n"
+ "st1h { z11.h }, p2, [x21, #4, MUL VL]\n"
+ "zip1 z14.h, z23.h, z16.h\n"
+ "zip2 z16.h, z23.h, z16.h\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z13.h }, p2, [x21, #5, MUL VL]\n"
+ "zip1 z23.h, z31.h, z30.h\n"
+ "zip2 z1.h, z31.h, z30.h\n"
"addvl x28, x28, #4\n"
- "zip2 z16.h, z31.h, z16.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "zip1 z0.h, z6.h, z14.h\n"
- "addvl x27, x27, #4\n"
- "st1h { z29.h }, p2, [x21, #3, MUL VL]\n"
- "zip1 z31.h, z15.h, z7.h\n"
- "zip2 z29.h, z6.h, z14.h\n"
- "addvl x26, x26, #4\n"
- "st1h { z27.h }, p2, [x21, #4, MUL VL]\n"
- "zip2 z28.h, z15.h, z7.h\n"
- "zip1 z27.h, z12.h, z13.h\n"
+ "st1h { z15.h }, p2, [x21, #6, MUL VL]\n"
+ "zip1 z0.h, z29.h, z4.h\n"
+ "zip2 z31.h, z29.h, z4.h\n"
"addvl x25, x25, #4\n"
- "st1h { z26.h }, p2, [x21, #5, MUL VL]\n"
- "zip1 z26.h, z11.h, z23.h\n"
- "zip2 z6.h, z12.h, z13.h\n"
+ "st1h { z10.h }, p2, [x21, #7, MUL VL]\n"
+ "addvl x21, x21, #16\n"
+ "zip1 z30.h, z28.h, z27.h\n"
+ "zip2 z29.h, z28.h, z27.h\n"
+ "st1h { z17.h }, p2, [x21, #-8, MUL VL]\n"
+ "zip1 z13.h, z21.h, z24.h\n"
+ "zip2 z27.h, z21.h, z24.h\n"
"addvl x24, x24, #4\n"
- "st1h { z24.h }, p2, [x21, #6, MUL VL]\n"
- "zip2 z24.h, z11.h, z23.h\n"
- "zip1 z23.h, z21.h, z22.h\n"
+ "st1h { z20.h }, p2, [x21, #-7, MUL VL]\n"
+ "zip1 z28.h, z25.h, z22.h\n"
+ "zip2 z25.h, z25.h, z22.h\n"
"addvl x23, x23, #4\n"
- "st1h { z20.h }, p2, [x21, #7, MUL VL]\n"
- "addvl x21, x21, #16\n"
- "zip2 z22.h, z21.h, z22.h\n"
- "zip1 z21.h, z9.h, z19.h\n"
- "zip2 z20.h, z9.h, z19.h\n"
- "zip1 z19.h, z17.h, z18.h\n"
- "addvl x22, x22, #4\n"
- "zip2 z18.h, z17.h, z18.h\n"
- "zip1 z17.h, z25.h, z16.h\n"
- "zip2 z16.h, z25.h, z16.h\n"
- "st1h { z23.h }, p2, [x21, #-8, MUL VL]\n"
- "zip1 z23.h, z0.h, z31.h\n"
- "st1h { z22.h }, p2, [x21, #-7, MUL VL]\n"
- "zip2 z0.h, z0.h, z31.h\n"
- "zip1 z31.h, z29.h, z28.h\n"
- "st1h { z21.h }, p2, [x21, #-6, MUL VL]\n"
- "zip2 z29.h, z29.h, z28.h\n"
- "zip1 z28.h, z27.h, z26.h\n"
- "st1h { z20.h }, p2, [x21, #-5, MUL VL]\n"
- "zip2 z27.h, z27.h, z26.h\n"
- "zip1 z26.h, z6.h, z24.h\n"
- "st1h { z19.h }, p2, [x21, #-4, MUL VL]\n"
- "zip2 z25.h, z6.h, z24.h\n"
- "zip1 z22.h, z8.h, z10.h\n"
- "st1h { z18.h }, p2, [x21, #-3, MUL VL]\n"
- "zip1 z21.h, z30.h, z2.h\n"
- "zip2 z20.h, z8.h, z10.h\n"
- "st1h { z17.h }, p2, [x21, #-2, MUL VL]\n"
- "zip2 z19.h, z30.h, z2.h\n"
- "zip1 z18.h, z4.h, z3.h\n"
- "st1h { z16.h }, p2, [x21, #-1, MUL VL]\n"
- "zip1 z17.h, z5.h, z1.h\n"
- "zip2 z24.h, z4.h, z3.h\n"
- "zip2 z16.h, z5.h, z1.h\n"
- "st1h { z23.h }, p2, [x20]\n"
+ "st1h { z19.h }, p2, [x21, #-6, MUL VL]\n"
+ "zip1 z22.h, z6.h, z2.h\n"
+ "zip1 z21.h, z9.h, z3.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z18.h }, p2, [x21, #-5, MUL VL]\n"
+ "zip2 z20.h, z6.h, z2.h\n"
+ "zip2 z19.h, z9.h, z3.h\n"
+ "st1h { z14.h }, p2, [x21, #-4, MUL VL]\n"
+ "zip1 z18.h, z8.h, z26.h\n"
+ "zip1 z17.h, z7.h, z5.h\n"
+ "st1h { z16.h }, p2, [x21, #-3, MUL VL]\n"
+ "zip2 z24.h, z8.h, z26.h\n"
+ "zip2 z16.h, z7.h, z5.h\n"
+ "st1h { z23.h }, p2, [x21, #-2, MUL VL]\n"
"zip1 z23.h, z22.h, z21.h\n"
- "st1h { z0.h }, p2, [x20, #1, MUL VL]\n"
"zip2 z22.h, z22.h, z21.h\n"
+ "st1h { z1.h }, p2, [x21, #-1, MUL VL]\n"
"zip1 z21.h, z20.h, z19.h\n"
- "st1h { z31.h }, p2, [x20, #2, MUL VL]\n"
"zip2 z20.h, z20.h, z19.h\n"
+ "st1h { z0.h }, p2, [x20]\n"
"zip1 z19.h, z18.h, z17.h\n"
- "st1h { z29.h }, p2, [x20, #3, MUL VL]\n"
"zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z31.h }, p2, [x20, #1, MUL VL]\n"
"zip1 z17.h, z24.h, z16.h\n"
- "st1h { z28.h }, p2, [x20, #4, MUL VL]\n"
"zip2 z16.h, z24.h, z16.h\n"
+ "st1h { z30.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z29.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z13.h }, p2, [x20, #4, MUL VL]\n"
"st1h { z27.h }, p2, [x20, #5, MUL VL]\n"
- "st1h { z26.h }, p2, [x20, #6, MUL VL]\n"
+ "st1h { z28.h }, p2, [x20, #6, MUL VL]\n"
"st1h { z25.h }, p2, [x20, #7, MUL VL]\n"
"addvl x20, x20, #16\n"
"st1h { z23.h }, p2, [x20, #-8, MUL VL]\n"
@@ -205,84 +205,84 @@ void sve_transpose_interleave_8VL_2x4(uint16_t *out, const uint16_t *in, size_t
"st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x11, 5f\n"
+ "cbz x27, 5f\n"
"4:" // Main row loop: Column loop
- "mov x21, x11\n"
- "mov x20, x9\n"
- "decd x11, ALL, MUL #8\n"
- "add x9, x9, %x[out_stride]\n"
- "whilelt p1.h, XZR, x21\n"
- "dech x21\n"
- "whilelt p0.h, XZR, x21\n"
- "cmp x11, #0x0\n"
- "ld1h { z24.h }, p1/Z, [x12]\n"
- "ld1h { z23.h }, p1/Z, [x28]\n"
- "ld1h { z22.h }, p1/Z, [x27]\n"
- "ld1h { z16.h }, p1/Z, [x26]\n"
- "ld1h { z2.h }, p1/Z, [x25]\n"
- "ld1h { z1.h }, p1/Z, [x24]\n"
- "ld1h { z21.h }, p0/Z, [x12, #1, MUL VL]\n"
- "ld1h { z26.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "mov x20, x27\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z17.h }, p1/Z, [x12]\n"
+ "ld1h { z19.h }, p1/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z24.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x10]\n"
+ "ld1h { z20.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "zip1 z1.h, z17.h, z16.h\n"
+ "zip2 z22.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x9]\n"
+ "ld1h { z17.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z16.h, z19.h, z18.h\n"
+ "zip2 z19.h, z19.h, z18.h\n"
+ "ld1h { z0.h }, p1/Z, [x28]\n"
+ "ld1h { z31.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "zip1 z25.h, z24.h, z20.h\n"
+ "zip1 z21.h, z23.h, z17.h\n"
+ "ld1h { z30.h }, p1/Z, [x25]\n"
+ "ld1h { z29.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "zip2 z28.h, z24.h, z20.h\n"
+ "zip2 z24.h, z23.h, z17.h\n"
+ "ld1h { z20.h }, p1/Z, [x24]\n"
+ "ld1h { z27.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "decd x27, ALL, MUL #8\n"
+ "ld1h { z23.h }, p1/Z, [x23]\n"
+ "ld1h { z26.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "zip1 z18.h, z1.h, z16.h\n"
+ "zip2 z17.h, z1.h, z16.h\n"
+ "zip1 z16.h, z22.h, z19.h\n"
+ "zip2 z19.h, z22.h, z19.h\n"
+ "st1h { z18.h }, p2, [x20]\n"
+ "cmp x27, #0x0\n"
+ "zip1 z22.h, z25.h, z21.h\n"
+ "zip2 z21.h, z25.h, z21.h\n"
+ "st1h { z17.h }, p2, [x20, #1, MUL VL]\n"
"addvl x12, x12, #2\n"
+ "zip1 z25.h, z28.h, z24.h\n"
+ "zip2 z18.h, z28.h, z24.h\n"
+ "st1h { z16.h }, p2, [x20, #2, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ "zip1 z17.h, z0.h, z20.h\n"
+ "zip1 z16.h, z30.h, z23.h\n"
+ "st1h { z19.h }, p2, [x20, #3, MUL VL]\n"
+ "addvl x10, x10, #2\n"
+ "zip2 z20.h, z0.h, z20.h\n"
+ "zip2 z19.h, z30.h, z23.h\n"
+ "st1h { z22.h }, p2, [x20, #4, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z24.h, z31.h, z27.h\n"
+ "zip1 z23.h, z29.h, z26.h\n"
+ "st1h { z21.h }, p2, [x20, #5, MUL VL]\n"
"addvl x28, x28, #2\n"
- "ld1h { z20.h }, p0/Z, [x27, #1, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x26, #1, MUL VL]\n"
- "zip1 z18.h, z24.h, z22.h\n"
- "zip1 z17.h, z23.h, z16.h\n"
- "ld1h { z0.h }, p0/Z, [x25, #1, MUL VL]\n"
- "ld1h { z31.h }, p0/Z, [x24, #1, MUL VL]\n"
- "zip2 z25.h, z24.h, z22.h\n"
- "zip2 z16.h, z23.h, z16.h\n"
- "ld1h { z30.h }, p1/Z, [x23]\n"
- "ld1h { z29.h }, p0/Z, [x23, #1, MUL VL]\n"
- "addvl x27, x27, #2\n"
- "addvl x26, x26, #2\n"
- "ld1h { z24.h }, p1/Z, [x22]\n"
- "ld1h { z28.h }, p0/Z, [x22, #1, MUL VL]\n"
- "zip1 z23.h, z21.h, z20.h\n"
- "zip1 z22.h, z26.h, z19.h\n"
- "zip2 z21.h, z21.h, z20.h\n"
- "zip2 z20.h, z26.h, z19.h\n"
+ "zip2 z22.h, z31.h, z27.h\n"
+ "zip2 z21.h, z29.h, z26.h\n"
+ "st1h { z25.h }, p2, [x20, #6, MUL VL]\n"
"addvl x25, x25, #2\n"
+ "st1h { z18.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
"addvl x24, x24, #2\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z18.h, z18.h, z17.h\n"
+ "zip1 z18.h, z17.h, z16.h\n"
"addvl x23, x23, #2\n"
- "addvl x22, x22, #2\n"
- "zip1 z17.h, z25.h, z16.h\n"
- "zip2 z16.h, z25.h, z16.h\n"
- "zip1 z27.h, z23.h, z22.h\n"
- "zip2 z23.h, z23.h, z22.h\n"
- "zip1 z26.h, z21.h, z20.h\n"
- "zip2 z25.h, z21.h, z20.h\n"
- "st1h { z19.h }, p2, [x20]\n"
- "zip1 z22.h, z2.h, z30.h\n"
- "zip1 z21.h, z1.h, z24.h\n"
- "st1h { z18.h }, p2, [x20, #1, MUL VL]\n"
- "zip2 z20.h, z2.h, z30.h\n"
- "zip2 z19.h, z1.h, z24.h\n"
- "st1h { z17.h }, p2, [x20, #2, MUL VL]\n"
- "zip1 z18.h, z0.h, z29.h\n"
- "zip1 z17.h, z31.h, z28.h\n"
- "st1h { z16.h }, p2, [x20, #3, MUL VL]\n"
- "zip2 z24.h, z0.h, z29.h\n"
- "zip2 z16.h, z31.h, z28.h\n"
- "st1h { z27.h }, p2, [x20, #4, MUL VL]\n"
- "st1h { z23.h }, p2, [x20, #5, MUL VL]\n"
- "zip1 z23.h, z22.h, z21.h\n"
- "zip2 z22.h, z22.h, z21.h\n"
- "st1h { z26.h }, p2, [x20, #6, MUL VL]\n"
- "zip1 z21.h, z20.h, z19.h\n"
+ "zip2 z17.h, z17.h, z16.h\n"
+ "zip1 z16.h, z20.h, z19.h\n"
+ "st1h { z18.h }, p2, [x20, #-8, MUL VL]\n"
"zip2 z20.h, z20.h, z19.h\n"
- "st1h { z25.h }, p2, [x20, #7, MUL VL]\n"
- "addvl x20, x20, #16\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z18.h, z18.h, z17.h\n"
- "zip1 z17.h, z24.h, z16.h\n"
- "zip2 z16.h, z24.h, z16.h\n"
- "st1h { z23.h }, p2, [x20, #-8, MUL VL]\n"
- "st1h { z22.h }, p2, [x20, #-7, MUL VL]\n"
- "st1h { z21.h }, p2, [x20, #-6, MUL VL]\n"
+ "zip1 z19.h, z24.h, z23.h\n"
+ "st1h { z17.h }, p2, [x20, #-7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z18.h, z24.h, z23.h\n"
+ "zip1 z17.h, z22.h, z21.h\n"
+ "st1h { z16.h }, p2, [x20, #-6, MUL VL]\n"
+ "zip2 z16.h, z22.h, z21.h\n"
"st1h { z20.h }, p2, [x20, #-5, MUL VL]\n"
"st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
"st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
@@ -297,141 +297,141 @@ void sve_transpose_interleave_8VL_2x4(uint16_t *out, const uint16_t *in, size_t
"6:" // Main loop skip
"7:" // Tail row loop: Head
"mov x12, %x[in]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
"mov x21, %x[width]\n"
"cnth x20, ALL, MUL #4\n"
+ "add x9, x10, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x9, %x[out]\n"
- "add x28, x12, %x[in_stride]\n"
- "add x27, x28, %x[in_stride]\n"
- "add x26, x27, %x[in_stride]\n"
- "add %x[in], x26, %x[in_stride]\n"
- "csel x26, x26, %x[pad_row], GT\n"
- "csel x27, x27, %x[pad_row], GE\n"
+ "add %x[in], x9, %x[in_stride]\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "csel x10, x10, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x28, x28, %x[pad_row], GT\n"
+ "csel x11, x11, %x[pad_row], GT\n"
"cmp x21, x20\n"
+ "mov x22, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 9f\n"
"8:" // Tail row loop: Unroll column loop
- "ld1h { z18.h }, p2/Z, [x12]\n"
- "ld1h { z30.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x12]\n"
+ "ld1h { z22.h }, p2/Z, [x12, #1, MUL VL]\n"
"sub x21, x21, x20\n"
- "ld1h { z24.h }, p2/Z, [x28]\n"
- "ld1h { z29.h }, p2/Z, [x28, #1, MUL VL]\n"
"cmp x21, x20\n"
- "ld1h { z17.h }, p2/Z, [x27]\n"
- "ld1h { z23.h }, p2/Z, [x27, #1, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x26]\n"
- "ld1h { z22.h }, p2/Z, [x26, #1, MUL VL]\n"
- "ld1h { z21.h }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1h { z4.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x11]\n"
+ "ld1h { z21.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x10]\n"
+ "ld1h { z18.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z4.h, z17.h, z16.h\n"
+ "zip2 z3.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9]\n"
+ "ld1h { z16.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z2.h, z19.h, z17.h\n"
+ "zip2 z1.h, z19.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z24.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "zip1 z0.h, z22.h, z18.h\n"
+ "zip1 z31.h, z21.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "zip2 z30.h, z22.h, z18.h\n"
+ "zip2 z23.h, z21.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x10, #3, MUL VL]\n"
+ "zip1 z22.h, z17.h, z16.h\n"
+ "zip2 z29.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "zip1 z21.h, z20.h, z17.h\n"
+ "zip2 z28.h, z20.h, z17.h\n"
+ "zip1 z27.h, z24.h, z18.h\n"
+ "zip1 z26.h, z19.h, z16.h\n"
"addvl x12, x12, #4\n"
- "ld1h { z28.h }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1h { z27.h }, p2/Z, [x28, #3, MUL VL]\n"
- "zip1 z3.h, z18.h, z17.h\n"
- "zip2 z2.h, z18.h, z17.h\n"
- "ld1h { z20.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z19.h }, p2/Z, [x27, #3, MUL VL]\n"
- "zip1 z18.h, z24.h, z16.h\n"
- "zip2 z1.h, z24.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x26, #2, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x26, #3, MUL VL]\n"
- "zip1 z26.h, z30.h, z23.h\n"
- "zip1 z25.h, z29.h, z22.h\n"
- "zip2 z24.h, z30.h, z23.h\n"
- "zip2 z23.h, z29.h, z22.h\n"
- "addvl x28, x28, #4\n"
- "addvl x27, x27, #4\n"
- "zip1 z22.h, z21.h, z20.h\n"
- "zip2 z0.h, z21.h, z20.h\n"
- "addvl x26, x26, #4\n"
- "zip1 z21.h, z28.h, z17.h\n"
- "zip2 z31.h, z28.h, z17.h\n"
- "zip1 z30.h, z4.h, z19.h\n"
- "zip1 z29.h, z27.h, z16.h\n"
- "zip2 z28.h, z4.h, z19.h\n"
- "zip2 z27.h, z27.h, z16.h\n"
- "zip1 z20.h, z3.h, z18.h\n"
- "zip2 z19.h, z3.h, z18.h\n"
- "zip1 z18.h, z2.h, z1.h\n"
- "zip2 z17.h, z2.h, z1.h\n"
- "zip1 z16.h, z26.h, z25.h\n"
- "zip2 z26.h, z26.h, z25.h\n"
- "zip1 z25.h, z24.h, z23.h\n"
- "zip2 z24.h, z24.h, z23.h\n"
- "st1h { z20.h }, p2, [x9]\n"
- "st1h { z19.h }, p2, [x9, #1, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "zip2 z25.h, z24.h, z18.h\n"
+ "zip2 z24.h, z19.h, z16.h\n"
+ "addvl x10, x10, #4\n"
+ "addvl x9, x9, #4\n"
+ "zip1 z16.h, z4.h, z2.h\n"
+ "zip2 z17.h, z4.h, z2.h\n"
+ "st1h { z16.h }, p2, [x22]\n"
+ "zip1 z16.h, z3.h, z1.h\n"
+ "zip2 z20.h, z3.h, z1.h\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
+ "zip1 z19.h, z0.h, z31.h\n"
+ "zip2 z18.h, z0.h, z31.h\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "zip1 z17.h, z30.h, z23.h\n"
+ "zip2 z16.h, z30.h, z23.h\n"
+ "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
"zip1 z23.h, z22.h, z21.h\n"
"zip2 z22.h, z22.h, z21.h\n"
- "st1h { z18.h }, p2, [x9, #2, MUL VL]\n"
- "zip1 z21.h, z0.h, z31.h\n"
- "zip2 z20.h, z0.h, z31.h\n"
- "st1h { z17.h }, p2, [x9, #3, MUL VL]\n"
- "zip1 z19.h, z30.h, z29.h\n"
- "zip2 z18.h, z30.h, z29.h\n"
- "st1h { z16.h }, p2, [x9, #4, MUL VL]\n"
- "zip1 z17.h, z28.h, z27.h\n"
- "zip2 z16.h, z28.h, z27.h\n"
- "st1h { z26.h }, p2, [x9, #5, MUL VL]\n"
- "st1h { z25.h }, p2, [x9, #6, MUL VL]\n"
- "st1h { z24.h }, p2, [x9, #7, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
- "st1h { z23.h }, p2, [x9]\n"
- "st1h { z22.h }, p2, [x9, #1, MUL VL]\n"
- "st1h { z21.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z20.h }, p2, [x9, #3, MUL VL]\n"
- "st1h { z19.h }, p2, [x9, #4, MUL VL]\n"
- "st1h { z18.h }, p2, [x9, #5, MUL VL]\n"
- "st1h { z17.h }, p2, [x9, #6, MUL VL]\n"
- "st1h { z16.h }, p2, [x9, #7, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
+ "st1h { z18.h }, p2, [x22, #5, MUL VL]\n"
+ "zip1 z21.h, z29.h, z28.h\n"
+ "zip2 z20.h, z29.h, z28.h\n"
+ "st1h { z17.h }, p2, [x22, #6, MUL VL]\n"
+ "zip1 z19.h, z27.h, z26.h\n"
+ "zip2 z18.h, z27.h, z26.h\n"
+ "st1h { z16.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z17.h, z25.h, z24.h\n"
+ "zip2 z16.h, z25.h, z24.h\n"
+ "st1h { z23.h }, p2, [x22]\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bge 8b\n"
"9:" // Tail row loop: Unroll column loop skip
"cbz x21, 11f\n"
"10:" // Tail row loop: Column loop
"mov x20, x21\n"
- "decd x21, ALL, MUL #8\n"
"whilelt p1.h, XZR, x20\n"
+ "ld1h { z23.h }, p1/Z, [x12]\n"
+ "ld1h { z22.h }, p1/Z, [x11]\n"
"dech x20\n"
"whilelt p0.h, XZR, x20\n"
+ "ld1h { z21.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z25.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x10]\n"
+ "ld1h { z20.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "decd x21, ALL, MUL #8\n"
+ "zip1 z24.h, z23.h, z19.h\n"
+ "ld1h { z18.h }, p1/Z, [x9]\n"
+ "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z17.h, z22.h, z18.h\n"
+ "zip2 z23.h, z23.h, z19.h\n"
+ "zip2 z19.h, z22.h, z18.h\n"
+ "zip1 z22.h, z21.h, z20.h\n"
"cmp x21, #0x0\n"
- "ld1h { z20.h }, p1/Z, [x12]\n"
- "ld1h { z19.h }, p1/Z, [x28]\n"
- "ld1h { z18.h }, p1/Z, [x27]\n"
- "ld1h { z17.h }, p1/Z, [x26]\n"
- "ld1h { z24.h }, p0/Z, [x12, #1, MUL VL]\n"
"addvl x12, x12, #2\n"
- "ld1h { z25.h }, p0/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
- "ld1h { z23.h }, p0/Z, [x27, #1, MUL VL]\n"
- "ld1h { z16.h }, p0/Z, [x26, #1, MUL VL]\n"
- "addvl x27, x27, #2\n"
- "addvl x26, x26, #2\n"
- "zip1 z22.h, z20.h, z18.h\n"
- "zip1 z21.h, z19.h, z17.h\n"
- "zip2 z20.h, z20.h, z18.h\n"
- "zip2 z19.h, z19.h, z17.h\n"
- "zip1 z18.h, z24.h, z23.h\n"
- "zip1 z17.h, z25.h, z16.h\n"
- "zip2 z24.h, z24.h, z23.h\n"
- "zip2 z16.h, z25.h, z16.h\n"
- "zip1 z23.h, z22.h, z21.h\n"
- "zip2 z22.h, z22.h, z21.h\n"
- "zip1 z21.h, z20.h, z19.h\n"
- "zip2 z20.h, z20.h, z19.h\n"
- "zip1 z19.h, z18.h, z17.h\n"
- "zip2 z18.h, z18.h, z17.h\n"
- "zip1 z17.h, z24.h, z16.h\n"
- "zip2 z16.h, z24.h, z16.h\n"
- "st1h { z23.h }, p2, [x9]\n"
- "st1h { z22.h }, p2, [x9, #1, MUL VL]\n"
- "st1h { z21.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z20.h }, p2, [x9, #3, MUL VL]\n"
- "st1h { z19.h }, p2, [x9, #4, MUL VL]\n"
- "st1h { z18.h }, p2, [x9, #5, MUL VL]\n"
- "st1h { z17.h }, p2, [x9, #6, MUL VL]\n"
- "st1h { z16.h }, p2, [x9, #7, MUL VL]\n"
- "add x9, x9, %x[out_stride]\n"
+ "zip1 z18.h, z25.h, z16.h\n"
+ "zip2 z21.h, z21.h, z20.h\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "zip2 z20.h, z25.h, z16.h\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z16.h, z24.h, z17.h\n"
+ "st1h { z16.h }, p2, [x22]\n"
+ "zip2 z16.h, z24.h, z17.h\n"
+ "zip1 z17.h, z23.h, z19.h\n"
+ "st1h { z16.h }, p2, [x22, #1, MUL VL]\n"
+ "zip2 z16.h, z23.h, z19.h\n"
+ "zip1 z19.h, z22.h, z18.h\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "zip2 z18.h, z22.h, z18.h\n"
+ "zip1 z17.h, z21.h, z20.h\n"
+ "st1h { z16.h }, p2, [x22, #3, MUL VL]\n"
+ "zip2 z16.h, z21.h, z20.h\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
"bgt 10b\n"
"11:" // Tail row loop: Column loop skip
"cmp %x[height], #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
index 7160077342..ab3af6f88d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
@@ -39,220 +39,220 @@ void sve_transpose_interleave_8VL_2x4_fp32bf16(bfloat16 *out, const float *in, s
size_t out_stride = 8 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
__asm__ __volatile__(
- "ptrue p2.b\n"
+ "ptrue p4.b\n"
"1:" // Main row loop: Head
"mov x26, %x[in]\n"
- "mov x25, %x[width]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
"cnth x20, ALL, MUL #4\n"
+ "add x22, x24, %x[in_stride]\n"
"cmp %x[height], #0x3\n"
- "mov x24, %x[out]\n"
- "add x23, x26, %x[in_stride]\n"
- "add x22, x23, %x[in_stride]\n"
- "add x21, x22, %x[in_stride]\n"
- "add %x[in], x21, %x[in_stride]\n"
- "csel x21, x21, %x[pad_row], GT\n"
- "csel x22, x22, %x[pad_row], GE\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
"cmp %x[height], #0x1\n"
- "csel x23, x23, %x[pad_row], GT\n"
- "cmp x25, x20\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x23, x20\n"
+ "mov x21, %x[out]\n"
"sub %x[height], %x[height], #0x4\n"
"blt 3f\n"
"2:" // Main row loop: Unroll column loop
- "ld1w { z25.s }, p2/Z, [x26]\n"
- "ld1w { z23.s }, p2/Z, [x26, #1, MUL VL]\n"
- "sub x25, x25, x20\n"
- "ld1w { z5.s }, p2/Z, [x26, #2, MUL VL]\n"
- "ld1w { z31.s }, p2/Z, [x26, #3, MUL VL]\n"
- "cmp x25, x20\n"
- "ld1w { z16.s }, p2/Z, [x22]\n"
- "ld1w { z18.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x22, #3, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x26, #4, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x26, #5, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x26, #6, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x26, #7, MUL VL]\n"
- "zip1 z19.s, z25.s, z16.s\n"
- "zip2 z17.s, z25.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x22, #5, MUL VL]\n"
- "zip1 z11.s, z23.s, z18.s\n"
- "zip2 z10.s, z23.s, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x22, #6, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [x22, #7, MUL VL]\n"
- "zip1 z9.s, z5.s, z21.s\n"
- "zip2 z8.s, z5.s, z21.s\n"
- "ld1w { z12.s }, p2/Z, [x23]\n"
- "ld1w { z27.s }, p2/Z, [x23, #1, MUL VL]\n"
- "zip1 z7.s, z31.s, z14.s\n"
- "zip2 z6.s, z31.s, z14.s\n"
- "ld1w { z25.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [x23, #3, MUL VL]\n"
- "zip1 z5.s, z3.s, z16.s\n"
- "zip2 z4.s, z3.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "ld1w { z28.s }, p2/Z, [x21, #1, MUL VL]\n"
- "zip1 z3.s, z30.s, z13.s\n"
- "zip2 z2.s, z30.s, z13.s\n"
- "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z24.s }, p2/Z, [x21, #3, MUL VL]\n"
- "zip1 z1.s, z20.s, z18.s\n"
- "zip2 z0.s, z20.s, z18.s\n"
- "ld1w { z21.s }, p2/Z, [x23, #4, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [x23, #5, MUL VL]\n"
- "zip1 z31.s, z22.s, z15.s\n"
- "zip2 z30.s, z22.s, z15.s\n"
- "ld1w { z14.s }, p2/Z, [x23, #6, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [x23, #7, MUL VL]\n"
- ".inst 0x658aaa76 // bfcvt z22.h, p2/M, z19.s\n"
- "zip1 z29.s, z12.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x21, #4, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x21, #5, MUL VL]\n"
- ".inst 0x658aaa2d // bfcvt z13.h, p2/M, z17.s\n"
- "zip2 z12.s, z12.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x21, #6, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x21, #7, MUL VL]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- ".inst 0x658aa94a // bfcvt z10.h, p2/M, z10.s\n"
- ".inst 0x658aa929 // bfcvt z9.h, p2/M, z9.s\n"
- ".inst 0x658aa908 // bfcvt z8.h, p2/M, z8.s\n"
+ "ld1w { z19.s }, p4/Z, [x26]\n"
+ "ld1w { z18.s }, p4/Z, [x26, #1, MUL VL]\n"
+ "sub x23, x23, x20\n"
+ "cmp x23, x20\n"
+ "ld1w { z20.s }, p4/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x24]\n"
+ "ld1w { z17.s }, p4/Z, [x24, #1, MUL VL]\n"
+ "zip1 z22.s, z19.s, z23.s\n"
+ "zip2 z21.s, z19.s, z23.s\n"
+ "ld1w { z31.s }, p4/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.s, z18.s, z17.s\n"
+ "zip2 z7.s, z18.s, z17.s\n"
+ "ld1w { z19.s }, p4/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z18.s }, p4/Z, [x26, #5, MUL VL]\n"
+ "zip1 z6.s, z20.s, z31.s\n"
+ "zip2 z5.s, z20.s, z31.s\n"
+ "ld1w { z15.s }, p4/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x26, #7, MUL VL]\n"
+ "zip1 z3.s, z24.s, z16.s\n"
+ "zip2 z2.s, z24.s, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x24, #4, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x24, #5, MUL VL]\n"
+ "zip1 z1.s, z19.s, z16.s\n"
+ "zip2 z0.s, z19.s, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x24, #6, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x24, #7, MUL VL]\n"
+ "zip1 z31.s, z18.s, z17.s\n"
+ "zip2 z30.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p4/Z, [x25]\n"
+ "ld1w { z17.s }, p4/Z, [x25, #1, MUL VL]\n"
+ "zip1 z29.s, z15.s, z16.s\n"
+ "zip2 z28.s, z15.s, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x25, #3, MUL VL]\n"
+ "zip1 z27.s, z20.s, z19.s\n"
+ "zip2 z26.s, z20.s, z19.s\n"
+ "ld1w { z11.s }, p4/Z, [x22]\n"
+ "ld1w { z8.s }, p4/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x658ab2d8 // bfcvt z24.h, p4/M, z22.s\n"
+ "zip1 z25.s, z18.s, z11.s\n"
+ "ld1w { z4.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z22.s }, p4/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x658ab2af // bfcvt z15.h, p4/M, z21.s\n"
+ "zip2 z14.s, z18.s, z11.s\n"
+ "ld1w { z21.s }, p4/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25, #5, MUL VL]\n"
+ ".inst 0x658ab12d // bfcvt z13.h, p4/M, z9.s\n"
+ "zip1 z12.s, z17.s, z8.s\n"
+ "ld1w { z11.s }, p4/Z, [x25, #6, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x25, #7, MUL VL]\n"
+ ".inst 0x658ab0e9 // bfcvt z9.h, p4/M, z7.s\n"
+ "zip2 z8.s, z17.s, z8.s\n"
+ "ld1w { z19.s }, p4/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z18.s }, p4/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x658ab0c7 // bfcvt z7.h, p4/M, z6.s\n"
+ "zip1 z6.s, z16.s, z4.s\n"
+ "ld1w { z17.s }, p4/Z, [x22, #6, MUL VL]\n"
+ ".inst 0x658ab0a5 // bfcvt z5.h, p4/M, z5.s\n"
+ "zip2 z4.s, z16.s, z4.s\n"
+ "ld1w { z16.s }, p4/Z, [x22, #7, MUL VL]\n"
+ ".inst 0x658ab063 // bfcvt z3.h, p4/M, z3.s\n"
+ ".inst 0x658ab042 // bfcvt z2.h, p4/M, z2.s\n"
"addvl x26, x26, #8\n"
- "addvl x23, x23, #8\n"
- ".inst 0x658aa8e7 // bfcvt z7.h, p2/M, z7.s\n"
- ".inst 0x658aa8c6 // bfcvt z6.h, p2/M, z6.s\n"
+ "addvl x25, x25, #8\n"
+ ".inst 0x658ab021 // bfcvt z1.h, p4/M, z1.s\n"
+ ".inst 0x658ab000 // bfcvt z0.h, p4/M, z0.s\n"
+ "addvl x24, x24, #8\n"
"addvl x22, x22, #8\n"
- "addvl x21, x21, #8\n"
- ".inst 0x658aa8a5 // bfcvt z5.h, p2/M, z5.s\n"
- ".inst 0x658aa884 // bfcvt z4.h, p2/M, z4.s\n"
- ".inst 0x658aa863 // bfcvt z3.h, p2/M, z3.s\n"
- ".inst 0x658aa842 // bfcvt z2.h, p2/M, z2.s\n"
- ".inst 0x658aa821 // bfcvt z1.h, p2/M, z1.s\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
- ".inst 0x658aabde // bfcvt z30.h, p2/M, z30.s\n"
- ".inst 0x648aabb6 // bfcvtnt z22.h, p2/M, z29.s\n"
- "zip1 z29.s, z27.s, z28.s\n"
- "zip2 z28.s, z27.s, z28.s\n"
- "zip1 z27.s, z25.s, z26.s\n"
- "zip2 z26.s, z25.s, z26.s\n"
- "zip1 z25.s, z23.s, z24.s\n"
- "zip2 z24.s, z23.s, z24.s\n"
+ ".inst 0x658ab3ff // bfcvt z31.h, p4/M, z31.s\n"
+ ".inst 0x658ab3de // bfcvt z30.h, p4/M, z30.s\n"
+ ".inst 0x658ab3bd // bfcvt z29.h, p4/M, z29.s\n"
+ ".inst 0x658ab39c // bfcvt z28.h, p4/M, z28.s\n"
+ ".inst 0x658ab37b // bfcvt z27.h, p4/M, z27.s\n"
+ ".inst 0x658ab35a // bfcvt z26.h, p4/M, z26.s\n"
+ ".inst 0x648ab338 // bfcvtnt z24.h, p4/M, z25.s\n"
+ "zip1 z25.s, z23.s, z22.s\n"
+ "st1h { z24.h }, p4, [x21]\n"
+ "zip2 z24.s, z23.s, z22.s\n"
"zip1 z23.s, z21.s, z19.s\n"
- "st1h { z22.h }, p2, [x24]\n"
"zip2 z22.s, z21.s, z19.s\n"
"zip1 z21.s, z20.s, z18.s\n"
"zip2 z20.s, z20.s, z18.s\n"
- "zip1 z19.s, z14.s, z17.s\n"
- "zip2 z18.s, z14.s, z17.s\n"
- "zip1 z17.s, z15.s, z16.s\n"
- "zip2 z16.s, z15.s, z16.s\n"
- ".inst 0x648aa98d // bfcvtnt z13.h, p2/M, z12.s\n"
- ".inst 0x648aabab // bfcvtnt z11.h, p2/M, z29.s\n"
- ".inst 0x648aab8a // bfcvtnt z10.h, p2/M, z28.s\n"
- ".inst 0x648aab69 // bfcvtnt z9.h, p2/M, z27.s\n"
- ".inst 0x648aab48 // bfcvtnt z8.h, p2/M, z26.s\n"
- ".inst 0x648aab27 // bfcvtnt z7.h, p2/M, z25.s\n"
- ".inst 0x648aab06 // bfcvtnt z6.h, p2/M, z24.s\n"
- "st1h { z13.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z11.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aaae5 // bfcvtnt z5.h, p2/M, z23.s\n"
- ".inst 0x648aaac4 // bfcvtnt z4.h, p2/M, z22.s\n"
- "st1h { z10.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x648aaaa3 // bfcvtnt z3.h, p2/M, z21.s\n"
- ".inst 0x648aaa82 // bfcvtnt z2.h, p2/M, z20.s\n"
- "st1h { z9.h }, p2, [x24, #4, MUL VL]\n"
- ".inst 0x648aaa61 // bfcvtnt z1.h, p2/M, z19.s\n"
- ".inst 0x648aaa40 // bfcvtnt z0.h, p2/M, z18.s\n"
- "st1h { z8.h }, p2, [x24, #5, MUL VL]\n"
- ".inst 0x648aaa3f // bfcvtnt z31.h, p2/M, z17.s\n"
- ".inst 0x648aaa1e // bfcvtnt z30.h, p2/M, z16.s\n"
- "st1h { z7.h }, p2, [x24, #6, MUL VL]\n"
- "st1h { z6.h }, p2, [x24, #7, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
- "st1h { z5.h }, p2, [x24]\n"
- "st1h { z4.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z3.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z2.h }, p2, [x24, #3, MUL VL]\n"
- "st1h { z1.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z0.h }, p2, [x24, #5, MUL VL]\n"
- "st1h { z31.h }, p2, [x24, #6, MUL VL]\n"
- "st1h { z30.h }, p2, [x24, #7, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
+ "zip1 z19.s, z11.s, z17.s\n"
+ "zip2 z18.s, z11.s, z17.s\n"
+ "zip1 z17.s, z10.s, z16.s\n"
+ "zip2 z16.s, z10.s, z16.s\n"
+ ".inst 0x648ab1cf // bfcvtnt z15.h, p4/M, z14.s\n"
+ "st1h { z15.h }, p4, [x21, #1, MUL VL]\n"
+ ".inst 0x648ab18d // bfcvtnt z13.h, p4/M, z12.s\n"
+ ".inst 0x648ab109 // bfcvtnt z9.h, p4/M, z8.s\n"
+ "st1h { z13.h }, p4, [x21, #2, MUL VL]\n"
+ ".inst 0x648ab0c7 // bfcvtnt z7.h, p4/M, z6.s\n"
+ ".inst 0x648ab085 // bfcvtnt z5.h, p4/M, z4.s\n"
+ "st1h { z9.h }, p4, [x21, #3, MUL VL]\n"
+ ".inst 0x648ab323 // bfcvtnt z3.h, p4/M, z25.s\n"
+ ".inst 0x648ab302 // bfcvtnt z2.h, p4/M, z24.s\n"
+ "st1h { z7.h }, p4, [x21, #4, MUL VL]\n"
+ "st1h { z5.h }, p4, [x21, #5, MUL VL]\n"
+ ".inst 0x648ab2e1 // bfcvtnt z1.h, p4/M, z23.s\n"
+ ".inst 0x648ab2c0 // bfcvtnt z0.h, p4/M, z22.s\n"
+ "st1h { z3.h }, p4, [x21, #6, MUL VL]\n"
+ ".inst 0x648ab2bf // bfcvtnt z31.h, p4/M, z21.s\n"
+ ".inst 0x648ab29e // bfcvtnt z30.h, p4/M, z20.s\n"
+ "st1h { z2.h }, p4, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ ".inst 0x648ab27d // bfcvtnt z29.h, p4/M, z19.s\n"
+ ".inst 0x648ab25c // bfcvtnt z28.h, p4/M, z18.s\n"
+ ".inst 0x648ab23b // bfcvtnt z27.h, p4/M, z17.s\n"
+ ".inst 0x648ab21a // bfcvtnt z26.h, p4/M, z16.s\n"
+ "st1h { z1.h }, p4, [x21]\n"
+ "st1h { z0.h }, p4, [x21, #1, MUL VL]\n"
+ "st1h { z31.h }, p4, [x21, #2, MUL VL]\n"
+ "st1h { z30.h }, p4, [x21, #3, MUL VL]\n"
+ "st1h { z29.h }, p4, [x21, #4, MUL VL]\n"
+ "st1h { z28.h }, p4, [x21, #5, MUL VL]\n"
+ "st1h { z27.h }, p4, [x21, #6, MUL VL]\n"
+ "st1h { z26.h }, p4, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
"bge 2b\n"
"3:" // Main row loop: Unroll column loop skip
- "cbz x25, 5f\n"
+ "cbz x23, 5f\n"
"4:" // Main row loop: Column loop
- "mov x20, x25\n"
- "decd x25, ALL, MUL #8\n"
- "whilelt p1.s, XZR, x20\n"
+ "mov x20, x23\n"
+ "whilelt p3.s, XZR, x20\n"
+ "ld1w { z22.s }, p3/Z, [x26]\n"
+ "ld1w { z21.s }, p3/Z, [x24]\n"
"decw x20\n"
- "whilelt p0.s, XZR, x20\n"
+ "whilelt p2.s, XZR, x20\n"
+ "ld1w { z20.s }, p2/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x24, #1, MUL VL]\n"
"decw x20\n"
- "ld1w { z18.s }, p1/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- "ld1w { z26.s }, p1/Z, [x23]\n"
- "ld1w { z25.s }, p1/Z, [x21]\n"
"whilelt p1.s, XZR, x20\n"
+ "ld1w { z18.s }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x24, #2, MUL VL]\n"
"decw x20\n"
- "ld1w { z20.s }, p0/Z, [x26, #1, MUL VL]\n"
- "ld1w { z17.s }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1w { z24.s }, p0/Z, [x23, #1, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x21, #1, MUL VL]\n"
- "zip1 z19.s, z18.s, z16.s\n"
- "zip2 z16.s, z18.s, z16.s\n"
"whilelt p0.s, XZR, x20\n"
- "ld1w { z22.s }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z27.s }, p3/Z, [x25]\n"
+ "ld1w { z3.s }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z26.s, z22.s, z21.s\n"
+ "zip2 z25.s, z22.s, z21.s\n"
+ "ld1w { z2.s }, p1/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z1.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "zip1 z24.s, z20.s, z19.s\n"
+ "zip2 z23.s, z20.s, z19.s\n"
+ "ld1w { z22.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "zip1 z20.s, z18.s, z17.s\n"
+ "zip2 z19.s, z18.s, z17.s\n"
"ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
- "zip1 z3.s, z26.s, z25.s\n"
- "ld1w { z2.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z1.s }, p1/Z, [x21, #2, MUL VL]\n"
- "zip1 z21.s, z20.s, z17.s\n"
- "zip2 z17.s, z20.s, z17.s\n"
- ".inst 0x658aaa60 // bfcvt z0.h, p2/M, z19.s\n"
- ".inst 0x658aaa1f // bfcvt z31.h, p2/M, z16.s\n"
- "cmp x25, #0x0\n"
- "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
- "ld1w { z16.s }, p0/Z, [x22, #3, MUL VL]\n"
- "ld1w { z30.s }, p0/Z, [x23, #3, MUL VL]\n"
- "zip1 z19.s, z22.s, z18.s\n"
- "zip2 z18.s, z22.s, z18.s\n"
- "ld1w { z29.s }, p0/Z, [x21, #3, MUL VL]\n"
- "zip2 z28.s, z26.s, z25.s\n"
- ".inst 0x658aaabb // bfcvt z27.h, p2/M, z21.s\n"
+ "ld1w { z0.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "zip1 z17.s, z28.s, z16.s\n"
+ "zip2 z16.s, z28.s, z16.s\n"
+ "decd x23, ALL, MUL #8\n"
+ ".inst 0x658ab35f // bfcvt z31.h, p4/M, z26.s\n"
+ "zip1 z30.s, z27.s, z22.s\n"
+ "cmp x23, #0x0\n"
+ ".inst 0x658ab33d // bfcvt z29.h, p4/M, z25.s\n"
+ "zip2 z28.s, z27.s, z22.s\n"
"addvl x26, x26, #4\n"
- "zip1 z26.s, z24.s, z23.s\n"
- ".inst 0x658aaa39 // bfcvt z25.h, p2/M, z17.s\n"
- "addvl x23, x23, #4\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x658ab31b // bfcvt z27.h, p4/M, z24.s\n"
+ "zip1 z26.s, z3.s, z21.s\n"
+ "addvl x24, x24, #4\n"
"addvl x22, x22, #4\n"
- "zip1 z17.s, z20.s, z16.s\n"
- "zip2 z16.s, z20.s, z16.s\n"
- "addvl x21, x21, #4\n"
- "zip2 z24.s, z24.s, z23.s\n"
- ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
- "zip1 z22.s, z2.s, z1.s\n"
- ".inst 0x658aaa55 // bfcvt z21.h, p2/M, z18.s\n"
- "zip2 z20.s, z2.s, z1.s\n"
- ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
- "zip1 z18.s, z30.s, z29.s\n"
- ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
- "zip2 z16.s, z30.s, z29.s\n"
- ".inst 0x648aa860 // bfcvtnt z0.h, p2/M, z3.s\n"
- ".inst 0x648aab9f // bfcvtnt z31.h, p2/M, z28.s\n"
- ".inst 0x648aab5b // bfcvtnt z27.h, p2/M, z26.s\n"
- ".inst 0x648aab19 // bfcvtnt z25.h, p2/M, z24.s\n"
- ".inst 0x648aaad7 // bfcvtnt z23.h, p2/M, z22.s\n"
- ".inst 0x648aaa95 // bfcvtnt z21.h, p2/M, z20.s\n"
- ".inst 0x648aaa53 // bfcvtnt z19.h, p2/M, z18.s\n"
- "st1h { z0.h }, p2, [x24]\n"
- ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
- "st1h { z31.h }, p2, [x24, #1, MUL VL]\n"
- "st1h { z27.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z25.h }, p2, [x24, #3, MUL VL]\n"
- "st1h { z23.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z21.h }, p2, [x24, #5, MUL VL]\n"
- "st1h { z19.h }, p2, [x24, #6, MUL VL]\n"
- "st1h { z17.h }, p2, [x24, #7, MUL VL]\n"
- "add x24, x24, %x[out_stride]\n"
+ ".inst 0x658ab2f9 // bfcvt z25.h, p4/M, z23.s\n"
+ "zip2 z24.s, z3.s, z21.s\n"
+ ".inst 0x658ab297 // bfcvt z23.h, p4/M, z20.s\n"
+ "zip1 z22.s, z2.s, z18.s\n"
+ ".inst 0x658ab275 // bfcvt z21.h, p4/M, z19.s\n"
+ "zip2 z20.s, z2.s, z18.s\n"
+ ".inst 0x658ab233 // bfcvt z19.h, p4/M, z17.s\n"
+ "zip1 z18.s, z1.s, z0.s\n"
+ ".inst 0x658ab211 // bfcvt z17.h, p4/M, z16.s\n"
+ "zip2 z16.s, z1.s, z0.s\n"
+ ".inst 0x648ab3df // bfcvtnt z31.h, p4/M, z30.s\n"
+ ".inst 0x648ab39d // bfcvtnt z29.h, p4/M, z28.s\n"
+ "st1h { z31.h }, p4, [x21]\n"
+ ".inst 0x648ab35b // bfcvtnt z27.h, p4/M, z26.s\n"
+ ".inst 0x648ab319 // bfcvtnt z25.h, p4/M, z24.s\n"
+ "st1h { z29.h }, p4, [x21, #1, MUL VL]\n"
+ ".inst 0x648ab2d7 // bfcvtnt z23.h, p4/M, z22.s\n"
+ ".inst 0x648ab295 // bfcvtnt z21.h, p4/M, z20.s\n"
+ "st1h { z27.h }, p4, [x21, #2, MUL VL]\n"
+ ".inst 0x648ab253 // bfcvtnt z19.h, p4/M, z18.s\n"
+ ".inst 0x648ab211 // bfcvtnt z17.h, p4/M, z16.s\n"
+ "st1h { z25.h }, p4, [x21, #3, MUL VL]\n"
+ "st1h { z23.h }, p4, [x21, #4, MUL VL]\n"
+ "st1h { z21.h }, p4, [x21, #5, MUL VL]\n"
+ "st1h { z19.h }, p4, [x21, #6, MUL VL]\n"
+ "st1h { z17.h }, p4, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
"bgt 4b\n"
"5:" // Main row loop: Column loop skip
"cmp %x[height], #0x1\n"
@@ -260,7 +260,7 @@ void sve_transpose_interleave_8VL_2x4_fp32bf16(bfloat16 *out, const float *in, s
"bge 1b\n"
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
- : "cc", "memory", "p0", "p1", "p2", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index d0a8635604..9d8e31870d 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -176,6 +176,7 @@ namespace utils {
// which then calls SVE kernels (compiled accordingly) iff SVE is detected at runtime.
template <typename T>
inline unsigned long get_vector_length() {
+#if defined(__aarch64__)
uint64_t vl;
__asm __volatile (
@@ -187,26 +188,24 @@ inline unsigned long get_vector_length() {
);
return vl / sizeof(T);
+#else // !defined(__aarch64__)
+ return 16 / sizeof(T);
+#endif // defined(__aarch64__)
}
+#ifdef ARM_COMPUTE_ENABLE_SME
namespace sme {
-template <typename T>
-inline uint64_t get_vector_length() {
- uint64_t raw_vector_length;
-
- __asm __volatile (
- ".inst 0x04bf5821\n" // RDSVL X1, #1
- "mov %0, X1\n"
- : "=r" (raw_vector_length)
- :
- : "x1"
- );
+// function from misc-sve.cpp
+extern unsigned int raw_vector_length();
- return raw_vector_length / sizeof(T);
+template <typename T>
+inline unsigned long get_vector_length() {
+ return raw_vector_length() / sizeof(T);
}
} // namespace sme
+#endif // ARM_COMPUTE_ENABLE_SME
// get_vector_length(VLType): Returns vector length for type "T".
//
@@ -215,17 +214,48 @@ inline uint64_t get_vector_length() {
template <typename T>
inline unsigned long get_vector_length(VLType vl_type) {
switch (vl_type) {
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME
case VLType::SME:
return sme::get_vector_length<T>();
+#endif // ARM_COMPUTE_ENABLE_SME
case VLType::SVE:
return get_vector_length<T>();
-#endif
default:
return 16 / sizeof(T);
}
}
+// get_default_activation_values(): Returns the default values for activation min and max for integer activation.
+template <typename T>
+inline std::tuple<T, T> get_default_activation_values()
+{
+ const T min = static_cast<T>(std::numeric_limits<T>::min());
+ const T max = static_cast<T>(std::numeric_limits<T>::max());
+
+ return std::make_tuple(min, max);
+}
+
+// get_default_activation_values(): Returns the default values for activation min and max for float activation.
+template <>
+inline std::tuple<float, float> get_default_activation_values()
+{
+ const float min = static_cast<float>(-std::numeric_limits<float>::infinity());
+ const float max = static_cast<float>(std::numeric_limits<float>::infinity());
+
+ return std::make_tuple(min, max);
+}
+
+#if defined(__ARM_FP16_ARGS)
+// get_default_activation_values(): Returns the default values for activation min and max for __fp16 activation.
+template <>
+inline std::tuple<__fp16, __fp16> get_default_activation_values()
+{
+ const __fp16 min = static_cast<__fp16>(-std::numeric_limits<float>::infinity());
+ const __fp16 max = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+
+ return std::make_tuple(min, max);
+}
+#endif // defined(__ARM_FP16_ARGS)
} // utils namespace
} // arm_gemm namespace